0.1.5 float16

This commit is contained in:
rob tillaart 2021-12-02 20:27:32 +01:00
parent 3598a42f1d
commit 2464abf608
13 changed files with 309 additions and 194 deletions

View File

@ -21,22 +21,23 @@ The library implements the **Printable** interface so one can directly print the
float16 values in any stream e.g. Serial.
The primary usage of the float16 data type is to efficiently store and transport
a floating point number. As it is only 2 bytes where float and double have typical
4 and 8, gains can be made at the price of range and precision.
a floating point number. As it uses only 2 bytes, where float and double typically use
4 and 8 bytes, gains can be made at the price of range and precision.
## Specifications
| attribute | value | notes |
|:----------|:-------------|:-------------|
| Size | 2 bytes | |
| sign | 1 bit | |
| mantissa | 11 bit | ~ 3 digits |
| exponent | 4 bit | |
| minimum | 1.0009765625 | 1 + 2^10 |
| maximum | 65504 | |
| | | |
| attribute | value | notes |
|:----------|:-------------|:--------|
| size | 2 bytes | layout s eeeee mmmmmmmmmm
| sign | 1 bit |
| exponent | 5 bit |
| mantissa | 11 bit | ~ 3 digits (10 bits stored + 1 implicit leading bit)
| minimum | 5.96046 E-8 | smallest positive number.
| | 1.0009765625 | 1 + 2^-10 = smallest nr larger than 1.
| maximum | 65504 |
| | |
## Interface
@ -52,72 +53,73 @@ to elaborate
#### Conversion
- **double toDouble(void)** convert to double (or float)
- **double toDouble(void)** convert to double (or float).
- **uint16_t getBinary()** get the 2 byte binary representation.
- **void setBinary(uint16_t u)** set the 2 bytes binary representation.
- **size_t printTo(Print& p) const** Printable interface.
- **void setDecimals(uint8_t d)** set the number of decimals used by printTo (default 4).
- **uint8_t getDecimals()** returns the number of decimals used by printTo.
Note the setDecimals takes one byte per object which is not efficient for arrays.
Note that setDecimals stores its setting in one extra byte per object, which is not efficient for arrays of float16.
See array example for efficient storage using set/getBinary() functions.
#### Compare
to elaborate
Standard compare functions. Since 0.1.5 these are quite optimized,
so it is fast to compare e.g. 2 measurements.
- **bool operator == (const float16& f)**
- **bool operator != (const float16& f)**
- **bool operator > (const float16& f)**
- **bool operator >= (const float16& f)**
- **bool operator < (const float16& f)**
- **bool operator <= (const float16& f)**
#### Math (basic)
Math is done by converting to double, do the math and convert back.
These operators are added for convenience only.
Not planned to optimize these.
- **float16 operator + (const float16& f)**
- **float16 operator - (const float16& f)**
- **float16 operator \* (const float16& f)**
- **float16 operator / (const float16& f)**
- **float16& operator += (const float16& f)**
- **float16& operator -= (const float16& f)**
- **float16& operator \*= (const float16& f)**
- **float16& operator /= (const float16& f)**
negation operator.
- **float16 operator - ()** fast negation.
- **int sign()** returns 1 == positive, 0 == zero, -1 == negative.
- **bool isZero()** returns true if zero. slightly faster than **sign()**.
- **bool isInf()** returns true if value is (-)infinite.
## Notes
#### comparison functions
The first versions (up to 0.1.4) implemented the inequality operators by converting the data to double and comparing those.
The strategy was to get these working first and optimize them later; since 0.1.5 they compare the binary representation directly.
## TODO (future)
to get focus on getting things done...
#### 0.1.4
the following should work:
- update documentation
- positive numbers
- negative numbers
- infinity
- rounding to zero (e.g. 1e-30)
- array of numbers.
- unit tests of the above..
#### 0.1.5
- update documentation
- comparison operators
- unit tests of the above..
## Future
#### 0.1.6
- update documentation
- get basic math working (+-*/)
- isNan()
- isINF()
- abs()
- sgn()
- unit tests of the above..
- update documentation.
- unit tests of the above.
- isNan().
#### later
- update documentation
- get basic math II working += -= *= /=
- divide by zero errors.
- f16tof32() + f32tof16()
- rewrite toDouble with bit magic
- ...
- update documentation.
- error handling.
- divide by zero errors.
- look for optimizations.
- rewrite **f16tof32()** with bit magic.
- add storage example - with SD card, FRAM or EEPROM
- add communication example - serial or Ethernet?

View File

@ -6,27 +6,27 @@
// DATE: 2015-03-11
// URL: https://github.com/RobTillaart/float16
//
// Released to the public domain
//
/*
0 01111 0000000000 = 1
0 01111 0000000001 = 1 + 2^-10 = 1.0009765625 (next smallest float after 1)
1 10000 0000000000 = -2
SIGN EXP MANTISSA
0 01111 0000000000 = 1
0 01111 0000000001 = 1 + 2^-10 = 1.0009765625 (next smallest float after 1)
1 10000 0000000000 = -2
0 11110 1111111111 = 65504 (max half precision)
0 11110 1111111111 = 65504 (max half precision)
0 00001 0000000000 = 2^-14 ~ 6.10352 × 10^-5 (minimum positive normal)
0 00000 1111111111 = 2^-14 - 2^-24 ~ 6.09756 × 10^-5 (maximum subnormal)
0 00000 0000000001 = 2^-24 ~ 5.96046 × 10^-8 (minimum positive subnormal)
0 00001 0000000000 = 2^-14 ~ 6.10352 × 10^-5 (minimum positive normal)
0 00000 1111111111 = 2^-14 - 2^-24 ~ 6.09756 × 10^-5 (maximum subnormal)
0 00000 0000000001 = 2^-24 ~ 5.96046 × 10^-8 (minimum positive subnormal)
0 00000 0000000000 = 0
1 00000 0000000000 = -0
0 00000 0000000000 = 0
1 00000 0000000000 = -0
0 11111 0000000000 = infinity
1 11111 0000000000 = -infinity
0 11111 0000000000 = infinity
1 11111 0000000000 = -infinity
0 01101 0101010101 = 0.333251953125 1/3
0 01101 0101010101 = 0.333251953125 1/3
*/
#include "float16.h"
@ -62,8 +62,8 @@ void test_constructors()
Serial.println("\ntest_constructors:");
float16 a;
Serial.println(a.toDouble(), 9);
Serial.println(a.getBinary(), HEX);
Serial.println(a.getBinary(), HEX);
float16 b = 6;
Serial.println(b.toDouble(), 9);
Serial.println(b.getBinary(), HEX);
@ -144,7 +144,7 @@ void test_numbers()
Serial.println("** OVERFLOW **");
float16 f(1000000.0);
Serial.println(f.toDouble(), 9);
Serial.println("** UNDERFLOW **");
float16 g(1 / 1000000.0);
Serial.println(g.toDouble(), 9);

View File

@ -7,6 +7,7 @@
// URL: https://github.com/RobTillaart/float16
//
#include "float16.h"
float16 X;
@ -19,11 +20,11 @@ void setup()
Serial.println(__FILE__);
Serial.print("FLOAT16_LIB_VERSION: ");
Serial.println(FLOAT16_LIB_VERSION);
Serial.println("\nStart ");
float f;
for (uint16_t n = 0; n < 65535; n++)
// dump all possible values
for (uint16_t n = 0; n < 65535; n++)
{
f = X.f16tof32(n);
Serial.print(n);

View File

@ -77,7 +77,7 @@ void test_1()
Serial.print('\t');
float current = f16.toDouble();
Serial.print(current, 8);
if (prev > current)
if (prev > current) // numbers should be increasing.
{
Serial.print("\t\tERROR");
errors++;
@ -107,7 +107,7 @@ void test_1()
Serial.print('\t');
float current = f16.toDouble();
Serial.print(current, 8);
if (prev < current)
if (prev < current) // negative numbers should be decreasing.
{
Serial.print("\t\tERROR");
errors++;

View File

@ -22,9 +22,10 @@ void setup()
Serial.println(__FILE__);
Serial.print("FLOAT16_LIB_VERSION: ");
Serial.println(FLOAT16_LIB_VERSION);
Serial.println("\nStart ");
Serial.println();
// simulate temperature with random numbers
// simulate temperature sensor with random numbers
for (uint32_t n = 0; n < 10; n++)
{
temperature[n] = (random(1000) - 300) * 0.01;

View File

@ -17,7 +17,7 @@ void setup()
Serial.println(__FILE__);
Serial.print("FLOAT16_LIB_VERSION: ");
Serial.println(FLOAT16_LIB_VERSION);
Serial.println("\nStart ");
Serial.println();
for( int i = -10; i < 2; i++)

View File

@ -24,9 +24,9 @@ void setup()
Serial.println(FLOAT16_LIB_VERSION);
Serial.println();
f = random(1000000) * 0.001;
// CONSTRUCTORS
Serial.println("CONSTRUCTORS");
f = random(1000000) * 0.001;
start = micros();
float16 f16(f);
stop = micros();
@ -41,8 +41,10 @@ void setup()
Serial.print("a = b: \t");
Serial.println(stop - start);
delay(10);
Serial.println();
// CONVERSION
Serial.println("CONVERSION");
start = micros();
f = f16.toDouble();
stop = micros();
@ -52,7 +54,7 @@ void setup()
Serial.println();
// COMPARE
Serial.println("COMPARE");
f17 = f16.toDouble() + 1;
start = micros();
@ -96,6 +98,85 @@ void setup()
Serial.print("compare > : \t");
Serial.println(stop - start);
delay(10);
Serial.println();
Serial.println("MATH I");
float16 f18;
start = micros();
f18 = f16 + f17;
stop = micros();
Serial.print("math + : \t");
Serial.println(stop - start);
delay(10);
// Serial.println(f16);
// Serial.println(f17);
// Serial.println(f18);
start = micros();
f18 = f16 - f17;
stop = micros();
Serial.print("math - : \t");
Serial.println(stop - start);
delay(10);
start = micros();
f18 = f16 * f17;
stop = micros();
Serial.print("math * : \t");
Serial.println(stop - start);
delay(10);
// Time the division operator. The timed statement must actually divide,
// otherwise the "math /" figure just repeats the addition measurement.
start = micros();
f18 = f16 / f17;
stop = micros();
Serial.print("math / : \t");
Serial.println(stop - start);
delay(10);
Serial.println();
Serial.println("MATH II");
start = micros();
f18 += f16;
stop = micros();
Serial.print("math += : \t");
Serial.println(stop - start);
delay(10);
start = micros();
f18 -= f16;
stop = micros();
Serial.print("math -= : \t");
Serial.println(stop - start);
delay(10);
start = micros();
f18 *= f16;
stop = micros();
Serial.print("math *= : \t");
Serial.println(stop - start);
delay(10);
start = micros();
f18 /= f16;
stop = micros();
Serial.print("math /= : \t");
Serial.println(stop - start);
delay(10);
Serial.println();
Serial.println(f16);
Serial.println("MATH III - negation");
start = micros();
f18 = -f16;
stop = micros();
Serial.print("negation : \t");
Serial.println(stop - start);
delay(10);
Serial.println();
Serial.println(f18);
Serial.println("\ndone");
}

View File

@ -18,7 +18,6 @@ void setup()
Serial.println(__FILE__);
Serial.print("FLOAT16_LIB_VERSION: ");
Serial.println(FLOAT16_LIB_VERSION);
Serial.println("\nStart ");
for (uint32_t n = 1; n < 65536; n *= 2)
{

View File

@ -31,7 +31,6 @@ void setup()
Serial.println(__FILE__);
Serial.print("FLOAT16_LIB_VERSION: ");
Serial.println(FLOAT16_LIB_VERSION);
Serial.println("\nStart ");
f16.setDecimals(6);
@ -46,7 +45,6 @@ void setup()
Serial.print("\t");
Serial.println();
}
Serial.println();
Serial.println();

View File

@ -22,146 +22,191 @@
// CONSTRUCTOR
float16::float16(double f)
{
n = f32tof16(f);
_value = f32tof16(f);
}
// PRINTING
size_t float16::printTo(Print& p) const
{
double d = this->f16tof32(n);
return p.print(d, _decimals);
double d = this->f16tof32(_value);
return p.print(d, _decimals);
};
double float16::toDouble() const
{
return f16tof32(n);
return f16tof32(_value);
}
// NEGATE
float16 float16::operator - ()
{
return float16( -f16tof32(n) );
}
// bool float16::isNaN();
// bool float16::isInf();
//////////////////////////////////////////////////////////
//
// EQUALITIES
//
bool float16::operator == (const float16 &f)
{
return (n == f.n);
return (_value == f._value);
}
bool float16::operator != (const float16 &f)
{
return (n != f.n);
return (_value != f._value);
}
bool float16::operator > (const float16 &c)
bool float16::operator > (const float16 &f)
{
return this->toDouble() > c.toDouble();
if ((_value & 0x8000) && ( f._value & 0x8000)) return _value < f._value;
if (_value & 0x8000) return false;
if (f._value & 0x8000) return true;
return _value > f._value;
}
bool float16::operator >= (const float16 &c)
bool float16::operator >= (const float16 &f)
{
return this->toDouble() >= c.toDouble();
if ((_value & 0x8000) && (f._value & 0x8000)) return _value <= f._value;
if (_value & 0x8000) return false;
if (f._value & 0x8000) return true;
return _value >= f._value;
}
bool float16::operator < (const float16 &c)
bool float16::operator < (const float16 &f)
{
return this->toDouble() < c.toDouble();
if ((_value & 0x8000) && (f._value & 0x8000)) return _value > f._value;
if (_value & 0x8000) return true;
if (f._value & 0x8000) return false;
return _value < f._value;
}
bool float16::operator <= (const float16 &c)
bool float16::operator <= (const float16 &f)
{
return this->toDouble() <= c.toDouble();
if ((_value & 0x8000) && (f._value & 0x8000)) return _value >= f._value;
if (_value & 0x8000) return true;
if (f._value & 0x8000) return false;
return _value <= f._value;
}
/*
// BASIC MATH I
float16 float16::operator + (const float16 &c)
//////////////////////////////////////////////////////////
//
// NEGATION
//
float16 float16::operator - ()
{
return (float16(this->toDouble() + c.toDouble());
float16 f16;
f16.setBinary(_value ^ 0x8000);
return f16;
}
float16 float16::operator - (const float16 &c)
//////////////////////////////////////////////////////////
//
// MATH
//
float16 float16::operator + (const float16 &f)
{
return (float16(this->toDouble() - c.toDouble());
return float16(this->toDouble() + f.toDouble());
}
float16 float16::operator * (const float16 &c)
float16 float16::operator - (const float16 &f)
{
return (float16(this->toDouble() * c.toDouble());
return float16(this->toDouble() - f.toDouble());
}
float16 float16::operator / (const float16 &c)
float16 float16::operator * (const float16 &f)
{
return (float16(this->toDouble() / c.toDouble());
return float16(this->toDouble() * f.toDouble());
}
*/
/*
// BASIC MATH II
float16& float16::operator += (const float16 &c)
// Division: both operands are widened to double, divided there, and the
// quotient is converted back to a float16 by the converting constructor.
float16 float16::operator / (const float16 &f)
{
  const double quotient = this->toDouble() / f.toDouble();
  return float16(quotient);
}
float16& float16::operator -= (const float16 &c)
// Adds f to this value. The sum is computed in double precision and then
// converted back to the 16 bit representation.
// Only _value is written: assigning a temporary float16 to *this would also
// copy the temporary's default _decimals and so reset the print setting
// configured with setDecimals().
float16& float16::operator += (const float16 &f)
{
  _value = f32tof16(this->toDouble() + f.toDouble());
  return *this;
}
float16& float16::operator *= (const float16 &c)
// Subtracts f from this value. The difference is computed in double precision
// and then converted back to the 16 bit representation.
// Only _value is written: assigning a temporary float16 to *this would also
// copy the temporary's default _decimals and so reset the print setting
// configured with setDecimals().
float16& float16::operator -= (const float16 &f)
{
  _value = f32tof16(this->toDouble() - f.toDouble());
  return *this;
}
float16& float16::operator /= (const float16 &c)
// Multiplies this value by f. The product is computed in double precision
// and then converted back to the 16 bit representation.
// Only _value is written: assigning a temporary float16 to *this would also
// copy the temporary's default _decimals and so reset the print setting
// configured with setDecimals().
float16& float16::operator *= (const float16 &f)
{
  _value = f32tof16(this->toDouble() * f.toDouble());
  return *this;
}
*/
// Divides this value by f. The quotient is computed in double precision
// and then converted back to the 16 bit representation.
// Only _value is written: assigning a temporary float16 to *this would also
// copy the temporary's default _decimals and so reset the print setting
// configured with setDecimals().
float16& float16::operator /= (const float16 &f)
{
  _value = f32tof16(this->toDouble() / f.toDouble());
  return *this;
}
float float16::f16tof32(uint16_t n) const
//////////////////////////////////////////////////////////
//
// MATH HELPER FUNCTIONS
//
// Returns the sign of the stored value:
//   1 == positive, 0 == zero, -1 == negative.
// Both zero encodings, +0 (0x0000) and -0 (0x8000), return 0 so the result
// agrees with isZero(). Any other bit pattern is classified purely by its
// sign bit.
int float16::sign()
{
  if ((_value & 0x7FFF) == 0x0000) return 0;  // ignore the sign bit: -0 is zero too
  if (_value & 0x8000) return -1;
  return 1;
}
// True when every bit except the sign bit is clear, i.e. for +0 and -0.
bool float16::isZero()
{
  const uint16_t magnitude = _value & 0x7FFF;
  return magnitude == 0;
}
// bool float16::isNaN()
// {
// return ((_value & 0x7C00) == 0x7C00) && ((_value & 0x03FF) != 0);
// }
// True for the two infinity encodings, 0x7C00 (+inf) and 0xFC00 (-inf).
bool float16::isInf()
{
  // drop the sign bit, then a single compare covers both encodings
  return (_value & 0x7FFF) == 0x7C00;
}
//////////////////////////////////////////////////////////
//
// CORE CONVERSION
//
float float16::f16tof32(uint16_t _value) const
{
uint16_t sgn, man;
int exp;
double f;
sgn = (n & 0x8000) > 0;
exp = (n & 0x7C00) >> 10;
man = (n & 0x03FF);
#ifdef DEBUG
Serial.println(sgn, BIN);
Serial.println(exp, BIN);
Serial.println(man, BIN);
#endif
sgn = (_value & 0x8000) > 0;
exp = (_value & 0x7C00) >> 10;
man = (_value & 0x03FF);
// ZERO
if ((n & 0x7FFF) == 0)
if ((_value & 0x7FFF) == 0)
{
#ifdef DEBUG
Serial.println("ZERO");
#endif
return sgn ? -0 : 0;
}
// NAN & INF
if (exp == 0x001F)
{
#ifdef DEBUG
Serial.println("INFINITY");
#endif
if (man == 0) return sgn ? -INFINITY : INFINITY;
else return NAN;
}
@ -193,10 +238,6 @@ uint16_t float16::f32tof16(float f) const
int16_t exp = (t & 0x7F800000) >> 23;
bool sgn = (t & 0x80000000);
// Serial.print("SGN: "); Serial.println(sgn, BIN);
// Serial.print("EXP: "); Serial.println(exp, BIN);
// Serial.print("MAN: "); Serial.println(man, BIN);
// handle 0
if ((t & 0x7FFFFFFF) == 0)
{
@ -241,9 +282,6 @@ uint16_t float16::f32tof16(float f) const
exp <<= 10;
man++;
man >>= 1;
// Serial.print("SGN: "); Serial.println(sgn, BIN);
// Serial.print("EXP: "); Serial.println(exp, BIN);
// Serial.print("MAN: "); Serial.println(man, BIN);
if (sgn) return 0x8000 | exp | man;
return exp | man;
}

View File

@ -2,7 +2,7 @@
//
// FILE: float16.h
// AUTHOR: Rob Tillaart
// VERSION: 0.1.4
// VERSION: 0.1.5
// PURPOSE: Arduino library to implement float16 data type.
// half-precision floating point format,
// used for efficient storage and transport.
@ -12,61 +12,59 @@
#include "Arduino.h"
#define FLOAT16_LIB_VERSION "0.1.4"
#define FLOAT16_LIB_VERSION (F("0.1.5"))
class float16: public Printable
{
public:
// Constructors
float16(void) { n = 0; };
float16(void) { _value = 0x0000; };
float16(double f);
float16(const float16 &f) { n = f.n; };
float16(const float16 &f) { _value = f._value; };
// Conversion
double toDouble(void) const;
// access the 2 byte representation.
uint16_t getBinary() { return n; };
void setBinary(uint16_t u) { n = u; };
uint16_t getBinary() { return _value; };
void setBinary(uint16_t u) { _value = u; };
// Printable
size_t printTo(Print& p) const;
void setDecimals(uint8_t d) { _decimals = d; };
uint8_t getDecimals() { return _decimals; };
// equalities
bool operator == (const float16& f);
bool operator != (const float16& f);
// bool isNaN();
// bool isInf();
bool operator > (const float16& f);
bool operator >= (const float16& f);
bool operator < (const float16& f);
bool operator <= (const float16& f);
// negation
float16 operator - ();
// equalities
bool operator == (const float16&);
bool operator != (const float16&);
bool operator > (const float16&);
bool operator >= (const float16&);
bool operator < (const float16&);
bool operator <= (const float16&);
/*
// basic math
float16 operator + (const float16&);
float16 operator - (const float16&);
float16 operator * (const float16&);
float16 operator / (const float16&);
float16 operator + (const float16& f);
float16 operator - (const float16& f);
float16 operator * (const float16& f);
float16 operator / (const float16& f);
float16& operator += (const float16&);
float16& operator -= (const float16&);
float16& operator *= (const float16&);
float16& operator /= (const float16&);
*/
float16& operator += (const float16& f);
float16& operator -= (const float16& f);
float16& operator *= (const float16& f);
float16& operator /= (const float16& f);
// math helper functions
int sign(); // 1 = positive 0 = zero -1 = negative.
bool isZero();
// bool isNaN();
bool isInf();
// DEBUGGING
// CORE CONVERSION
// should be private but for testing...
float f16tof32(uint16_t) const;
uint16_t f32tof16(float) const;
@ -74,10 +72,7 @@ class float16: public Printable
private:
uint8_t _decimals = 4;
// TODO
// n is not descriptive,
// should be _n at least;
uint16_t n;
uint16_t _value;
};

View File

@ -15,7 +15,7 @@
"type": "git",
"url": "https://github.com/RobTillaart/float16.git"
},
"version": "0.1.4",
"version": "0.1.5",
"license": "MIT",
"frameworks": "arduino",
"platforms": "*",

View File

@ -1,5 +1,5 @@
name=float16
version=0.1.4
version=0.1.5
author=Rob Tillaart <rob.tillaart@gmail.com>
maintainer=Rob Tillaart <rob.tillaart@gmail.com>
sentence=Arduino library to implement float16 data type.