0.1.8 float16

This commit is contained in:
Rob Tillaart 2023-11-02 15:12:29 +01:00
parent 299170332e
commit 7d0ed91f95
8 changed files with 298 additions and 181 deletions

View File

@ -6,13 +6,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).
## [0.1.8] - 2023-11-02
- update readme.md
- add **isNaN()** (experimental).
- minor edits.
## [0.1.7] - 2022-11-07
- add changelog.md
- add rp2040 to build-CI
- update readme.md
- update keywords.txt
## [0.1.6] - 2021-12-18
- update library.json
- update license

View File

@ -1,11 +1,12 @@
[![Arduino CI](https://github.com/RobTillaart/float16/workflows/Arduino%20CI/badge.svg)](https://github.com/marketplace/actions/arduino_ci)
[![Arduino-lint](https://github.com/RobTillaart/float16/actions/workflows/arduino-lint.yml/badge.svg)](https://github.com/RobTillaart/float16/actions/workflows/arduino-lint.yml)
[![JSON check](https://github.com/RobTillaart/float16/actions/workflows/jsoncheck.yml/badge.svg)](https://github.com/RobTillaart/float16/actions/workflows/jsoncheck.yml)
[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/RobTillaart/float16/blob/master/LICENSE)
[![GitHub release](https://img.shields.io/github/release/RobTillaart/float16.svg?maxAge=3600)](https://github.com/RobTillaart/Complex/releases)
[![GitHub issues](https://img.shields.io/github/issues/RobTillaart/float16.svg)](https://github.com/RobTillaart/float16/issues)
[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/RobTillaart/float16/blob/master/LICENSE)
[![GitHub release](https://img.shields.io/github/release/RobTillaart/float16.svg?maxAge=3600)](https://github.com/RobTillaart/float16/releases)
[![PlatformIO Registry](https://badges.registry.platformio.org/packages/robtillaart/library/float16.svg)](https://registry.platformio.org/libraries/robtillaart/float16)
# float16
@ -30,7 +31,7 @@ a floating point number. As it uses only 2 bytes where float and double have typ
| attribute | value | notes |
|:----------|:-------------|:--------|
| size | 2 bytes | layout s eeeee mmmmmmmmmm
| size | 2 bytes | layout s eeeee mmmmmmmmmm (1,5,10)
| sign | 1 bit |
| exponent | 5 bit |
| mantissa | 10 bit | 11 bit precision with the implicit leading bit, ~ 3 digits
@ -66,9 +67,16 @@ a floating point number. As it uses only 2 bytes where float and double have typ
```
#### Related
- https://wokwi.com/projects/376313228108456961 (demo of its usage)
## Interface
to elaborate
```cpp
#include "float16.h"
```
#### Constructors
@ -126,20 +134,25 @@ negation operator.
- **bool isInf()** returns true if value is (-)infinite.
#### Experimental 0.1.8
- **bool isNaN()** returns true if value is not a number.
## Notes
## Future
#### 0.1.x
#### Must
- update documentation.
#### Should
- unit tests of the above.
- isNaN().
#### later
#### Could
- update documentation.
- error handling.
@ -149,3 +162,14 @@ negation operator.
- add storage example - with SD card, FRAM or EEPROM
- add communication example - serial or Ethernet?
#### Won't
## Support
If you appreciate my libraries, you can support the development and maintenance.
Improve the quality of the libraries by providing issues and Pull Requests, or
donate through PayPal or GitHub sponsors.
Thank you,

View File

@ -0,0 +1,88 @@
//
// test: UNO
// IDE: 1.8.13
//
FLOAT16_LIB_VERSION: 0.1.7
CONSTRUCTORS
Constructor: 28
a = b: 4
CONVERSION
toDouble(): 428
COMPARE
compare == : 4
compare != : 4
compare < : 4
compare <= : 4
compare >= : 4
compare > : 8
MATH I
math + : 864
math - : 812
math * : 812
math / : 812
MATH II
math += : 872
math -= : 820
math *= : 804
math /= : 804
17.8125
MATH III - negation
negation : 4
-17.8125
done
------------------------------------------
//
// test: RP2040
// IDE: 1.8.13
//
FLOAT16_LIB_VERSION: 0.1.7
CONSTRUCTORS
Constructor: 19
a = b: 2
CONVERSION
toDouble(): 96
COMPARE
compare == : 4
compare != : 3
compare < : 7
compare <= : 6
compare >= : 6
compare > : 6
MATH I
math + : 67
math - : 60
math * : 58
math / : 54
MATH II
math += : 62
math -= : 61
math *= : 61
math /= : 45
767.0000
MATH III - negation
negation : 7
-767.0000
done
-------------------------------------------

View File

@ -1,30 +1,26 @@
//
// FILE: float16.cpp
// AUTHOR: Rob Tillaart
// VERSION: 0.1.7
// VERSION: 0.1.8
// PURPOSE: library for Float16s for Arduino
// URL: http://en.wikipedia.org/wiki/Half-precision_floating-point_format
//
// HISTORY: see changelog.md
#include "float16.h"
#include <math.h>
#include <string.h>
// #define DEBUG
// CONSTRUCTOR
// CONSTRUCTOR
//  Builds a float16 from a floating point number.
//  The 16 bit pattern is produced by f32tof16() and stored in _value.
float16::float16(double f)
  : _value(f32tof16(f))
{
}
// PRINTING
// PRINTING
//  Printable interface.
//  Decodes the stored 16 bit pattern and prints it to p with the
//  configured number of decimals; returns the result of p.print().
size_t float16::printTo(Print& p) const
{
  const double decoded = f16tof32(_value);
  return p.print(decoded, _decimals);
}
double float16::toDouble() const
{
@ -34,54 +30,54 @@ double float16::toDouble() const
//////////////////////////////////////////////////////////
//
// EQUALITIES
// EQUALITIES
//
namespace
{
  //  True when the 16 bit pattern encodes a NaN:
  //  all exponent bits set and a non-zero mantissa.
  inline bool f16BitsAreNaN(uint16_t bits)
  {
    return (bits & 0x7FFF) > 0x7C00;
  }

  //  Maps a non-NaN bit pattern onto a signed integer that sorts in the
  //  same order as the encoded values. +0 and -0 both map to 0, so they
  //  compare equal. The magnitude is at most 0x7FFF, so it fits int16_t.
  inline int16_t f16OrderKey(uint16_t bits)
  {
    const int16_t magnitude = static_cast<int16_t>(bits & 0x7FFF);
    return (bits & 0x8000) ? static_cast<int16_t>(-magnitude) : magnitude;
  }
}


//  Equal when both operands encode the same value.
//  +0 equals -0; a NaN is not equal to anything, including itself.
bool float16::operator == (const float16 &f)
{
  if (f16BitsAreNaN(_value) || f16BitsAreNaN(f._value)) return false;
  return f16OrderKey(_value) == f16OrderKey(f._value);
}


//  Exact complement of operator ==, so it is true when either side is NaN.
bool float16::operator != (const float16 &f)
{
  return !(*this == f);
}


//  Ordered comparisons: always false when either operand is NaN.
bool float16::operator > (const float16 &f)
{
  if (f16BitsAreNaN(_value) || f16BitsAreNaN(f._value)) return false;
  return f16OrderKey(_value) > f16OrderKey(f._value);
}


bool float16::operator >= (const float16 &f)
{
  if (f16BitsAreNaN(_value) || f16BitsAreNaN(f._value)) return false;
  return f16OrderKey(_value) >= f16OrderKey(f._value);
}


bool float16::operator < (const float16 &f)
{
  if (f16BitsAreNaN(_value) || f16BitsAreNaN(f._value)) return false;
  return f16OrderKey(_value) < f16OrderKey(f._value);
}


bool float16::operator <= (const float16 &f)
{
  if (f16BitsAreNaN(_value) || f16BitsAreNaN(f._value)) return false;
  return f16OrderKey(_value) <= f16OrderKey(f._value);
}
//////////////////////////////////////////////////////////
//
// NEGATION
// NEGATION
//
float16 float16::operator - ()
{
@ -93,7 +89,7 @@ float16 float16::operator - ()
//////////////////////////////////////////////////////////
//
// MATH
// MATH
//
float16 float16::operator + (const float16 &f)
{
@ -142,9 +138,8 @@ float16& float16::operator /= (const float16 &f)
//////////////////////////////////////////////////////////
//
// MATH HELPER FUNCTIONS
// MATH HELPER FUNCTIONS
//
int float16::sign()
{
if (_value & 0x8000) return -1;
@ -157,10 +152,12 @@ bool float16::isZero()
return ((_value & 0x7FFF) == 0x0000);
}
// bool float16::isNaN()
// {
// return ((_value & 0x7FFF) == 0x0000);
// }
//  A NaN has all five exponent bits set and a non-zero mantissa.
//  (All exponent bits set with a zero mantissa is not reported as NaN.)
bool float16::isNaN()
{
  const bool exponentAllOnes = ((_value & 0x7C00) == 0x7C00);
  const bool mantissaNonZero = ((_value & 0x03FF) != 0x0000);
  return exponentAllOnes && mantissaNonZero;
}
bool float16::isInf()
{
@ -170,104 +167,104 @@ bool float16::isInf()
//////////////////////////////////////////////////////////
//
// CORE CONVERSION
// CORE CONVERSION
//
//  Decodes a 16 bit half-precision pattern into a float.
//  Layout: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits.
//  - exponent 31: mantissa 0 gives (-)INFINITY, anything else gives NAN
//  - exponent  0: zero or subnormal, value = mantissa * 2^-24
//  - otherwise  : normal, value = (1 + mantissa / 1024) * 2^(exponent - 15)
//  The sign bit is applied to every result except NAN, so 0x8000 gives -0.0.
float float16::f16tof32(uint16_t _value) const
{
  const bool     negative = (_value & 0x8000) != 0;
  const int      exp      = (_value & 0x7C00) >> 10;
  const uint16_t man      = (_value & 0x03FF);

  //  NAN & INF
  if (exp == 0x001F)
  {
    if (man != 0) return NAN;
    return negative ? -INFINITY : INFINITY;
  }

  float f;
  if (exp == 0)
  {
    //  ZERO & SUBNORMAL: no implicit leading one.
    //  The smallest subnormal (man == 1) is 2^-24 = 5.96046447754e-8.
    f = ldexp(static_cast<float>(man), -24);
  }
  else
  {
    //  NORMAL: (1024 + man) * 2^(exp - 25) == (1 + man / 1024) * 2^(exp - 15)
    f = ldexp(static_cast<float>(0x0400 | man), exp - 25);
  }

  //  negate the float itself; an integer literal -0 would lose the sign of zero.
  return negative ? -f : f;
}
//  Encodes a float as a 16 bit half-precision pattern.
//  Layout: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits.
//  - zero and float32 denormals become a signed zero
//  - INF stays (-)INF, every NAN becomes 0xFE00
//  - values too large for float16 become (-)INF
//  - values below the normal range are stored as subnormals; values that
//    round to less than the smallest subnormal become a signed zero
//  Rounding is round-half-up on the first dropped mantissa bit. A carry out
//  of the mantissa moves into the exponent (and into INF at the top of the range).
uint16_t float16::f32tof16(float f) const
{
  //  memcpy instead of a pointer cast, so there is no strict aliasing violation.
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));

  const uint16_t sign  = static_cast<uint16_t>((bits >> 16) & 0x8000);
  const int16_t  exp32 = static_cast<int16_t>((bits >> 23) & 0x00FF);
  const uint32_t man32 = bits & 0x007FFFFFUL;

  //  zero, and denormalized float32 which does not fit in float16
  if (exp32 == 0x00)
  {
    return sign;
  }
  //  handle infinity & NAN
  //  test the full mantissa, so a NAN with only low bits set stays a NAN.
  if (exp32 == 0x00FF)
  {
    if (man32 != 0) return 0xFE00;   //  NAN
    return sign | 0x7C00;            //  -INF : INF
  }

  //  rebias the exponent from float32 (127) to float16 (15)
  const int16_t exp = exp32 - 127 + 15;

  //  overflow does not fit => INF
  if (exp > 30)
  {
    return sign | 0x7C00;            //  -INF : INF
  }

  //  subnormal numbers
  if (exp <= 0)
  {
    //  below half of the smallest subnormal (2^-25) => signed zero
    if (exp < -10)
    {
      return sign;
    }
    //  restore the implicit leading one, then shift so that one guard bit
    //  is left below the subnormal mantissa. The shift is 13 .. 23.
    const uint32_t significand = man32 | 0x00800000UL;
    const uint32_t withGuard   = significand >> (13 - exp);
    //  rounding; a result of 0x0400 is exactly the smallest normal number.
    const uint16_t man = static_cast<uint16_t>((withGuard + 1) >> 1);
    return sign | man;
  }

  //  normal numbers
  //  man bits = 10; but we keep 11 for rounding
  const uint16_t withGuard = static_cast<uint16_t>(man32 >> 12);
  const uint16_t man       = (withGuard + 1) >> 1;
  //  add (not OR) so a mantissa that rounds up to 0x0400 carries into the exponent.
  return sign | ((static_cast<uint16_t>(exp) << 10) + man);
}
// -- END OF FILE --
// -- END OF FILE --

View File

@ -2,80 +2,79 @@
//
// FILE: float16.h
// AUTHOR: Rob Tillaart
// VERSION: 0.1.7
// VERSION: 0.1.8
// PURPOSE: Arduino library to implement float16 data type.
// half-precision floating point format,
// half-precision floating point format,
// used for efficient storage and transport.
// URL: https://github.com/RobTillaart/float16
//
#include "Arduino.h"
#define FLOAT16_LIB_VERSION (F("0.1.7"))
#define FLOAT16_LIB_VERSION (F("0.1.8"))
//  float16 - half-precision floating point number kept as a 16 bit pattern.
//  Implements Printable so an object can be passed to print().
class float16: public Printable
{
public:
  //  Constructors
  //  default: bit pattern 0x0000
  float16(void) : _value(0x0000) {}
  //  from a floating point number
  float16(double f);
  //  copies the 16 bit value only; _decimals keeps its default.
  float16(const float16 &f) : _value(f._value) {}

  //  Conversion
  double   toDouble(void) const;
  //  access the 2 byte representation.
  uint16_t getBinary()            { return _value; }
  void     setBinary(uint16_t u)  { _value = u; }

  //  Printable
  size_t   printTo(Print& p) const;
  //  number of decimals used by printTo()
  void     setDecimals(uint8_t d) { _decimals = d; }
  uint8_t  getDecimals()          { return _decimals; }

  //  equalities
  bool operator == (const float16& f);
  bool operator != (const float16& f);

  bool operator >  (const float16& f);
  bool operator >= (const float16& f);
  bool operator <  (const float16& f);
  bool operator <= (const float16& f);

  //  negation
  float16 operator - ();

  //  basic math
  float16 operator + (const float16& f);
  float16 operator - (const float16& f);
  float16 operator * (const float16& f);
  float16 operator / (const float16& f);

  float16& operator += (const float16& f);
  float16& operator -= (const float16& f);
  float16& operator *= (const float16& f);
  float16& operator /= (const float16& f);

  //  math helper functions
  int  sign();     //  1 = positive   0 = zero   -1 = negative.
  bool isZero();
  bool isNaN();
  bool isInf();

  //  CORE CONVERSION
  //  should be private but for testing...
  float    f16tof32(uint16_t) const;
  uint16_t f32tof16(float) const;

private:
  uint8_t  _decimals = 4;
  uint16_t _value;
};
// -- END OF FILE --
// -- END OF FILE --

View File

@ -13,6 +13,10 @@ setBinary KEYWORD2
setDecimals KEYWORD2
getDecimals KEYWORD2
sign KEYWORD2
isZero KEYWORD2
isNaN KEYWORD2
isInf KEYWORD2
# Constants (LITERAL1)
FLOAT16_LIB_VERSION LITERAL1

View File

@ -15,9 +15,9 @@
"type": "git",
"url": "https://github.com/RobTillaart/float16.git"
},
"version": "0.1.7",
"version": "0.1.8",
"license": "MIT",
"frameworks": "arduino",
"frameworks": "*",
"platforms": "*",
"headers": "float16.h"
}

View File

@ -1,5 +1,5 @@
name=float16
version=0.1.7
version=0.1.8
author=Rob Tillaart <rob.tillaart@gmail.com>
maintainer=Rob Tillaart <rob.tillaart@gmail.com>
sentence=Arduino library to implement float16 data type.