From 4bb93dad77207c3968e29b42b366904a41000338 Mon Sep 17 00:00:00 2001 From: Rob Tillaart Date: Thu, 19 Sep 2024 14:18:47 +0200 Subject: [PATCH] 0.4.0 FastShiftIn --- libraries/FastShiftIn/CHANGELOG.md | 8 + libraries/FastShiftIn/FastShiftIn.cpp | 186 +++++++++++++++--- libraries/FastShiftIn/FastShiftIn.h | 6 +- .../fastShiftIn_test/fastShiftIn_test.ino | 23 ++- .../fastShiftIn_test/performance_0.4.0.txt | 51 +++++ libraries/FastShiftIn/library.json | 2 +- libraries/FastShiftIn/library.properties | 2 +- libraries/FastShiftIn/readme.md | 82 +++++--- 8 files changed, 288 insertions(+), 72 deletions(-) create mode 100644 libraries/FastShiftIn/examples/fastShiftIn_test/performance_0.4.0.txt diff --git a/libraries/FastShiftIn/CHANGELOG.md b/libraries/FastShiftIn/CHANGELOG.md index 7a57129c..555b4fef 100644 --- a/libraries/FastShiftIn/CHANGELOG.md +++ b/libraries/FastShiftIn/CHANGELOG.md @@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). 
+## [0.4.0] - 2024-09-10 +- fix #17, loop unroll option, improving performance, kudos to nt314p +- added flag to select LOOP UNROLL (is optional as it gives larger code size) +- update readme.md +- minor edits + +---- + ## [0.3.4] - 2024-07-22 - add **void read(uint8_t \*array, uint8_t size)** (experimental) - update readme.md diff --git a/libraries/FastShiftIn/FastShiftIn.cpp b/libraries/FastShiftIn/FastShiftIn.cpp index c327b68c..9c9bb84f 100644 --- a/libraries/FastShiftIn/FastShiftIn.cpp +++ b/libraries/FastShiftIn/FastShiftIn.cpp @@ -1,7 +1,7 @@ // // FILE: FastShiftIn.cpp // AUTHOR: Rob Tillaart -// VERSION: 0.3.4 +// VERSION: 0.4.0 // PURPOSE: Fast ShiftIn for 74HC165 register, AVR optimized // DATE: 2013-09-29 // URL: https://github.com/RobTillaart/FastShiftIn @@ -148,36 +148,98 @@ uint8_t FastShiftIn::readLSBFIRST() { #if defined(ARDUINO_ARCH_AVR) || defined(ARDUINO_ARCH_MEGAAVR) +#if defined(FASTSHIFTIN_AVR_LOOP_UNROLLED) // AVR SPEED OPTIMIZED #17 + + uint8_t rv = 0; + uint8_t cbmask1 = _clockBit; + uint8_t inmask1 = _dataInBit; + + volatile uint8_t* localDataInRegister = _dataInRegister; + volatile uint8_t* localClockRegister = _clockRegister; + + // disable interrupts (for all bits) + uint8_t oldSREG = SREG; + noInterrupts(); + + uint8_t r = *localClockRegister; + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x01; // read one bit + *localClockRegister = r; // clock pulse LOW + + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x02; // read one bit + *localClockRegister = r; // clock pulse LOW + + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x04; // read one bit + *localClockRegister = r; // clock pulse LOW + + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x08; // read one bit + *localClockRegister = r; // clock pulse LOW + + 
*localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x10; // read one bit + *localClockRegister = r; // clock pulse LOW + + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x20; // read one bit + *localClockRegister = r; // clock pulse LOW + + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x40; // read one bit + *localClockRegister = r; // clock pulse LOW + + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x80; // read one bit + *localClockRegister = r; // clock pulse LOW + + // restore interrupt state + SREG = oldSREG; + + _lastValue = rv; + +#else // AVR SIZE OPTIMIZED + uint8_t rv = 0; uint8_t cbmask1 = _clockBit; - uint8_t cbmask2 = ~_clockBit; uint8_t inmask1 = _dataInBit; + volatile uint8_t* localDataInRegister = _dataInRegister; + volatile uint8_t* localClockRegister = _clockRegister; + + // disable interrupts (for all bits) + uint8_t oldSREG = SREG; + noInterrupts(); + + uint8_t r = *localClockRegister; + for (uint8_t m = 0x01; m > 0; m <<= 1) { - // remember state register - uint8_t oldSREG = SREG; - // disable interrupts - noInterrupts(); // clock pulse HIGH - *_clockRegister |= cbmask1; + *localClockRegister |= cbmask1; // read one bit - if ((*_dataInRegister & inmask1) > 0) rv |= m; + if ((*localDataInRegister & inmask1) > 0) rv |= m; // clock pulse LOW - *_clockRegister &= cbmask2; - // reset interrupts flag to previous state - SREG = oldSREG; + *localClockRegister = r; } - _lastValue = rv; - return rv; -#else + // reset interrupts flag to previous state + SREG = oldSREG; + + _lastValue = rv; + +#endif // FASTSHIFTIN_AVR_LOOP_UNROLLED + +#else // other platforms use the reference shiftIn() // reference implementation _lastValue = shiftIn(_dataPinIn, _clockPin, LSBFIRST); - return _lastValue; #endif + + // all paths will return _lastValue. 
+ return _lastValue; } @@ -185,37 +247,97 @@ uint8_t FastShiftIn::readMSBFIRST() { #if defined(ARDUINO_ARCH_AVR) || defined(ARDUINO_ARCH_MEGAAVR) - uint8_t rv = 0; - uint8_t cbmask1 = _clockBit; - uint8_t cbmask2 = ~_clockBit; - uint8_t inmask1 = _dataInBit; +#if defined(FASTSHIFTIN_AVR_LOOP_UNROLLED) // AVR SPEED OPTIMIZED + uint8_t rv = 0; + uint8_t cbmask1 = _clockBit; + uint8_t inmask1 = _dataInBit; + + volatile uint8_t* localDataInRegister = _dataInRegister; + volatile uint8_t* localClockRegister = _clockRegister; + + // disable interrupts (for all bits) + uint8_t oldSREG = SREG; + noInterrupts(); + + uint8_t r = *localClockRegister; + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x80; // read one bit + *localClockRegister = r; // clock pulse LOW + + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x40; // read one bit + *localClockRegister = r; // clock pulse LOW + + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x20; // read one bit + *localClockRegister = r; // clock pulse LOW + + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x10; // read one bit + *localClockRegister = r; // clock pulse LOW + + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x08; // read one bit + *localClockRegister = r; // clock pulse LOW + + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x04; // read one bit + *localClockRegister = r; // clock pulse LOW + + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x02; // read one bit + *localClockRegister = r; // clock pulse LOW + + *localClockRegister |= cbmask1; // clock pulse HIGH + if ((*localDataInRegister & inmask1) > 0) rv |= 0x01; // read one bit + 
*localClockRegister = r; // clock pulse LOW + + // restore interrupt state + SREG = oldSREG; + + _lastValue = rv; + +#else // AVR SIZE OPTIMIZED + + uint8_t rv = 0; + uint8_t cbmask1 = _clockBit; + uint8_t inmask1 = _dataInBit; + + volatile uint8_t* localDataInRegister = _dataInRegister; + volatile uint8_t* localClockRegister = _clockRegister; + + // disable interrupts (for all bits) + uint8_t oldSREG = SREG; + noInterrupts(); + + uint8_t r = *localClockRegister; for (uint8_t m = 0x80; m > 0; m >>= 1) { - // remember state register - uint8_t oldSREG = SREG; - // disable interrupts - noInterrupts(); // clock pulse HIGH - *_clockRegister |= cbmask1; + *localClockRegister |= cbmask1; // read one bit - if ((*_dataInRegister & inmask1) > 0) rv |= m; + if ((*localDataInRegister & inmask1) > 0) rv |= m; // clock pulse LOW - *_clockRegister &= cbmask2; - // reset interrupts flag to previous state - SREG = oldSREG; + *localClockRegister = r; } - _lastValue = rv; - return rv; -#else + // reset interrupts flag to previous state + SREG = oldSREG; + + _lastValue = rv; + +#endif // FASTSHIFTIN_AVR_LOOP_UNROLLED + +#else // other platforms use the reference shiftIn() // reference implementation _lastValue = shiftIn(_dataPinIn, _clockPin, MSBFIRST); - return _lastValue; #endif + // all paths will return _lastValue. 
+ return _lastValue; } diff --git a/libraries/FastShiftIn/FastShiftIn.h b/libraries/FastShiftIn/FastShiftIn.h index e61dfb91..a18f894f 100644 --- a/libraries/FastShiftIn/FastShiftIn.h +++ b/libraries/FastShiftIn/FastShiftIn.h @@ -2,7 +2,7 @@ // // FILE: FastShiftIn.h // AUTHOR: Rob Tillaart -// VERSION: 0.3.4 +// VERSION: 0.4.0 // PURPOSE: Fast ShiftIn for 74HC165 register, AVR optimized // DATE: 2013-09-29 // URL: https://github.com/RobTillaart/FastShiftIn @@ -11,8 +11,10 @@ #include "Arduino.h" -#define FASTSHIFTIN_LIB_VERSION (F("0.3.4")) +#define FASTSHIFTIN_LIB_VERSION (F("0.4.0")) +// uncomment next line to get SPEED OPTIMIZED CODE +// #define FASTSHIFTIN_AVR_LOOP_UNROLLED 1 class FastShiftIn { diff --git a/libraries/FastShiftIn/examples/fastShiftIn_test/fastShiftIn_test.ino b/libraries/FastShiftIn/examples/fastShiftIn_test/fastShiftIn_test.ino index f719dbd1..92625f83 100644 --- a/libraries/FastShiftIn/examples/fastShiftIn_test/fastShiftIn_test.ino +++ b/libraries/FastShiftIn/examples/fastShiftIn_test/fastShiftIn_test.ino @@ -19,7 +19,7 @@ void setup() Serial.println(__FILE__); Serial.println(FASTSHIFTIN_LIB_VERSION); - FSI.setBitOrder(LSBFIRST); + FSI.setBitOrder(MSBFIRST); digitalWrite(12, HIGH); Serial.println("\n 8 bits HIGH\n"); @@ -59,6 +59,7 @@ void test_read() duration1 = micros() - start; Serial.print("FastShiftIn1: "); Serial.println(duration1 * 0.001); + delay(100); start = micros(); for (int i = 0; i < 1000; i++) @@ -86,7 +87,8 @@ void test_read16() duration1 = micros() - start; Serial.print("FastShiftIn1: "); Serial.println(duration1 * 0.001); - + delay(100); + start = micros(); for (int i = 0; i < 1000; i++) { @@ -113,7 +115,8 @@ void test_read24() duration1 = micros() - start; Serial.print("FastShiftIn1: "); Serial.println(duration1 * 0.001); - + delay(100); + start = micros(); for (int i = 0; i < 1000; i++) { @@ -140,7 +143,8 @@ void test_read32() duration1 = micros() - start; Serial.print("FastShiftIn1: "); Serial.println(duration1 * 0.001); 
- + delay(100); + start = micros(); for (int i = 0; i < 1000; i++) { @@ -167,7 +171,8 @@ void test_readLSBFIRST() duration1 = micros() - start; Serial.print("FastShiftIn1: "); Serial.println(duration1 * 0.001); - + delay(100); + start = micros(); for (int i = 0; i < 1000; i++) { @@ -194,7 +199,8 @@ void test_readMSBFIRST() duration1 = micros() - start; Serial.print("FastShiftIn1: "); Serial.println(duration1 * 0.001); - + delay(100); + start = micros(); for (int i = 0; i < 1000; i++) { @@ -221,7 +227,8 @@ void test_reference() duration1 = micros() - start; Serial.print("Standard shiftIn1: "); Serial.println(duration1 * 0.001); - + delay(100); + start = micros(); for (int i = 0; i < 1000; i++) { @@ -243,4 +250,4 @@ void loop() } -// -- END OF FILE -- +// -- END OF FILE -- diff --git a/libraries/FastShiftIn/examples/fastShiftIn_test/performance_0.4.0.txt b/libraries/FastShiftIn/examples/fastShiftIn_test/performance_0.4.0.txt new file mode 100644 index 00000000..4a0e5f81 --- /dev/null +++ b/libraries/FastShiftIn/examples/fastShiftIn_test/performance_0.4.0.txt @@ -0,0 +1,51 @@ +Arduino UNO +IDE 1.8.19 + +fastShiftIn_test.ino +0.4.0 + + 8 bits HIGH + + +Performance - time in us : read() +FastShiftIn1: 12.51 +FastShiftIn2: 23.77 + Delta: 11.26 + + +Performance - time in us : read16() +FastShiftIn1: 23.28 +FastShiftIn2: 45.78 + Delta: 22.50 + + +Performance - time in us : read24() +FastShiftIn1: 35.54 +FastShiftIn2: 70.30 + Delta: 34.76 + + +Performance - time in us : read32() +FastShiftIn1: 46.41 +FastShiftIn2: 92.05 + Delta: 45.64 + + +Performance - time in us : readLSBFIRST() +FastShiftIn1: 12.83 +FastShiftIn2: 24.77 + Delta: 11.94 + + +Performance - time in us : readMSBFIRST() +FastShiftIn1: 11.38 +FastShiftIn2: 21.88 + Delta: 10.50 + + +Performance - time in us : reference shiftIn() +Standard shiftIn1: 108.99 +Standard shiftIn2: 217.04 + Delta: 108.05 + +done... 
\ No newline at end of file diff --git a/libraries/FastShiftIn/library.json b/libraries/FastShiftIn/library.json index 7e3a474f..edbe25d9 100644 --- a/libraries/FastShiftIn/library.json +++ b/libraries/FastShiftIn/library.json @@ -15,7 +15,7 @@ "type": "git", "url": "https://github.com/RobTillaart/FastShiftIn.git" }, - "version": "0.3.4", + "version": "0.4.0", "license": "MIT", "frameworks": "*", "platforms": "*", diff --git a/libraries/FastShiftIn/library.properties b/libraries/FastShiftIn/library.properties index 2c5d5eeb..bdff2f63 100644 --- a/libraries/FastShiftIn/library.properties +++ b/libraries/FastShiftIn/library.properties @@ -1,5 +1,5 @@ name=FastShiftIn -version=0.3.4 +version=0.4.0 author=Rob Tillaart maintainer=Rob Tillaart sentence=Arduino library for (AVR) optimized shiftIn - e.g. for 74HC165 diff --git a/libraries/FastShiftIn/readme.md b/libraries/FastShiftIn/readme.md index f125ef25..cfc937e0 100644 --- a/libraries/FastShiftIn/readme.md +++ b/libraries/FastShiftIn/readme.md @@ -17,7 +17,7 @@ Arduino library for **AVR** optimized shiftIn - e.g. for 74HC165. ## Description FastShiftIn is a class that has optimized code (AVR only) to shift in data faster -than the normal **shiftIn()** function. +than the default provided **shiftIn()** function. It speeds up the shift using low level ports and masks. These are predetermined in the constructor of the FastShiftIn object. @@ -26,13 +26,22 @@ to the default **shiftIn()** implementation. The library allows to set (and get) the bitOrder and apply this to multiple read() calls. It also provide access to **readLSBFIRST()** and **readMSBFIRST()** which -are the low level workers and most optimized code (so far). +are the low level workers and most optimized code (so far). The library provides wrapper functions to read multi-byte variables. These are read16(), read24(), read32() and read(array, size). The latter is used to shift in any size object. 
+### 0.4.0 breaking changes + +The 0.4.0 version has a flag to unroll the inner loop in **readLSBFIRST()** +and **readMSBFIRST()**. The AVR optimized code blocks the interrupts per byte. + +Note: this optimization is new and thus experimental. +Feedback, including improvements, is welcome. + + ### Performance The performance of **read()** is substantially faster for **AVR** than the default @@ -40,28 +49,36 @@ Arduino **shiftIn()**, but not as fast as HW SPI. Exact how large the performance gain is can be seen with the example sketch. It does a comparison and shows how the class is to be used. -Time in microseconds, Arduino UNO -| function | 0.2.3 | 0.3.2 | -|:---------------------|---------:|---------:| -| read() | 19.30 | 20.49 | -| read16() | | 41.04 | -| read24() | | 62.91 | -| read32() | | 83.95 | -| readLSBFIRST() | 19.04 | 19.92 | -| readMSBFIRST() | 19.04 | 19.92 | -| reference shiftIn() | 107.82 | 108.20 | +#### Measurements + +Numbers may vary depending on bit-order flag. + +Indicative time in microseconds, Arduino UNO, IDE 1.8.19, measured over 1000 calls. + +| function | 0.2.3 | 0.3.2 | 0.4.0 | 0.4.0L | +|:---------------------|---------:|---------:|---------:|---------:| +| read() | 19.30 | 20.49 | 12.71 | 9.51 | +| read16() | | 41.04 | 25.39 | 18.98 | +| read24() | | 62.91 | 39.10 | 29.48 | +| read32() | | 83.95 | 51.42 | 38.60 | +| readLSBFIRST() | 19.04 | 19.92 | 11.96 | 8.81 | +| readMSBFIRST() | 19.04 | 19.92 | 11.94 | 8.75 | +| reference shiftIn() | 107.82 | 108.20 | 108.05 | 108.05 | -0.3.2 is a bit slower (incl. reference) than 0.2.3 but still much -faster than the reference. +- Note: 0.3.2 is a bit slower (incl. reference) than 0.2.3 but still much +faster than the reference. (Older IDE?) +- Note: 0.4.0 improved test sketch, +- Note: 0.4.0 measured with loop unroll flag disabled. +- Note: 0.4.0L measured with loop unrolled flag enabled. 
### Related libraries - https://github.com/RobTillaart/FastShiftIn -- https://github.com/RobTillaart/FastShiftOut - https://github.com/RobTillaart/FastShiftInOut +- https://github.com/RobTillaart/FastShiftOut - https://github.com/RobTillaart/ShiftInSlow - https://github.com/RobTillaart/ShiftOutSlow @@ -74,6 +91,8 @@ faster than the reference. ### Constructor +bitOrder = { LSBFIRST, MSBFIRST }; + - **FastShiftIn(uint8_t dataIn, uint8_t clockPin, uint8_t bitOrder = LSBFIRST)** Constructor ### Functions @@ -83,16 +102,18 @@ faster than the reference. - **uint32_t read24(void)** reads a new value, 24 bit. - **uint32_t read32(void)** reads a new value, 32 bit. - **uint32_t lastRead()** returns last value read. - -### Meta - -- **bool setBitOrder(uint8_t bitOrder)** set LSBFIRST or MSBFIRST. -Returns false for other values. -- **uint8_t getBitOrder(void)** returns LSBFIRST or MSBFIRST. - **uint16_t readLSBFIRST(void)** optimized LSB read(), 8 bit. - **uint16_t readMSBFIRST(void)** optimized MSB read(), 8 bit. +### BitOrder + +- **bool setBitOrder(uint8_t bitOrder)** set LSBFIRST or MSBFIRST. +Returns false for other values ==> no change. +- **uint8_t getBitOrder(void)** returns LSBFIRST or MSBFIRST as set in the constructor +or latest set from **setBitOrder()**. + + ### Experimental - **void read(uint8_t \*array, uint8_t size)** read an array of values. @@ -116,6 +137,8 @@ If the BIT-order is not the BYTE-order, the user has two options - call **read()** multiple times and merge the bytes in the order needed. - call **read32()** (a.o) and reorder the bytes in a separate function. +The library will not support such functionality. + ## Notes @@ -128,6 +151,7 @@ pull up resistors, especially if wires are exceeding 10 cm (4"). #### Must +- keep in sync with FastShiftOut() #### Should @@ -135,18 +159,20 @@ pull up resistors, especially if wires are exceeding 10 cm (4"). 
#### Could +- investigate ESP32 optimization readLSBFIRST readMSBFIRST +- performance ESP32 +- example schema +- add invert flag? +- would it be interesting to make a fastShiftIn16() etc? + - squeeze performance but more maintenance.? + +#### Wont + - investigate separate **BYTE**-order, - only MSBFirst and LSBFirst - **void setByteOrder()** + **uint8_t getByteOrder()** - other option is add parameters / overload to make byte order explicit - **read32(1,0,3,2)** performance penalty + invalid combination. -- investigate ESP32 optimization readLSBFIRST readMSBFIRST -- example schemas -- would it be interesting to make a fastShiftIn16() etc? - - squeeze performance but more maintenance.? - -#### Wont - ## Support