0.1.1 Soundex

This commit is contained in:
rob tillaart 2022-02-06 16:08:32 +01:00
parent ec8aeb0659
commit 1fa9bbb40a
14 changed files with 551 additions and 0 deletions

View File

@ -0,0 +1,14 @@
compile:
# Choosing to run compilation tests on 2 different Arduino platforms
platforms:
- uno
# - due
# - zero
# - leonardo
- m4
- esp32
# - esp8266
# - mega2560
libraries:
# - "printHelpers"

View File

@ -0,0 +1,13 @@
name: Arduino-lint
on: [push, pull_request]
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: arduino/arduino-lint-action@v1
with:
library-manager: update
compliance: strict

View File

@ -0,0 +1,17 @@
---
name: Arduino CI
on: [push, pull_request]
jobs:
runTest:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: ruby/setup-ruby@v1
with:
ruby-version: 2.6
- run: |
gem install arduino_ci
arduino_ci.rb

View File

@ -0,0 +1,18 @@
name: JSON check
on:
push:
paths:
- '**.json'
pull_request:
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: json-syntax-check
uses: limitusus/json-syntax-check@v1
with:
pattern: "\\.json$"

21
libraries/Soundex/LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2022-2022 Rob Tillaart
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,77 @@
[![Arduino CI](https://github.com/RobTillaart/Soundex/workflows/Arduino%20CI/badge.svg)](https://github.com/marketplace/actions/arduino_ci)
[![Arduino-lint](https://github.com/RobTillaart/Soundex/actions/workflows/arduino-lint.yml/badge.svg)](https://github.com/RobTillaart/Soundex/actions/workflows/arduino-lint.yml)
[![JSON check](https://github.com/RobTillaart/Soundex/actions/workflows/jsoncheck.yml/badge.svg)](https://github.com/RobTillaart/Soundex/actions/workflows/jsoncheck.yml)
[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/RobTillaart/Soundex/blob/master/LICENSE)
[![GitHub release](https://img.shields.io/github/release/RobTillaart/Soundex.svg?maxAge=3600)](https://github.com/RobTillaart/Soundex/releases)
# Soundex
Arduino Library for calculating Soundex hash.
## Description
This library generates a (string based) hash based upon how a word sounds.
This algorithm is called Soundex.
The original algorithm was developed by Robert C. Russell and Margaret King Odell over 100 years ago.
There are several variations of Soundex and these might be supported in the future.
The algorithm roughly copies the uppercase first letter of the word, followed by 3 digits replacing the consonants.
The base Soundex has 26 x 7 x 7 x 7 = 8918 possible outcomes, so a code could be encoded in a uint16_t.
#### Links
- https://en.wikipedia.org/wiki/Soundex
- https://en.wikipedia.org/wiki/Metaphone (not implemented)
## Interface
Use **\#include "Soundex.h"**
- **Soundex()** Constructor.
- **void setLength(uint32_t length = 4)** Sets the length to include more digits. max length = 11
- **uint8_t getLength()** returns current length.
- **char \* soundex(const char \* str)** determines the (Russell & Odell) Soundex code of the string.
#### Performance
Not tested on ESP32 (and many other platforms) yet.
First numbers of **.soundex(str)** measured with test sketch shows the following timing per word.
| Checksum | digits | UNO 16 MHz | ESP32 240 MHz |
|:------------|:------:|:-----------:|:-------------:|
| Soundex | 3 | 32 us | |
## Operation
See examples.
## Future ideas
- more testing
- other platforms
- different key lengths
- string lengths
- performance
- numeric version of Soundex
- store in an uint16_t (bit fields 5,3,4,4)
- uint16_t soundexN(const char \* str).
- efficient storage of the Soundex array
- encode in nibbles. (13 bytes instead of 26) => more code, performance?
0x01, 0x23, 0x01 etc.
- Other algorithms might be added in the future.
- Daitch-Mokotoff Soundex
- Beider-Morse Soundex
- Metaphone

View File

@ -0,0 +1,70 @@
//
// FILE: Soundex.cpp
// AUTHOR: Rob Tillaart
// VERSION: 0.1.1
// DATE: 2022-02-05
// PURPOSE: Arduino Library for calculating Soundex hash
// URL: https://github.com/RobTillaart/Soundex
#include "Soundex.h"
//  Constructor: starts with an empty result string and
//  the default code length of 4 characters.
Soundex::Soundex() : _length(4)
{
  _buffer[0] = '\0';
}
//  Sets the number of characters of the generated code.
//  length: requested length, limited to SOUNDEX_MAX_LENGTH - 1
//          so the code plus its terminating '\0' fits in the buffer.
void Soundex::setLength(uint32_t length)
{
  //  clamp in the wide type BEFORE narrowing to uint8_t,
  //  otherwise e.g. 256 would wrap to 0 instead of being limited.
  const uint32_t maxLength = SOUNDEX_MAX_LENGTH - 1;
  if (length > maxLength)
  {
    length = maxLength;
  }
  _length = static_cast<uint8_t>(length);
}
//  Calculates the Soundex code of str.
//  - leading non-letters are skipped; the first letter is copied in uppercase.
//  - every following letter is mapped to its digit from the sdx table;
//    a digit is appended when it differs from the previous code and is not 0.
//  - non-letters inside the string (spaces, digits) are ignored,
//    so multiple words are encoded as one.
//  - the result is padded with '0' up to the configured length.
//  str:     zero terminated string, may be a null pointer.
//  returns: pointer to the internal buffer, overwritten by the next call.
//           If str is null or contains no letter the result is all '0'.
char * Soundex::soundex(const char * str)
{
  //  start with a fully padded result, e.g. "0000"
  for (uint8_t n = 0; n < _length; n++) _buffer[n] = '0';
  _buffer[_length] = '\0';

  //  no input, or no room for even the first letter:
  //  return the padded (or empty) result and keep it terminated.
  if ((str == nullptr) || (_length == 0)) return _buffer;

  //  find begin of word, skip spaces, digits
  //  the cast to unsigned char keeps the <ctype> calls well defined
  //  for characters with the high bit set.
  const char * p = str;
  while ((*p != 0) && !isalpha(static_cast<unsigned char>(*p))) p++;
  if (*p == 0) return _buffer;

  //  handle first character
  uint8_t i = 0;
  _buffer[i++] = static_cast<char>(toupper(static_cast<unsigned char>(*p)));
  uint8_t last = sdx[_buffer[0] - 'A'];   //  remember last code
  p++;

  //  process the remainder of the string until the code is complete
  while ((*p != 0) && (i < _length))
  {
    const unsigned char c = static_cast<unsigned char>(*p);
    if (isalpha(c))   //  skip everything that is not a letter
    {
      const uint8_t current = sdx[toupper(c) - 'A'];
      //  new code?
      if (last != current)
      {
        last = current;
        //  code 0 (vowels etc.) is never written, it only separates equal codes
        if (last != 0) _buffer[i++] = static_cast<char>('0' + last);
      }
    }
    p++;
  }
  return _buffer;
}
// -- END OF FILE --

View File

@ -0,0 +1,45 @@
#pragma once
//
// FILE: Soundex.h
// AUTHOR: Rob Tillaart
// VERSION: 0.1.1
// DATE: 2022-02-05
// PURPOSE: Arduino Library for calculating Soundex hash
// URL: https://github.com/RobTillaart/Soundex
//
// HISTORY
// 0.1.0 2011-05-20 stand alone application
// 0.1.1 2022-02-05 initial library version
#include "Arduino.h"
#define SOUNDEX_LIB_VERSION (F("0.1.1"))
#define SOUNDEX_MAX_LENGTH 12
//  Soundex generates a Soundex code (uppercase letter followed by digits)
//  for a word. The code is kept in an internal buffer of the object,
//  so the pointer returned by soundex() is only valid until the next call.
class Soundex
{
public:
  //  Creates an object with the default code length of 4.
  Soundex();

  //  Sets the length of the generated code,
  //  limited to SOUNDEX_MAX_LENGTH - 1 characters.
  void     setLength(uint32_t length = 4);
  //  Returns the current length of the generated code.
  uint8_t  getLength() const { return _length; }

  //  Returns the Soundex code of str (Russell and Odell).
  char *   soundex(const char * str);

private:
  //  holds the last generated code including the terminating '\0'
  char     _buffer[SOUNDEX_MAX_LENGTH];
  uint8_t  _length;

  //  Soundex digit for the letters A..Z, 0 means the letter is not coded.
  //  This array can be made smaller (less RAM)
  //  - encode in nibbles. (13 bytes instead of 26) => more code, performance?
  //    0x01, 0x23, 0x01 etc.
  uint8_t  sdx[26] = {0,1,2,3,0,1,2,0,0,2,2,4,5,5,0,1,2,6,2,3,0,1,0,2,0,2 };
};
// -- END OF FILE --

View File

@ -0,0 +1,85 @@
//
// FILE: soundex_performance.ino
// AUTHOR: Rob Tillaart
// PURPOSE: demo
#include "Arduino.h"
#include "Soundex.h"
// Soundex object that is timed in setup()
Soundex SDX;
// start / stop hold the micros() timestamps around one call,
// total accumulates the measured durations
uint32_t start, stop, total;
// current word as returned by strtok()
char *token;
// number of words processed
uint16_t words;
// test text, a writable array as strtok() splits it in place
char str[] = "Lorem ipsum dolor sit amet, \
consectetuer adipiscing elit. Aenean commodo ligula eget dolor. \
Aenean massa. Cum sociis natoque penatibus et magnis dis parturient \
montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, \
pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. \
Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. \
In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. \
Nullam dictum felis eu pede mollis pretium. Integer tincidunt. \
Cras dapibus. Vivamus elementum semper nisi. \
Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu, \
consequat vitae, eleifend ac, enim. Aliquam lorem ante, dapibus in, \
viverra quis, feugiat a, tellus. Phasellus viverra nulla ut metus \
varius laoreet. Quisque rutrum. Aenean imperdiet. Etiam ultricies \
nisi vel augue. Curabitur ullamcorper ultricies nisi. Nam eget dui.";
// Splits the test text into words, times SDX.soundex() for every word
// and prints per word: duration (micros), Soundex code and the word.
// Finally prints the total time, the word count and the average time per word.
void setup()
{
  Serial.begin(115200);
  while (!Serial);
  Serial.println();
  Serial.print("SOUNDEX_LIB_VERSION: ");
  Serial.println(SOUNDEX_LIB_VERSION);
  delay(100);

  // SOUNDEX PER WORD
  token = strtok(str, " ,");
  words = 0;
  total = 0;
  while (token != NULL)
  {
    start = micros();
    char * key = SDX.soundex(token);
    stop = micros();
    // unsigned subtraction stays correct when micros() wraps around
    const uint32_t duration = stop - start;
    Serial.print(duration);
    total += duration;
    Serial.print("\t");
    Serial.print(key);
    Serial.print("\t");
    Serial.print(token);
    Serial.print("\n");
    token = strtok(NULL, " ,");
    words++;
    delay(10);
  }

  // TOTAL TIME
  Serial.print("\nTOTAL: \t");
  Serial.print(total);
  Serial.print("\t");
  Serial.print(words);
  Serial.print("\t");
  // average time per word = total time / number of words
  // (guarded so an empty text cannot divide by zero)
  Serial.print((words > 0) ? (1.0 * total / words) : 0.0, 4);
  Serial.print(" per word\n");
  delay(10);
  Serial.println("\ndone...");
}
// Intentionally empty, the sketch has no repeating work.
void loop() {}
// -- END OF FILE --

View File

@ -0,0 +1,53 @@
//
// FILE: soundex_test.ino
// AUTHOR: Rob Tillaart
// PURPOSE: demo
#include "Arduino.h"
#include "Soundex.h"
// Soundex object used by all examples in setup()
Soundex SDX;
// micros() timestamps taken before and after the timed call
uint32_t start, stop;
// Prints the library version, times one soundex() call and
// prints the codes of some example words, first with the default
// length and then with length 10.
void setup()
{
  Serial.begin(115200);
  while (!Serial) { }
  Serial.println();
  Serial.print("SOUNDEX_LIB_VERSION: ");
  Serial.println(SOUNDEX_LIB_VERSION);
  delay(100);

  // time a single call
  start = micros();
  char * code = SDX.soundex("soundex");
  stop = micros();
  Serial.print(code);
  Serial.print("\t");
  Serial.println(stop - start);

  // examples from wikipedia
  const char * const names[] =
  {
    "Robert",     // R163
    "Rupert",     // R163
    "Rubin",      // R150
    "Tymczak",    // T522
    "Pfister",    // P236
    "Honeyman",   // H555
  };
  for (uint8_t n = 0; n < sizeof(names) / sizeof(names[0]); n++)
  {
    Serial.println(SDX.soundex(names[n]));
  }

  SDX.setLength(10);
  // e.g. for long chemical names
  const char * const chemicals[] =
  {
    "Trichloroethylene",          // T624634500
    "pentacarbon decahydrate",    // P532615323
    "deoxyribonucleic acid",      // D261524223
  };
  for (uint8_t n = 0; n < sizeof(chemicals) / sizeof(chemicals[0]); n++)
  {
    Serial.println(SDX.soundex(chemicals[n]));
  }
}
// Intentionally empty, the sketch has no repeating work.
void loop() {}
// -- END OF FILE --

View File

@ -0,0 +1,16 @@
# Syntax Colouring Map For Soundex
# Data types (KEYWORD1)
Soundex KEYWORD1
# Methods and Functions (KEYWORD2)
soundex KEYWORD2
setLength KEYWORD2
getLength KEYWORD2
# Constants (LITERAL1)
SOUNDEX_LIB_VERSION LITERAL1

View File

@ -0,0 +1,23 @@
{
"name": "Soundex",
"keywords": "Soundex, hash",
"description": "Arduino Library for soundex.",
"authors":
[
{
"name": "Rob Tillaart",
"email": "Rob.Tillaart@gmail.com",
"maintainer": true
}
],
"repository":
{
"type": "git",
"url": "https://github.com/RobTillaart/Soundex.git"
},
"version": "0.1.1",
"license": "MIT",
"frameworks": "arduino",
"platforms": "*",
"headers": "Soundex.h"
}

View File

@ -0,0 +1,11 @@
name=Soundex
version=0.1.1
author=Rob Tillaart <rob.tillaart@gmail.com>
maintainer=Rob Tillaart <rob.tillaart@gmail.com>
sentence=Arduino Library for calculating Soundex hash.
paragraph=
category=Signal Input/Output
url=https://github.com/RobTillaart/Soundex
architectures=*
includes=Soundex.h
depends=

View File

@ -0,0 +1,88 @@
//
// FILE: unit_test_001.cpp
// AUTHOR: Rob Tillaart
// DATE: 2022-02-05
// PURPOSE: unit tests for the Soundex library
// https://github.com/RobTillaart/Soundex
// https://github.com/Arduino-CI/arduino_ci/blob/master/REFERENCE.md
//
// supported assertions
// https://github.com/Arduino-CI/arduino_ci/blob/master/cpp/unittest/Assertion.h#L33-L42
// ----------------------------
// assertEqual(expected, actual)
// assertNotEqual(expected, actual)
// assertLess(expected, actual)
// assertMore(expected, actual)
// assertLessOrEqual(expected, actual)
// assertMoreOrEqual(expected, actual)
// assertTrue(actual)
// assertFalse(actual)
// assertNull(actual)
// assertNotNull(actual)
#include <ArduinoUnitTests.h>
#include "Arduino.h"
#include "Soundex.h"
// Runs before the unit tests: prints the library version to stderr.
unittest_setup()
{
fprintf(stderr, "SOUNDEX_LIB_VERSION: %s\n", (char *) SOUNDEX_LIB_VERSION);
}
// Nothing to clean up after the tests.
unittest_teardown()
{
}
// Checks the codes produced by a freshly constructed object
// (no setLength() call): one letter followed by 3 digits.
unittest(test_soundex_3)
{
Soundex SDX;
// examples from Wikipedia
assertEqual("R163", SDX.soundex("Robert"));
assertEqual("R163", SDX.soundex("Rupert"));
assertEqual("R150", SDX.soundex("Rubin"));
assertEqual("T522", SDX.soundex("Tymczak"));
assertEqual("P236", SDX.soundex("Pfister"));
assertEqual("H555", SDX.soundex("Honeyman"));
}
// Checks codes of length 10, including '0' padding of a short code
// and input that consists of more than one word.
unittest(test_soundex_chemicals)
{
Soundex SDX;
SDX.setLength(10);
// e.g. for long chemical names
assertEqual("T624634500", SDX.soundex("Trichloroethylene"));
assertEqual("P532615323", SDX.soundex("pentacarbon decahydrate"));
assertEqual("D261524223", SDX.soundex("deoxyribonucleic acid"));
}
// Checks the default length, that lengths 4..11 are stored unchanged
// and that 12 is limited to 11.
unittest(test_getLength)
{
Soundex SDX;
// default length
assertEqual(4, SDX.getLength());
for (int i = 4; i < 12; i++)
{
SDX.setLength(i);
assertEqual(i, SDX.getLength());
}
// 12 is above the maximum, expect the limit of 11
SDX.setLength(12);
assertEqual(11, SDX.getLength());
}
// arduino_ci macro that closes the test file (see REFERENCE.md linked in the header)
unittest_main()
// --------