Merge pull request #46 from dscripka/variable_input_size

Variable input size
dscripka · Sep 2, 2023 · d8ed6cb · d8ed6cb
2 parents 690eff8 + ee0a318
commit d8ed6cb
Show file tree

Hide file tree

Showing 4 changed files with 155 additions and 51 deletions.
diff --git a/openwakeword/model.py b/openwakeword/model.py
@@ -97,7 +97,7 @@ def __init__(
                         raise ValueError("Could not find pretrained model for model name '{}'".format(i))
                     else:
                         wakeword_models[ndx] = matching_model[0]
-                        wakeword_model_names.append(matching_model[0].split(os.path.sep)[-1])
+                        wakeword_model_names.append(i)
 
         # Create attributes to store models and metadata
         self.models = {}
@@ -231,9 +231,11 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi
         """Predict with all of the wakeword models on the input audio frames
 
         Args:
-            x (Union[ndarray]): The input audio data to predict on with the models. Should be multiples of 80 ms
+            x (ndarray): The input audio data to predict on with the models. Ideally should be multiples of 80 ms
                                 (1280 samples), with longer lengths reducing overall CPU usage
-                                but decreasing detection latency.
+                                but increasing detection latency. Input audio with durations greater than or less
+                                than 80 ms is also supported, though this will add a detection delay of up to 80 ms
+                                as the appropriate number of samples is accumulated.
             patience (dict): How many consecutive frames (of 1280 samples or 80 ms) above the threshold that must
                              be observed before the current frame will be returned as non-zero.
                              Must be provided as an a dictionary where the keys are the
@@ -251,6 +253,9 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi
                   wake-word/wake-phrase detected. If the `timing` argument is true, returns a
                   tuple of dicts containing model predictions and timing information, respectively.
         """
+        # Check input data type
+        if not isinstance(x, np.ndarray):
+            raise ValueError(f"The input audio data (x) must by a Numpy array, instead received an object of type {type(x)}.")
 
         # Setup timing dict
         if timing:
@@ -260,9 +265,9 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi
 
         # Get audio features (optionally with Speex noise suppression)
         if self.speex_ns:
-            self.preprocessor(self._suppress_noise_with_speex(x))
+            n_prepared_samples = self.preprocessor(self._suppress_noise_with_speex(x))
         else:
-            self.preprocessor(x)
+            n_prepared_samples = self.preprocessor(x)
 
         if timing:
             timing_dict["models"]["preprocessor"] = time.time() - feature_start
@@ -274,9 +279,9 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi
                 model_start = time.time()
 
             # Run model to get predictions
-            if len(x) > 1280:
+            if n_prepared_samples > 1280:
                 group_predictions = []
-                for i in np.arange(len(x)//1280-1, -1, -1):
+                for i in np.arange(n_prepared_samples//1280-1, -1, -1):
                     group_predictions.extend(
                         self.model_prediction_function[mdl](
                             self.preprocessor.get_features(
@@ -286,10 +291,19 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi
                         )
                     )
                 prediction = np.array(group_predictions).max(axis=0)[None, ]
-            else:
+            elif n_prepared_samples == 1280:
                 prediction = self.model_prediction_function[mdl](
                     self.preprocessor.get_features(self.model_inputs[mdl])
                 )
+            elif n_prepared_samples < 1280:  # get previous prediction if there aren't enough samples
+                if self.model_outputs[mdl] == 1:
+                    if len(self.prediction_buffer[mdl]) > 0:
+                        prediction = [[[self.prediction_buffer[mdl][-1]]]]
+                    else:
+                        prediction = [[[0]]]
+                elif self.model_outputs[mdl] != 1:
+                    n_classes = max([int(i) for i in self.class_mapping[mdl].keys()])
+                    prediction = [[[0]*(n_classes+1)]]
 
             if self.model_outputs[mdl] == 1:
                 predictions[mdl] = prediction[0][0][0]

diff --git a/openwakeword/utils.py b/openwakeword/utils.py
@@ -162,7 +162,7 @@ def tflite_embedding_predict(x):
         self.melspectrogram_buffer = np.ones((76, 32))  # n_frames x num_features
         self.melspectrogram_max_len = 10*97  # 97 is the number of frames in 1 second of 16hz audio
         self.accumulated_samples = 0  # the samples added to the buffer since the audio preprocessor was last called
-        # self.feature_buffer = np.vstack([self._get_embeddings(np.random.randint(-1000, 1000, 1280).astype(np.int16)) for _ in range(10)])
+        self.raw_data_remainder = np.empty(0)
         self.feature_buffer = self._get_embeddings(np.random.randint(-1000, 1000, 16000*4).astype(np.int16))
         self.feature_buffer_max_len = 120  # ~10 seconds of feature buffer history
 
@@ -377,6 +377,9 @@ def _streaming_melspectrogram(self, n_samples):
         clip is calculated. It's unclear if this difference is significant and will impact model performance.
         In particular padding with 0 or very small values seems to demonstrate the differences well.
         """
+        if len(self.raw_data_buffer) < 400:
+            raise ValueError("The number of input frames must be at least 400 samples @ 16khz (25 ms)!")
+
         self.melspectrogram_buffer = np.vstack(
             (self.melspectrogram_buffer, self._get_melspectrogram(list(self.raw_data_buffer)[-n_samples-160*3:]))
         )
@@ -388,21 +391,33 @@ def _buffer_raw_data(self, x):
         """
         Adds raw audio data to the input buffer
         """
-        if len(x) < 400:
-            raise ValueError("The number of input frames must be at least 400 samples @ 16khz (25 ms)!")
         self.raw_data_buffer.extend(x.tolist() if isinstance(x, np.ndarray) else x)
 
     def _streaming_features(self, x):
-        # if len(x) != 1280:
-        #     raise ValueError("You must provide input samples in frames of 1280 samples @ 1600khz."
-        #                      f"Received a frame of {len(x)} samples.")
-
-        # Add raw audio data to buffer
-        self._buffer_raw_data(x)
-        self.accumulated_samples += len(x)
+        # Add raw audio data to buffer, temporarily storing leftover samples when the total is not a whole number of 80 ms chunks
+        processed_samples = 0
+
+        if self.raw_data_remainder.shape[0] != 0:
+            x = np.concatenate((self.raw_data_remainder, x))
+            self.raw_data_remainder = np.empty(0)
+
+        if self.accumulated_samples + x.shape[0] >= 1280:
+            remainder = (self.accumulated_samples + x.shape[0]) % 1280
+            if remainder != 0:
+                x_even_chunks = x[0:-remainder]
+                self._buffer_raw_data(x_even_chunks)
+                self.accumulated_samples += len(x_even_chunks)
+                self.raw_data_remainder = x[-remainder:]
+            elif remainder == 0:
+                self._buffer_raw_data(x)
+                self.accumulated_samples += x.shape[0]
+                self.raw_data_remainder = np.empty(0)
+        else:
+            self.accumulated_samples += x.shape[0]
+            self._buffer_raw_data(x)
 
-        # Only calculate melspectrogram once minimum samples area accumulated
-        if self.accumulated_samples >= 1280:
+        # Only calculate melspectrogram once minimum samples are accumulated
+        if self.accumulated_samples >= 1280 and self.accumulated_samples % 1280 == 0:
             self._streaming_melspectrogram(self.accumulated_samples)
 
             # Calculate new audio embeddings/features based on update melspectrograms
@@ -415,11 +430,14 @@ def _streaming_features(self, x):
                                                     self.embedding_model_predict(x)))
 
             # Reset raw data buffer counter
+            processed_samples = self.accumulated_samples
             self.accumulated_samples = 0
 
         if self.feature_buffer.shape[0] > self.feature_buffer_max_len:
             self.feature_buffer = self.feature_buffer[-self.feature_buffer_max_len:, :]
 
+        return processed_samples if processed_samples != 0 else self.accumulated_samples
+
     def get_features(self, n_feature_frames: int = 16, start_ndx: int = -1):
         if start_ndx != -1:
             end_ndx = start_ndx + int(n_feature_frames) \
@@ -429,7 +447,7 @@ def get_features(self, n_feature_frames: int = 16, start_ndx: int = -1):
             return self.feature_buffer[int(-1*n_feature_frames):, :][None, ].astype(np.float32)
 
     def __call__(self, x):
-        self._streaming_features(x)
+        return self._streaming_features(x)
 
 
 # Bulk prediction function

diff --git a/setup.py b/setup.py
@@ -40,7 +40,9 @@ def build_additional_requires():
                     'pytest-cov>=2.10.1,<3',
                     'pytest-flake8>=1.1.1,<2',
                     'flake8>=4.0,<4.1',
-                    'pytest-mypy>=0.10.0,<1'
+                    'pytest-mypy>=0.10.0,<1',
+                    'mock>=5.1,<6',
+                    'types-mock>=5.1,<6'
                 ],
         'full': [
                     'mutagen>=1.46.0,<2',

diff --git a/tests/test_models.py b/tests/test_models.py
@@ -29,11 +29,16 @@
 # Imports
 import openwakeword
 import os
+import sys
+import logging
 import numpy as np
 from pathlib import Path
 import collections
 import pytest
 import platform
+import pickle
+import tempfile
+import mock
 
 
 # Tests
@@ -51,8 +56,67 @@ def test_load_models_by_path(self):
         # Prediction on random data
         owwModel.predict(np.random.randint(-1000, 1000, 1280).astype(np.int16))
 
-        # Prediction on random data with different chunk size
-        owwModel.predict(np.random.randint(-1000, 1000, 1280*2).astype(np.int16))
+    def test_predict_with_different_frame_sizes(self):
+        # Test with binary model
+        owwModel1 = openwakeword.Model(wakeword_models=[
+                                        os.path.join("openwakeword", "resources", "models", "alexa_v0.1.onnx")
+                                      ], inference_framework="onnx")
+
+        owwModel2 = openwakeword.Model(wakeword_models=[
+                                        os.path.join("openwakeword", "resources", "models", "alexa_v0.1.onnx")
+                                      ], inference_framework="onnx")
+
+        # Prediction on a test clip with chunk sizes that are integer multiples of the standard chunk size (1280 samples)
+        predictions1 = owwModel1.predict_clip(os.path.join("tests", "data", "alexa_test.wav"), chunk_size=1280)
+        predictions2 = owwModel2.predict_clip(os.path.join("tests", "data", "alexa_test.wav"), chunk_size=1280*2)
+        np.testing.assert_approx_equal(max([i['alexa_v0.1'] for i in predictions1]), max([i['alexa_v0.1'] for i in predictions2]), 5)
+
+        # Prediction on data with a chunk size not an integer multiple of 1280
+        predictions1 = owwModel1.predict_clip(os.path.join("tests", "data", "alexa_test.wav"), chunk_size=1024)
+        predictions2 = owwModel2.predict_clip(os.path.join("tests", "data", "alexa_test.wav"), chunk_size=1024*2)
+        np.testing.assert_approx_equal(max([i['alexa_v0.1'] for i in predictions1]), max([i['alexa_v0.1'] for i in predictions2]), 5)
+
+        # Test with multiclass model
+        owwModel1 = openwakeword.Model(wakeword_models=["timer"], inference_framework="onnx")
+        owwModel2 = openwakeword.Model(wakeword_models=["timer"], inference_framework="onnx")
+
+        # Prediction on a test clip with chunk sizes that are integer multiples of the standard chunk size (1280 samples)
+        predictions1 = owwModel1.predict_clip(os.path.join("tests", "data", "alexa_test.wav"), chunk_size=1280)
+        predictions2 = owwModel2.predict_clip(os.path.join("tests", "data", "alexa_test.wav"), chunk_size=1280*2)
+        assert abs(max([i['1_minute_timer'] for i in predictions1]) - max([i['1_minute_timer'] for i in predictions2])) < 0.00001
+
+        # Prediction on data with a chunk size not an integer multiple of 1280
+        predictions1 = owwModel1.predict_clip(os.path.join("tests", "data", "alexa_test.wav"), chunk_size=1024)
+        predictions2 = owwModel2.predict_clip(os.path.join("tests", "data", "alexa_test.wav"), chunk_size=1024*2)
+        assert abs(max([i['1_minute_timer'] for i in predictions1]) - max([i['1_minute_timer'] for i in predictions2])) < 0.00001
+
+    def test_exception_handling_for_inference_framework(self):
+        with mock.patch.dict(sys.modules, {'onnxruntime': None}):
+            with pytest.raises(ValueError):
+                openwakeword.Model(wakeword_models=[
+                                                os.path.join("openwakeword", "resources", "models", "alexa_v0.1.onnx")
+                                            ], inference_framework="onnx")
+
+        with mock.patch.dict(sys.modules, {'tflite_runtime': None}):
+            openwakeword.Model(wakeword_models=[
+                                            os.path.join("openwakeword", "resources", "models", "alexa_v0.1.tflite")
+                                        ], inference_framework="tflite")
+
+    def test_predict_with_custom_verifier_model(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            # Train custom verifier model with random data
+            verifier_model = openwakeword.custom_verifier_model.train_verifier_model(np.random.random((2, 1536)), np.array([0, 1]))
+            pickle.dump(verifier_model, open(os.path.join(tmp_dir, "test_verifier.pkl"), "wb"))
+
+            # Load model with verifier
+            owwModel = openwakeword.Model(
+                wakeword_models=[os.path.join("openwakeword", "resources", "models", "alexa_v0.1.onnx")],
+                inference_framework="onnx",
+                custom_verifier_models={"alexa_v0.1": os.path.join(tmp_dir, "test_verifier.pkl")},
+                custom_verifier_threshold=0.0
+            )
+
+            owwModel.predict(np.random.randint(-1000, 1000, 1280).astype(np.int16))
 
     def test_load_pretrained_model_by_name(self):
         # Load model with defaults
@@ -109,31 +173,37 @@ def test_models_with_speex_noise_cancellation(self):
             assert 1 == 1
         else:
             # Load model with defaults
-            owwModel = openwakeword.Model(enable_speex_noise_suppression=True)
-
-            # Get clips for each model (assumes that test clips will have the model name in the filename)
-            test_dict = {}
-            for mdl_name in owwModel.models.keys():
-                all_clips = [str(i) for i in Path(os.path.join("tests", "data")).glob("*.wav")]
-                test_dict[mdl_name] = [i for i in all_clips if mdl_name in i]
-
-            # Predict
-            for model, clips in test_dict.items():
-                for clip in clips:
-                    # Get predictions for reach frame in the clip
-                    predictions = owwModel.predict_clip(clip)
-                    owwModel.reset()  # reset after each clip to ensure independent results
-
-                    # Make predictions dictionary flatter
-                    predictions_flat = collections.defaultdict(list)
-                    [predictions_flat[key].append(i[key]) for i in predictions for key in i.keys()]
-
-                # Check scores against default threshold (0.5)
-                for key in predictions_flat.keys():
-                    if key in clip:
-                        assert max(predictions_flat[key]) >= 0.5
-                    else:
-                        assert max(predictions_flat[key]) < 0.5
+            try:
+                owwModel = openwakeword.Model(enable_speex_noise_suppression=True)
+
+                # Get clips for each model (assumes that test clips will have the model name in the filename)
+                test_dict = {}
+                for mdl_name in owwModel.models.keys():
+                    all_clips = [str(i) for i in Path(os.path.join("tests", "data")).glob("*.wav")]
+                    test_dict[mdl_name] = [i for i in all_clips if mdl_name in i]
+
+                # Predict
+                for model, clips in test_dict.items():
+                    for clip in clips:
+                        # Get predictions for each frame in the clip
+                        predictions = owwModel.predict_clip(clip)
+                        owwModel.reset()  # reset after each clip to ensure independent results
+
+                        # Make predictions dictionary flatter
+                        predictions_flat = collections.defaultdict(list)
+                        [predictions_flat[key].append(i[key]) for i in predictions for key in i.keys()]
+
+                    # Check scores against default threshold (0.5)
+                    for key in predictions_flat.keys():
+                        if key in clip:
+                            assert max(predictions_flat[key]) >= 0.5
+                        else:
+                            assert max(predictions_flat[key]) < 0.5
+            except ImportError:
+                logging.warning("Attemped to test Speex noise cancelling functionality, but the 'speexdsp_ns' library was not installed!"
+                                " If you want these tests to be run, install this library as shown in the openwakeword documentation."
+                                )
+                assert 1 == 1
 
     def test_models_with_vad(self):
         # Load model with defaults
@@ -201,8 +271,8 @@ def test_get_parent_model_from_prediction_label(self):
 
     def test_get_positive_prediction_frames(self):
         owwModel = openwakeword.Model(wakeword_models=[
-                                        os.path.join("openwakeword", "resources", "models", "alexa_v0.1.tflite")
-                                      ], inference_framework="tflite")
+                                        os.path.join("openwakeword", "resources", "models", "alexa_v0.1.onnx")
+                                      ], inference_framework="onnx")
 
         clip = os.path.join("tests", "data", "alexa_test.wav")
         features = owwModel._get_positive_prediction_frames(clip)