Skip to content

Commit

Permalink
Merge pull request #46 from dscripka/variable_input_size
Browse files Browse the repository at this point in the history
Variable input size
  • Loading branch information
dscripka authored Sep 2, 2023
2 parents 690eff8 + ee0a318 commit d8ed6cb
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 51 deletions.
30 changes: 22 additions & 8 deletions openwakeword/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def __init__(
raise ValueError("Could not find pretrained model for model name '{}'".format(i))
else:
wakeword_models[ndx] = matching_model[0]
wakeword_model_names.append(matching_model[0].split(os.path.sep)[-1])
wakeword_model_names.append(i)

# Create attributes to store models and metadata
self.models = {}
Expand Down Expand Up @@ -231,9 +231,11 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi
"""Predict with all of the wakeword models on the input audio frames
Args:
x (Union[ndarray]): The input audio data to predict on with the models. Should be multiples of 80 ms
x (ndarray): The input audio data to predict on with the models. Ideally should be multiples of 80 ms
(1280 samples), with longer lengths reducing overall CPU usage
but decreasing detection latency.
but increasing detection latency. Input audio with durations greater than or less
than 80 ms is also supported, though this will add a detection delay of up to 80 ms
as the appropriate number of samples are accumulated.
patience (dict): How many consecutive frames (of 1280 samples or 80 ms) above the threshold that must
be observed before the current frame will be returned as non-zero.
Must be provided as a dictionary where the keys are the
Expand All @@ -251,6 +253,9 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi
wake-word/wake-phrase detected. If the `timing` argument is true, returns a
tuple of dicts containing model predictions and timing information, respectively.
"""
# Check input data type
if not isinstance(x, np.ndarray):
raise ValueError(f"The input audio data (x) must by a Numpy array, instead received an object of type {type(x)}.")

# Setup timing dict
if timing:
Expand All @@ -260,9 +265,9 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi

# Get audio features (optionally with Speex noise suppression)
if self.speex_ns:
self.preprocessor(self._suppress_noise_with_speex(x))
n_prepared_samples = self.preprocessor(self._suppress_noise_with_speex(x))
else:
self.preprocessor(x)
n_prepared_samples = self.preprocessor(x)

if timing:
timing_dict["models"]["preprocessor"] = time.time() - feature_start
Expand All @@ -274,9 +279,9 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi
model_start = time.time()

# Run model to get predictions
if len(x) > 1280:
if n_prepared_samples > 1280:
group_predictions = []
for i in np.arange(len(x)//1280-1, -1, -1):
for i in np.arange(n_prepared_samples//1280-1, -1, -1):
group_predictions.extend(
self.model_prediction_function[mdl](
self.preprocessor.get_features(
Expand All @@ -286,10 +291,19 @@ def predict(self, x: np.ndarray, patience: dict = {}, threshold: dict = {}, timi
)
)
prediction = np.array(group_predictions).max(axis=0)[None, ]
else:
elif n_prepared_samples == 1280:
prediction = self.model_prediction_function[mdl](
self.preprocessor.get_features(self.model_inputs[mdl])
)
elif n_prepared_samples < 1280: # get previous prediction if there aren't enough samples
if self.model_outputs[mdl] == 1:
if len(self.prediction_buffer[mdl]) > 0:
prediction = [[[self.prediction_buffer[mdl][-1]]]]
else:
prediction = [[[0]]]
elif self.model_outputs[mdl] != 1:
n_classes = max([int(i) for i in self.class_mapping[mdl].keys()])
prediction = [[[0]*(n_classes+1)]]

if self.model_outputs[mdl] == 1:
predictions[mdl] = prediction[0][0][0]
Expand Down
44 changes: 31 additions & 13 deletions openwakeword/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def tflite_embedding_predict(x):
self.melspectrogram_buffer = np.ones((76, 32)) # n_frames x num_features
self.melspectrogram_max_len = 10*97 # 97 is the number of frames in 1 second of 16 kHz audio
self.accumulated_samples = 0 # the samples added to the buffer since the audio preprocessor was last called
# self.feature_buffer = np.vstack([self._get_embeddings(np.random.randint(-1000, 1000, 1280).astype(np.int16)) for _ in range(10)])
self.raw_data_remainder = np.empty(0)
self.feature_buffer = self._get_embeddings(np.random.randint(-1000, 1000, 16000*4).astype(np.int16))
self.feature_buffer_max_len = 120 # ~10 seconds of feature buffer history

Expand Down Expand Up @@ -377,6 +377,9 @@ def _streaming_melspectrogram(self, n_samples):
clip is calculated. It's unclear if this difference is significant and will impact model performance.
In particular padding with 0 or very small values seems to demonstrate the differences well.
"""
if len(self.raw_data_buffer) < 400:
raise ValueError("The number of input frames must be at least 400 samples @ 16khz (25 ms)!")

self.melspectrogram_buffer = np.vstack(
(self.melspectrogram_buffer, self._get_melspectrogram(list(self.raw_data_buffer)[-n_samples-160*3:]))
)
Expand All @@ -388,21 +391,33 @@ def _buffer_raw_data(self, x):
"""
Adds raw audio data to the input buffer
"""
if len(x) < 400:
raise ValueError("The number of input frames must be at least 400 samples @ 16khz (25 ms)!")
self.raw_data_buffer.extend(x.tolist() if isinstance(x, np.ndarray) else x)

def _streaming_features(self, x):
# if len(x) != 1280:
# raise ValueError("You must provide input samples in frames of 1280 samples @ 1600khz."
# f"Received a frame of {len(x)} samples.")

# Add raw audio data to buffer
self._buffer_raw_data(x)
self.accumulated_samples += len(x)
# Add raw audio data to buffer, temporarily storing extra frames if not an even number of 80 ms chunks
processed_samples = 0

if self.raw_data_remainder.shape[0] != 0:
x = np.concatenate((self.raw_data_remainder, x))
self.raw_data_remainder = np.empty(0)

if self.accumulated_samples + x.shape[0] >= 1280:
remainder = (self.accumulated_samples + x.shape[0]) % 1280
if remainder != 0:
x_even_chunks = x[0:-remainder]
self._buffer_raw_data(x_even_chunks)
self.accumulated_samples += len(x_even_chunks)
self.raw_data_remainder = x[-remainder:]
elif remainder == 0:
self._buffer_raw_data(x)
self.accumulated_samples += x.shape[0]
self.raw_data_remainder = np.empty(0)
else:
self.accumulated_samples += x.shape[0]
self._buffer_raw_data(x)

# Only calculate melspectrogram once minimum samples area accumulated
if self.accumulated_samples >= 1280:
# Only calculate melspectrogram once minimum samples are accumulated
if self.accumulated_samples >= 1280 and self.accumulated_samples % 1280 == 0:
self._streaming_melspectrogram(self.accumulated_samples)

# Calculate new audio embeddings/features based on updated melspectrograms
Expand All @@ -415,11 +430,14 @@ def _streaming_features(self, x):
self.embedding_model_predict(x)))

# Reset raw data buffer counter
processed_samples = self.accumulated_samples
self.accumulated_samples = 0

if self.feature_buffer.shape[0] > self.feature_buffer_max_len:
self.feature_buffer = self.feature_buffer[-self.feature_buffer_max_len:, :]

return processed_samples if processed_samples != 0 else self.accumulated_samples

def get_features(self, n_feature_frames: int = 16, start_ndx: int = -1):
if start_ndx != -1:
end_ndx = start_ndx + int(n_feature_frames) \
Expand All @@ -429,7 +447,7 @@ def get_features(self, n_feature_frames: int = 16, start_ndx: int = -1):
return self.feature_buffer[int(-1*n_feature_frames):, :][None, ].astype(np.float32)

def __call__(self, x):
self._streaming_features(x)
return self._streaming_features(x)


# Bulk prediction function
Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ def build_additional_requires():
'pytest-cov>=2.10.1,<3',
'pytest-flake8>=1.1.1,<2',
'flake8>=4.0,<4.1',
'pytest-mypy>=0.10.0,<1'
'pytest-mypy>=0.10.0,<1',
'mock>=5.1,<6',
'types-mock>=5.1,<6'
],
'full': [
'mutagen>=1.46.0,<2',
Expand Down
128 changes: 99 additions & 29 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,16 @@
# Imports
import openwakeword
import os
import sys
import logging
import numpy as np
from pathlib import Path
import collections
import pytest
import platform
import pickle
import tempfile
import mock


# Tests
Expand All @@ -51,8 +56,67 @@ def test_load_models_by_path(self):
# Prediction on random data
owwModel.predict(np.random.randint(-1000, 1000, 1280).astype(np.int16))

# Prediction on random data with different chunk size
owwModel.predict(np.random.randint(-1000, 1000, 1280*2).astype(np.int16))
def test_predict_with_different_frame_sizes(self):
    """Check that peak scores agree when one clip is streamed at different chunk sizes.

    For both a binary model ("alexa_v0.1") and a multiclass model ("timer"),
    two identically configured models run `predict_clip` over the same wav
    file, the second one using twice the chunk size of the first. The maximum
    score for the tracked label must match between the two runs, first with
    chunk sizes that are multiples of 1280 samples and then with chunk sizes
    that are not.
    """
    clip = os.path.join("tests", "data", "alexa_test.wav")
    alexa_path = os.path.join("openwakeword", "resources", "models", "alexa_v0.1.onnx")

    def peak(frames, label):
        # Highest score reported for `label` across all returned frames
        return max(frame[label] for frame in frames)

    # Binary model: two independent instances so each keeps its own streaming state
    model_a = openwakeword.Model(wakeword_models=[alexa_path], inference_framework="onnx")
    model_b = openwakeword.Model(wakeword_models=[alexa_path], inference_framework="onnx")

    # 1280 is the standard chunk size; 1024 is not an integer multiple of it
    for base_chunk in (1280, 1024):
        frames_a = model_a.predict_clip(clip, chunk_size=base_chunk)
        frames_b = model_b.predict_clip(clip, chunk_size=base_chunk*2)
        np.testing.assert_approx_equal(peak(frames_a, 'alexa_v0.1'), peak(frames_b, 'alexa_v0.1'), 5)

    # Multiclass model: same comparison, tracked on a single class label
    model_a = openwakeword.Model(wakeword_models=["timer"], inference_framework="onnx")
    model_b = openwakeword.Model(wakeword_models=["timer"], inference_framework="onnx")

    for base_chunk in (1280, 1024):
        frames_a = model_a.predict_clip(clip, chunk_size=base_chunk)
        frames_b = model_b.predict_clip(clip, chunk_size=base_chunk*2)
        assert abs(peak(frames_a, '1_minute_timer') - peak(frames_b, '1_minute_timer')) < 0.00001

def test_exception_handling_for_inference_framework(self):
    """Exercise model construction when an inference backend cannot be imported.

    Mapping a module name to None in `sys.modules` makes any import of that
    module fail. With `onnxruntime` blocked, building an ONNX model is
    expected to raise ValueError. With `tflite_runtime` blocked, building a
    tflite model must complete without raising.
    """
    models_dir = os.path.join("openwakeword", "resources", "models")

    # ONNX backend unavailable -> constructor must raise ValueError
    with mock.patch.dict(sys.modules, {'onnxruntime': None}), pytest.raises(ValueError):
        openwakeword.Model(
            wakeword_models=[os.path.join(models_dir, "alexa_v0.1.onnx")],
            inference_framework="onnx"
        )

    # tflite backend unavailable -> constructor is still expected to succeed
    with mock.patch.dict(sys.modules, {'tflite_runtime': None}):
        openwakeword.Model(
            wakeword_models=[os.path.join(models_dir, "alexa_v0.1.tflite")],
            inference_framework="tflite"
        )

def test_predict_with_custom_verifier_model(self):
    """Smoke-test prediction with a custom verifier model attached.

    A verifier model is trained on random data, pickled into a temporary
    directory, and passed to `openwakeword.Model` through
    `custom_verifier_models` for the "alexa_v0.1" model. One prediction on a
    random 1280-sample int16 frame is then run; the test passes if nothing
    raises.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        verifier_path = os.path.join(tmp_dir, "test_verifier.pkl")

        # Train custom verifier model with random data
        verifier_model = openwakeword.custom_verifier_model.train_verifier_model(np.random.random((2, 1536)), np.array([0, 1]))

        # Write through a context manager so the handle is flushed and closed
        # before the path is handed to the model and before the temporary
        # directory is cleaned up (a bare open() left the file object unclosed)
        with open(verifier_path, "wb") as verifier_file:
            pickle.dump(verifier_model, verifier_file)

        # Load model with verifier
        owwModel = openwakeword.Model(
            wakeword_models=[os.path.join("openwakeword", "resources", "models", "alexa_v0.1.onnx")],
            inference_framework="onnx",
            custom_verifier_models={"alexa_v0.1": verifier_path},
            custom_verifier_threshold=0.0
        )

        owwModel.predict(np.random.randint(-1000, 1000, 1280).astype(np.int16))

def test_load_pretrained_model_by_name(self):
# Load model with defaults
Expand Down Expand Up @@ -109,31 +173,37 @@ def test_models_with_speex_noise_cancellation(self):
assert 1 == 1
else:
# Load model with defaults
owwModel = openwakeword.Model(enable_speex_noise_suppression=True)

# Get clips for each model (assumes that test clips will have the model name in the filename)
test_dict = {}
for mdl_name in owwModel.models.keys():
all_clips = [str(i) for i in Path(os.path.join("tests", "data")).glob("*.wav")]
test_dict[mdl_name] = [i for i in all_clips if mdl_name in i]

# Predict
for model, clips in test_dict.items():
for clip in clips:
# Get predictions for reach frame in the clip
predictions = owwModel.predict_clip(clip)
owwModel.reset() # reset after each clip to ensure independent results

# Make predictions dictionary flatter
predictions_flat = collections.defaultdict(list)
[predictions_flat[key].append(i[key]) for i in predictions for key in i.keys()]

# Check scores against default threshold (0.5)
for key in predictions_flat.keys():
if key in clip:
assert max(predictions_flat[key]) >= 0.5
else:
assert max(predictions_flat[key]) < 0.5
try:
owwModel = openwakeword.Model(enable_speex_noise_suppression=True)

# Get clips for each model (assumes that test clips will have the model name in the filename)
test_dict = {}
for mdl_name in owwModel.models.keys():
all_clips = [str(i) for i in Path(os.path.join("tests", "data")).glob("*.wav")]
test_dict[mdl_name] = [i for i in all_clips if mdl_name in i]

# Predict
for model, clips in test_dict.items():
for clip in clips:
# Get predictions for each frame in the clip
predictions = owwModel.predict_clip(clip)
owwModel.reset() # reset after each clip to ensure independent results

# Make predictions dictionary flatter
predictions_flat = collections.defaultdict(list)
[predictions_flat[key].append(i[key]) for i in predictions for key in i.keys()]

# Check scores against default threshold (0.5)
for key in predictions_flat.keys():
if key in clip:
assert max(predictions_flat[key]) >= 0.5
else:
assert max(predictions_flat[key]) < 0.5
except ImportError:
logging.warning("Attemped to test Speex noise cancelling functionality, but the 'speexdsp_ns' library was not installed!"
" If you want these tests to be run, install this library as shown in the openwakeword documentation."
)
assert 1 == 1

def test_models_with_vad(self):
# Load model with defaults
Expand Down Expand Up @@ -201,8 +271,8 @@ def test_get_parent_model_from_prediction_label(self):

def test_get_positive_prediction_frames(self):
owwModel = openwakeword.Model(wakeword_models=[
os.path.join("openwakeword", "resources", "models", "alexa_v0.1.tflite")
], inference_framework="tflite")
os.path.join("openwakeword", "resources", "models", "alexa_v0.1.onnx")
], inference_framework="onnx")

clip = os.path.join("tests", "data", "alexa_test.wav")
features = owwModel._get_positive_prediction_frames(clip)
Expand Down

0 comments on commit d8ed6cb

Please sign in to comment.