diet_classifier.py
from __future__ import annotations
import copy
import logging
from collections import defaultdict
from pathlib import Path
from rasa.exceptions import ModelNotFound
from rasa.nlu.featurizers.featurizer import Featurizer
import numpy as np
import scipy.sparse
import tensorflow as tf
from typing import Any, Dict, List, Optional, Text, Tuple, Union, TypeVar, Type
from rasa.engine.graph import ExecutionContext, GraphComponent
from rasa.engine.recipes.default_recipe import DefaultV1Recipe
from rasa.engine.storage.resource import Resource
from rasa.engine.storage.storage import ModelStorage
from rasa.nlu.extractors.extractor import EntityExtractorMixin
from rasa.nlu.classifiers.classifier import IntentClassifier
import rasa.shared.utils.io
import rasa.utils.io as io_utils
import rasa.nlu.utils.bilou_utils as bilou_utils
from rasa.shared.constants import DIAGNOSTIC_DATA
from rasa.nlu.extractors.extractor import EntityTagSpec
from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
from rasa.utils import train_utils
from rasa.utils.tensorflow import rasa_layers
from rasa.utils.tensorflow.models import RasaModel, TransformerRasaModel
from rasa.utils.tensorflow.model_data import (
RasaModelData,
FeatureSignature,
FeatureArray,
)
from rasa.nlu.constants import TOKENS_NAMES, DEFAULT_TRANSFORMER_SIZE
from rasa.shared.nlu.constants import (
SPLIT_ENTITIES_BY_COMMA_DEFAULT_VALUE,
TEXT,
INTENT,
INTENT_RESPONSE_KEY,
ENTITIES,
ENTITY_ATTRIBUTE_TYPE,
ENTITY_ATTRIBUTE_GROUP,
ENTITY_ATTRIBUTE_ROLE,
NO_ENTITY_TAG,
SPLIT_ENTITIES_BY_COMMA,
)
from rasa.shared.exceptions import InvalidConfigException
from rasa.shared.nlu.training_data.training_data import TrainingData
from rasa.shared.nlu.training_data.message import Message
from rasa.utils.tensorflow.constants import (
LABEL,
IDS,
HIDDEN_LAYERS_SIZES,
RENORMALIZE_CONFIDENCES,
SHARE_HIDDEN_LAYERS,
TRANSFORMER_SIZE,
NUM_TRANSFORMER_LAYERS,
NUM_HEADS,
BATCH_SIZES,
BATCH_STRATEGY,
EPOCHS,
RANDOM_SEED,
LEARNING_RATE,
RANKING_LENGTH,
LOSS_TYPE,
SIMILARITY_TYPE,
NUM_NEG,
SPARSE_INPUT_DROPOUT,
DENSE_INPUT_DROPOUT,
MASKED_LM,
ENTITY_RECOGNITION,
TENSORBOARD_LOG_DIR,
INTENT_CLASSIFICATION,
EVAL_NUM_EXAMPLES,
EVAL_NUM_EPOCHS,
UNIDIRECTIONAL_ENCODER,
DROP_RATE,
DROP_RATE_ATTENTION,
CONNECTION_DENSITY,
NEGATIVE_MARGIN_SCALE,
REGULARIZATION_CONSTANT,
SCALE_LOSS,
USE_MAX_NEG_SIM,
MAX_NEG_SIM,
MAX_POS_SIM,
EMBEDDING_DIMENSION,
BILOU_FLAG,
KEY_RELATIVE_ATTENTION,
VALUE_RELATIVE_ATTENTION,
MAX_RELATIVE_POSITION,
AUTO,
BALANCED,
CROSS_ENTROPY,
TENSORBOARD_LOG_LEVEL,
CONCAT_DIMENSION,
FEATURIZERS,
CHECKPOINT_MODEL,
SEQUENCE,
SENTENCE,
SEQUENCE_LENGTH,
DENSE_DIMENSION,
MASK,
CONSTRAIN_SIMILARITIES,
MODEL_CONFIDENCE,
SOFTMAX,
RUN_EAGERLY,
)
logger = logging.getLogger(__name__)
SPARSE = "sparse"
DENSE = "dense"
LABEL_KEY = LABEL
LABEL_SUB_KEY = IDS
POSSIBLE_TAGS = [ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_ROLE, ENTITY_ATTRIBUTE_GROUP]
DIETClassifierT = TypeVar("DIETClassifierT", bound="DIETClassifier")
@DefaultV1Recipe.register(
[
DefaultV1Recipe.ComponentType.INTENT_CLASSIFIER,
DefaultV1Recipe.ComponentType.ENTITY_EXTRACTOR,
],
is_trainable=True,
)
class DIETClassifier(GraphComponent, IntentClassifier, EntityExtractorMixin):
"""A multi-task model for intent classification and entity extraction.
DIET is Dual Intent and Entity Transformer.
The architecture is based on a transformer which is shared for both tasks.
A sequence of entity labels is predicted through a Conditional Random Field (CRF)
tagging layer on top of the transformer output sequence corresponding to the
input sequence of tokens. The transformer output for the ``__CLS__`` token and
intent labels are embedded into a single semantic vector space. We use the
dot-product loss to maximize the similarity with the target label and minimize
similarities with negative samples.
"""
@classmethod
def required_components(cls) -> List[Type]:
"""Components that should be included in the pipeline before this component."""
return [Featurizer]
@staticmethod
def get_default_config() -> Dict[Text, Any]:
"""The component's default config (see parent class for full docstring)."""
# please make sure to update the docs when changing a default parameter
return {
# ## Architecture of the used neural network
# Hidden layer sizes for layers before the embedding layers for user message
# and labels.
# The number of hidden layers is equal to the length of the corresponding
# list.
HIDDEN_LAYERS_SIZES: {TEXT: [], LABEL: []},
# Whether to share the hidden layer weights between user message and labels.
SHARE_HIDDEN_LAYERS: False,
# Number of units in transformer
TRANSFORMER_SIZE: DEFAULT_TRANSFORMER_SIZE,
# Number of transformer layers
NUM_TRANSFORMER_LAYERS: 2,
# Number of attention heads in transformer
NUM_HEADS: 4,
# If 'True' use key relative embeddings in attention
KEY_RELATIVE_ATTENTION: False,
# If 'True' use value relative embeddings in attention
VALUE_RELATIVE_ATTENTION: False,
# Max position for relative embeddings. Only in effect if key- or value
# relative attention are turned on
MAX_RELATIVE_POSITION: 5,
# Use a unidirectional or bidirectional encoder.
UNIDIRECTIONAL_ENCODER: False,
# ## Training parameters
# Initial and final batch sizes:
# Batch size will be linearly increased for each epoch.
BATCH_SIZES: [64, 256],
# Strategy used when creating batches.
# Can be either 'sequence' or 'balanced'.
BATCH_STRATEGY: BALANCED,
# Number of epochs to train
EPOCHS: 300,
# Set random seed to any 'int' to get reproducible results
RANDOM_SEED: None,
# Initial learning rate for the optimizer
LEARNING_RATE: 0.001,
# ## Parameters for embeddings
# Dimension size of embedding vectors
EMBEDDING_DIMENSION: 20,
# Dense dimension to use for sparse features.
DENSE_DIMENSION: {TEXT: 128, LABEL: 20},
# Default dimension to use for concatenating sequence and sentence features.
CONCAT_DIMENSION: {TEXT: 128, LABEL: 20},
# The number of incorrect labels. The algorithm will minimize
# their similarity to the user input during training.
NUM_NEG: 20,
# Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
SIMILARITY_TYPE: AUTO,
# The type of the loss function, either 'cross_entropy' or 'margin'.
LOSS_TYPE: CROSS_ENTROPY,
# Number of top intents for which confidences should be reported.
# Set to 0 if confidences for all intents should be reported.
RANKING_LENGTH: LABEL_RANKING_LENGTH,
# Indicates how similar the algorithm should try to make embedding vectors
# for correct labels.
# Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
MAX_POS_SIM: 0.8,
# Maximum negative similarity for incorrect labels.
# Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
MAX_NEG_SIM: -0.4,
# If 'True' the algorithm only minimizes maximum similarity over
# incorrect intent labels, used only if 'loss_type' is set to 'margin'.
USE_MAX_NEG_SIM: True,
# If 'True' scale loss inverse proportionally to the confidence
# of the correct prediction
SCALE_LOSS: False,
# ## Regularization parameters
# The scale of regularization
REGULARIZATION_CONSTANT: 0.002,
# The scale of how important it is to minimize the maximum similarity
# between embeddings of different labels,
# used only if 'loss_type' is set to 'margin'.
NEGATIVE_MARGIN_SCALE: 0.8,
# Dropout rate for encoder
DROP_RATE: 0.2,
# Dropout rate for attention
DROP_RATE_ATTENTION: 0,
# Fraction of trainable weights in internal layers.
CONNECTION_DENSITY: 0.2,
# If 'True' apply dropout to sparse input tensors
SPARSE_INPUT_DROPOUT: True,
# If 'True' apply dropout to dense input tensors
DENSE_INPUT_DROPOUT: True,
# ## Evaluation parameters
# How often to calculate validation accuracy.
# Small values may hurt performance.
EVAL_NUM_EPOCHS: 20,
# How many examples to use for hold out validation set
# Large values may hurt performance, e.g. model accuracy.
# Set to 0 for no validation.
EVAL_NUM_EXAMPLES: 0,
# ## Model config
# If 'True' intent classification is trained and intent predicted.
INTENT_CLASSIFICATION: True,
# If 'True' named entity recognition is trained and entities predicted.
ENTITY_RECOGNITION: True,
# If 'True' random tokens of the input message will be masked and the model
# should predict those tokens.
MASKED_LM: False,
# 'BILOU_flag' determines whether to use BILOU tagging or not.
# If set to 'True', labelling is more rigorous; however, more
# examples per entity are required.
# Rule of thumb: you should have more than 100 examples per entity.
BILOU_FLAG: True,
# If you want to use tensorboard to visualize training and validation
# metrics, set this option to a valid output directory.
TENSORBOARD_LOG_DIR: None,
# Define when training metrics for tensorboard should be logged.
# Either after every epoch or for every training step.
# Valid values: 'epoch' and 'batch'
TENSORBOARD_LOG_LEVEL: "epoch",
# Perform model checkpointing
CHECKPOINT_MODEL: False,
# Specify what features to use as sequence and sentence features
# By default all features in the pipeline are used.
FEATURIZERS: [],
# Split entities by comma; this makes sense e.g. for a list of ingredients
# in a recipe, but it doesn't make sense for the parts of an address
SPLIT_ENTITIES_BY_COMMA: True,
# If 'True' applies sigmoid on all similarity terms and adds
# it to the loss function to ensure that similarity values are
# approximately bounded. Used inside cross-entropy loss only.
CONSTRAIN_SIMILARITIES: False,
# Model confidence to be returned during inference. Currently, the only
# possible value is `softmax`.
MODEL_CONFIDENCE: SOFTMAX,
# Determines whether the confidences of the chosen top intents should be
# renormalized so that they sum up to 1. By default, we do not renormalize
# and return the confidences for the top intents as is.
# Note that renormalization only makes sense if confidences are generated
# via `softmax`.
RENORMALIZE_CONFIDENCES: False,
# Determines whether to skip constructing the static model graph and run
# the model eagerly instead. This is advantageous when the model is only
# trained or used for inference for a few steps, as compiling the graph
# tends to take more time than running it. Otherwise it is recommended
# not to adjust this parameter.
RUN_EAGERLY: False,
}
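# Hedged sketch (added for illustration): the default recipe merges any
# user-supplied options on top of these defaults before `create()` is called,
# so overriding a single key behaves roughly like the dict merge below.
# The exact merge mechanics live in the recipe/graph code, not in this file.
#
#     config = {**DIETClassifier.get_default_config(), EPOCHS: 100}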
def __init__(
self,
config: Dict[Text, Any],
model_storage: ModelStorage,
resource: Resource,
execution_context: ExecutionContext,
index_label_id_mapping: Optional[Dict[int, Text]] = None,
entity_tag_specs: Optional[List[EntityTagSpec]] = None,
model: Optional[RasaModel] = None,
sparse_feature_sizes: Optional[Dict[Text, Dict[Text, List[int]]]] = None,
) -> None:
"""Declare instance variables with default values."""
if EPOCHS not in config:
rasa.shared.utils.io.raise_warning(
f"Please configure the number of '{EPOCHS}' in your configuration file."
f" We will change the default value of '{EPOCHS}' in the future to 1. "
)
self.component_config = config
self._model_storage = model_storage
self._resource = resource
self._execution_context = execution_context
self._check_config_parameters()
# transform numbers to labels
self.index_label_id_mapping = index_label_id_mapping or {}
self._entity_tag_specs = entity_tag_specs
self.model = model
self.tmp_checkpoint_dir = None
if self.component_config[CHECKPOINT_MODEL]:
self.tmp_checkpoint_dir = Path(rasa.utils.io.create_temporary_directory())
self._label_data: Optional[RasaModelData] = None
self._data_example: Optional[Dict[Text, Dict[Text, List[FeatureArray]]]] = None
self.split_entities_config = rasa.utils.train_utils.init_split_entities(
self.component_config[SPLIT_ENTITIES_BY_COMMA],
SPLIT_ENTITIES_BY_COMMA_DEFAULT_VALUE,
)
self.finetune_mode = self._execution_context.is_finetuning
self._sparse_feature_sizes = sparse_feature_sizes
# init helpers
def _check_masked_lm(self) -> None:
if (
self.component_config[MASKED_LM]
and self.component_config[NUM_TRANSFORMER_LAYERS] == 0
):
raise ValueError(
f"If number of transformer layers is 0, "
f"'{MASKED_LM}' option should be 'False'."
)
def _check_share_hidden_layers_sizes(self) -> None:
if self.component_config.get(SHARE_HIDDEN_LAYERS):
first_hidden_layer_sizes = next(
iter(self.component_config[HIDDEN_LAYERS_SIZES].values())
)
# check that all hidden layer sizes are the same
identical_hidden_layer_sizes = all(
current_hidden_layer_sizes == first_hidden_layer_sizes
for current_hidden_layer_sizes in self.component_config[
HIDDEN_LAYERS_SIZES
].values()
)
if not identical_hidden_layer_sizes:
raise ValueError(
f"If hidden layer weights are shared, "
f"{HIDDEN_LAYERS_SIZES} must coincide."
)
def _check_config_parameters(self) -> None:
self.component_config = train_utils.check_deprecated_options(
self.component_config
)
self._check_masked_lm()
self._check_share_hidden_layers_sizes()
self.component_config = train_utils.update_confidence_type(
self.component_config
)
train_utils.validate_configuration_settings(self.component_config)
self.component_config = train_utils.update_similarity_type(
self.component_config
)
self.component_config = train_utils.update_evaluation_parameters(
self.component_config
)
@classmethod
def create(
cls,
config: Dict[Text, Any],
model_storage: ModelStorage,
resource: Resource,
execution_context: ExecutionContext,
) -> DIETClassifier:
"""Creates a new untrained component (see parent class for full docstring)."""
return cls(config, model_storage, resource, execution_context)
@property
def label_key(self) -> Optional[Text]:
"""Return key if intent classification is activated."""
return LABEL_KEY if self.component_config[INTENT_CLASSIFICATION] else None
@property
def label_sub_key(self) -> Optional[Text]:
"""Return sub key if intent classification is activated."""
return LABEL_SUB_KEY if self.component_config[INTENT_CLASSIFICATION] else None
@staticmethod
def model_class() -> Type[RasaModel]:
return DIET
# training data helpers:
@staticmethod
def _label_id_index_mapping(
training_data: TrainingData, attribute: Text
) -> Dict[Text, int]:
"""Create label_id dictionary."""
distinct_label_ids = {
example.get(attribute) for example in training_data.intent_examples
} - {None}
return {
label_id: idx for idx, label_id in enumerate(sorted(distinct_label_ids))
}
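# Worked example (illustrative, assuming two intents in the training data):
# for distinct intent labels {"greet", "goodbye"}, sorting yields
# ["goodbye", "greet"], so the returned mapping is
#
#     {"goodbye": 0, "greet": 1}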
@staticmethod
def _invert_mapping(mapping: Dict) -> Dict:
return {value: key for key, value in mapping.items()}
def _create_entity_tag_specs(
self, training_data: TrainingData
) -> List[EntityTagSpec]:
"""Create entity tag specifications with their respective tag id mappings."""
_tag_specs = []
for tag_name in POSSIBLE_TAGS:
if self.component_config[BILOU_FLAG]:
tag_id_index_mapping = bilou_utils.build_tag_id_dict(
training_data, tag_name
)
else:
tag_id_index_mapping = self._tag_id_index_mapping_for(
tag_name, training_data
)
if tag_id_index_mapping:
_tag_specs.append(
EntityTagSpec(
tag_name=tag_name,
tags_to_ids=tag_id_index_mapping,
ids_to_tags=self._invert_mapping(tag_id_index_mapping),
num_tags=len(tag_id_index_mapping),
)
)
return _tag_specs
@staticmethod
def _tag_id_index_mapping_for(
tag_name: Text, training_data: TrainingData
) -> Optional[Dict[Text, int]]:
"""Create mapping from tag name to id."""
if tag_name == ENTITY_ATTRIBUTE_ROLE:
distinct_tags = training_data.entity_roles
elif tag_name == ENTITY_ATTRIBUTE_GROUP:
distinct_tags = training_data.entity_groups
else:
distinct_tags = training_data.entities
distinct_tags = distinct_tags - {NO_ENTITY_TAG} - {None}
if not distinct_tags:
return None
tag_id_dict = {
tag_id: idx for idx, tag_id in enumerate(sorted(distinct_tags), 1)
}
# NO_ENTITY_TAG corresponds to a non-entity and must map to index 0,
# which is needed for correct prediction on padding positions
tag_id_dict[NO_ENTITY_TAG] = 0
return tag_id_dict
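# Worked example (illustrative): if the training data contains entity types
# {"city", "name"}, the mapping produced above is
#
#     {"O": 0, "city": 1, "name": 2}
#
# assuming NO_ENTITY_TAG is "O"; non-entity tokens and padding both map to
# index 0 by construction.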
@staticmethod
def _find_example_for_label(
label: Text, examples: List[Message], attribute: Text
) -> Optional[Message]:
for ex in examples:
if ex.get(attribute) == label:
return ex
return None
def _check_labels_features_exist(
self, labels_example: List[Message], attribute: Text
) -> bool:
"""Checks if all labels have features set."""
return all(
label_example.features_present(
attribute, self.component_config[FEATURIZERS]
)
for label_example in labels_example
)
def _extract_features(
self, message: Message, attribute: Text
) -> Dict[Text, Union[scipy.sparse.spmatrix, np.ndarray]]:
(
sparse_sequence_features,
sparse_sentence_features,
) = message.get_sparse_features(attribute, self.component_config[FEATURIZERS])
dense_sequence_features, dense_sentence_features = message.get_dense_features(
attribute, self.component_config[FEATURIZERS]
)
if dense_sequence_features is not None and sparse_sequence_features is not None:
if (
dense_sequence_features.features.shape[0]
!= sparse_sequence_features.features.shape[0]
):
raise ValueError(
f"Sequence dimensions for sparse and dense sequence features "
f"don't coincide in '{message.get(TEXT)}'"
f"for attribute '{attribute}'."
)
if dense_sentence_features is not None and sparse_sentence_features is not None:
if (
dense_sentence_features.features.shape[0]
!= sparse_sentence_features.features.shape[0]
):
raise ValueError(
f"Sequence dimensions for sparse and dense sentence features "
f"don't coincide in '{message.get(TEXT)}'"
f"for attribute '{attribute}'."
)
# If we don't use the transformer and don't do entity recognition, take only
# the sentence features as the feature vector to speed up training.
# We would not make use of the sequence features anyway in this setup, and
# carrying them over to the actual training process takes quite some time.
if (
self.component_config[NUM_TRANSFORMER_LAYERS] == 0
and not self.component_config[ENTITY_RECOGNITION]
and attribute not in [INTENT, INTENT_RESPONSE_KEY]
):
sparse_sequence_features = None
dense_sequence_features = None
out = {}
if sparse_sentence_features is not None:
out[f"{SPARSE}_{SENTENCE}"] = sparse_sentence_features.features
if sparse_sequence_features is not None:
out[f"{SPARSE}_{SEQUENCE}"] = sparse_sequence_features.features
if dense_sentence_features is not None:
out[f"{DENSE}_{SENTENCE}"] = dense_sentence_features.features
if dense_sequence_features is not None:
out[f"{DENSE}_{SEQUENCE}"] = dense_sequence_features.features
return out
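# Hedged sketch of the returned dict: keys are built from the SPARSE/DENSE and
# SEQUENCE/SENTENCE constants, so with all four feature kinds present the
# result looks roughly like
#
#     {
#         "sparse_sentence": <scipy.sparse matrix>,
#         "sparse_sequence": <scipy.sparse matrix>,
#         "dense_sentence": <np.ndarray>,
#         "dense_sequence": <np.ndarray>,
#     }
#
# (assuming SEQUENCE == "sequence" and SENTENCE == "sentence").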
def _check_input_dimension_consistency(self, model_data: RasaModelData) -> None:
"""Checks if features have same dimensionality if hidden layers are shared."""
if self.component_config.get(SHARE_HIDDEN_LAYERS):
num_text_sentence_features = model_data.number_of_units(TEXT, SENTENCE)
num_label_sentence_features = model_data.number_of_units(LABEL, SENTENCE)
num_text_sequence_features = model_data.number_of_units(TEXT, SEQUENCE)
num_label_sequence_features = model_data.number_of_units(LABEL, SEQUENCE)
if (0 < num_text_sentence_features != num_label_sentence_features > 0) or (
0 < num_text_sequence_features != num_label_sequence_features > 0
):
raise ValueError(
"If embeddings are shared text features and label features "
"must coincide. Check the output dimensions of previous components."
)
def _extract_labels_precomputed_features(
self, label_examples: List[Message], attribute: Text = INTENT
) -> Tuple[List[FeatureArray], List[FeatureArray]]:
"""Collects precomputed encodings."""
features = defaultdict(list)
for e in label_examples:
label_features = self._extract_features(e, attribute)
for feature_key, feature_value in label_features.items():
features[feature_key].append(feature_value)
sequence_features = []
sentence_features = []
for feature_name, feature_value in features.items():
if SEQUENCE in feature_name:
sequence_features.append(
FeatureArray(np.array(feature_value), number_of_dimensions=3)
)
else:
sentence_features.append(
FeatureArray(np.array(feature_value), number_of_dimensions=3)
)
return sequence_features, sentence_features
@staticmethod
def _compute_default_label_features(
labels_example: List[Message],
) -> List[FeatureArray]:
"""Computes one-hot representation for the labels."""
logger.debug("No label features found. Computing default label features.")
eye_matrix = np.eye(len(labels_example), dtype=np.float32)
# add sequence dimension to one-hot labels
return [
FeatureArray(
np.array([np.expand_dims(a, 0) for a in eye_matrix]),
number_of_dimensions=3,
)
]
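# Worked example (illustrative): for three labels, `np.eye(3)` gives one-hot
# rows; each row is expanded to shape (1, 3) and the stacked result is a
# single FeatureArray of shape (3, 1, 3): label index, a sequence dimension
# of length 1, and the feature dimension.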
def _create_label_data(
self,
training_data: TrainingData,
label_id_dict: Dict[Text, int],
attribute: Text,
) -> RasaModelData:
"""Create matrix with label_ids encoded in rows as bag of words.
Find a training example for each label and get the encoded features
from the corresponding Message object.
If the features are already computed, fetch them from the message object
else compute a one hot encoding for the label as the feature vector.
"""
# Collect one example for each label
labels_idx_examples = []
for label_name, idx in label_id_dict.items():
label_example = self._find_example_for_label(
label_name, training_data.intent_examples, attribute
)
labels_idx_examples.append((idx, label_example))
# Sort the list of tuples based on label_idx
labels_idx_examples = sorted(labels_idx_examples, key=lambda x: x[0])
labels_example = [example for (_, example) in labels_idx_examples]
# Collect features, precomputed if they exist, else compute on the fly
if self._check_labels_features_exist(labels_example, attribute):
(
sequence_features,
sentence_features,
) = self._extract_labels_precomputed_features(labels_example, attribute)
else:
sequence_features = None
sentence_features = self._compute_default_label_features(labels_example)
label_data = RasaModelData()
label_data.add_features(LABEL, SEQUENCE, sequence_features)
label_data.add_features(LABEL, SENTENCE, sentence_features)
if label_data.does_feature_not_exist(
LABEL, SENTENCE
) and label_data.does_feature_not_exist(LABEL, SEQUENCE):
raise ValueError(
"No label features are present. Please check your configuration file."
)
label_ids = np.array([idx for (idx, _) in labels_idx_examples])
# explicitly add last dimension to label_ids
# to track correctly dynamic sequences
label_data.add_features(
LABEL_KEY,
LABEL_SUB_KEY,
[
FeatureArray(
np.expand_dims(label_ids, -1),
number_of_dimensions=2,
)
],
)
label_data.add_lengths(LABEL, SEQUENCE_LENGTH, LABEL, SEQUENCE)
return label_data
def _use_default_label_features(self, label_ids: np.ndarray) -> List[FeatureArray]:
if self._label_data is None:
return []
feature_arrays = self._label_data.get(LABEL, SENTENCE)
all_label_features = feature_arrays[0]
return [
FeatureArray(
np.array([all_label_features[label_id] for label_id in label_ids]),
number_of_dimensions=all_label_features.number_of_dimensions,
)
]
def _create_model_data(
self,
training_data: List[Message],
label_id_dict: Optional[Dict[Text, int]] = None,
label_attribute: Optional[Text] = None,
training: bool = True,
) -> RasaModelData:
"""Prepare data for training and create a RasaModelData object."""
from rasa.utils.tensorflow import model_data_utils
attributes_to_consider = [TEXT]
if training and self.component_config[INTENT_CLASSIFICATION]:
# we don't have any intent labels during prediction, just add them during
# training
attributes_to_consider.append(label_attribute)
if (
training
and self.component_config[ENTITY_RECOGNITION]
and self._entity_tag_specs
):
# Add entities as labels only during training, and only if entity-annotated
# training data was provided and DIET is configured to predict entities.
attributes_to_consider.append(ENTITIES)
if training and label_attribute is not None:
# only use those training examples that have the label_attribute set
# during training
training_data = [
example for example in training_data if label_attribute in example.data
]
training_data = [
message
for message in training_data
if message.features_present(
attribute=TEXT, featurizers=self.component_config.get(FEATURIZERS)
)
]
if not training_data:
# no training data is present to train on
return RasaModelData()
(
features_for_examples,
sparse_feature_sizes,
) = model_data_utils.featurize_training_examples(
training_data,
attributes_to_consider,
entity_tag_specs=self._entity_tag_specs,
featurizers=self.component_config[FEATURIZERS],
bilou_tagging=self.component_config[BILOU_FLAG],
)
attribute_data, _ = model_data_utils.convert_to_data_format(
features_for_examples, consider_dialogue_dimension=False
)
model_data = RasaModelData(
label_key=self.label_key, label_sub_key=self.label_sub_key
)
model_data.add_data(attribute_data)
model_data.add_lengths(TEXT, SEQUENCE_LENGTH, TEXT, SEQUENCE)
# Current implementation doesn't yet account for updating sparse
# feature sizes of label attributes. That's why we remove them.
sparse_feature_sizes = self._remove_label_sparse_feature_sizes(
sparse_feature_sizes=sparse_feature_sizes, label_attribute=label_attribute
)
model_data.add_sparse_feature_sizes(sparse_feature_sizes)
self._add_label_features(
model_data, training_data, label_attribute, label_id_dict, training
)
# make sure all keys are in the same order during training and prediction
# as we rely on the order of key and sub-key when constructing the actual
# tensors from the model data
model_data.sort()
return model_data
@staticmethod
def _remove_label_sparse_feature_sizes(
sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
label_attribute: Optional[Text] = None,
) -> Dict[Text, Dict[Text, List[int]]]:
if label_attribute in sparse_feature_sizes:
del sparse_feature_sizes[label_attribute]
return sparse_feature_sizes
def _add_label_features(
self,
model_data: RasaModelData,
training_data: List[Message],
label_attribute: Text,
label_id_dict: Dict[Text, int],
training: bool = True,
) -> None:
label_ids = []
if training and self.component_config[INTENT_CLASSIFICATION]:
for example in training_data:
if example.get(label_attribute):
label_ids.append(label_id_dict[example.get(label_attribute)])
# explicitly add last dimension to label_ids
# to track correctly dynamic sequences
model_data.add_features(
LABEL_KEY,
LABEL_SUB_KEY,
[
FeatureArray(
np.expand_dims(label_ids, -1),
number_of_dimensions=2,
)
],
)
if (
label_attribute
and model_data.does_feature_not_exist(label_attribute, SENTENCE)
and model_data.does_feature_not_exist(label_attribute, SEQUENCE)
):
# no label features are present, get default features from _label_data
model_data.add_features(
LABEL, SENTENCE, self._use_default_label_features(np.array(label_ids))
)
# as label_attribute can have different values, e.g. INTENT or RESPONSE,
# copy over the features to the LABEL key to make
# it easier to access the label features inside the model itself
model_data.update_key(label_attribute, SENTENCE, LABEL, SENTENCE)
model_data.update_key(label_attribute, SEQUENCE, LABEL, SEQUENCE)
model_data.update_key(label_attribute, MASK, LABEL, MASK)
model_data.add_lengths(LABEL, SEQUENCE_LENGTH, LABEL, SEQUENCE)
# train helpers
def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
"""Prepares data for training.
Performs sanity checks on training data, extracts encodings for labels.
"""
if (
self.component_config[BILOU_FLAG]
and self.component_config[ENTITY_RECOGNITION]
):
bilou_utils.apply_bilou_schema(training_data)
label_id_index_mapping = self._label_id_index_mapping(
training_data, attribute=INTENT
)
if not label_id_index_mapping:
# no labels are present to train
return RasaModelData()
self.index_label_id_mapping = self._invert_mapping(label_id_index_mapping)
self._label_data = self._create_label_data(
training_data, label_id_index_mapping, attribute=INTENT
)
self._entity_tag_specs = self._create_entity_tag_specs(training_data)
label_attribute = (
INTENT if self.component_config[INTENT_CLASSIFICATION] else None
)
model_data = self._create_model_data(
training_data.nlu_examples,
label_id_index_mapping,
label_attribute=label_attribute,
)
self._check_input_dimension_consistency(model_data)
return model_data
@staticmethod
def _check_enough_labels(model_data: RasaModelData) -> bool:
return len(np.unique(model_data.get(LABEL_KEY, LABEL_SUB_KEY))) >= 2
def train(self, training_data: TrainingData) -> Resource:
"""Train the embedding intent classifier on a data set."""
model_data = self.preprocess_train_data(training_data)
if model_data.is_empty():
logger.debug(
f"Cannot train '{self.__class__.__name__}'. No data was provided. "
f"Skipping training of the classifier."
)
return self._resource
if not self.model and self.finetune_mode:
raise rasa.shared.exceptions.InvalidParameterException(
f"{self.__class__.__name__} was instantiated "
f"with `model=None` and `finetune_mode=True`. "
f"This is not a valid combination as the component "
f"needs an already instantiated and trained model "
f"to continue training in finetune mode."
)
if self.component_config.get(INTENT_CLASSIFICATION):
if not self._check_enough_labels(model_data):
logger.error(
f"Cannot train '{self.__class__.__name__}'. "
f"Need at least 2 different intent classes. "
f"Skipping training of classifier."
)
return self._resource
if self.component_config.get(ENTITY_RECOGNITION):
self.check_correct_entity_annotations(training_data)
# keep one example for persisting and loading
self._data_example = model_data.first_data_example()
if not self.finetune_mode:
# No pre-trained model to load from. Create a new instance of the model.
self.model = self._instantiate_model_class(model_data)
self.model.compile(
optimizer=tf.keras.optimizers.Adam(
self.component_config[LEARNING_RATE]
),
run_eagerly=self.component_config[RUN_EAGERLY],
)
else:
if self.model is None:
raise ModelNotFound("Model could not be found. ")
self.model.adjust_for_incremental_training(
data_example=self._data_example,
new_sparse_feature_sizes=model_data.get_sparse_feature_sizes(),
old_sparse_feature_sizes=self._sparse_feature_sizes,
)
self._sparse_feature_sizes = model_data.get_sparse_feature_sizes()
data_generator, validation_data_generator = train_utils.create_data_generators(
model_data,
self.component_config[BATCH_SIZES],
self.component_config[EPOCHS],
self.component_config[BATCH_STRATEGY],
self.component_config[EVAL_NUM_EXAMPLES],
self.component_config[RANDOM_SEED],
)
callbacks = train_utils.create_common_callbacks(
self.component_config[EPOCHS],
self.component_config[TENSORBOARD_LOG_DIR],
self.component_config[TENSORBOARD_LOG_LEVEL],
self.tmp_checkpoint_dir,
)
self.model.fit(
data_generator,
epochs=self.component_config[EPOCHS],
validation_data=validation_data_generator,
validation_freq=self.component_config[EVAL_NUM_EPOCHS],
callbacks=callbacks,
verbose=False,
shuffle=False, # we use custom shuffle inside data generator
)
self.persist()
return self._resource
# process helpers
def _predict(
self, message: Message
) -> Optional[Dict[Text, Union[tf.Tensor, Dict[Text, tf.Tensor]]]]:
if self.model is None:
logger.debug(
f"There is no trained model for '{self.__class__.__name__}': The "
f"component is either not trained or didn't receive enough training "
f"data."
)
return None
# create session data from message and convert it into a batch of 1
model_data = self._create_model_data([message], training=False)
if model_data.is_empty():
return None
return self.model.run_inference(model_data)
def _predict_label(
self, predict_out: Optional[Dict[Text, tf.Tensor]]
) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]:
"""Predicts the intent of the provided message."""
label: Dict[Text, Any] = {"name": None, "confidence": 0.0}
label_ranking: List[Dict[Text, Any]] = []
if predict_out is None:
return label, label_ranking
message_sim = predict_out["i_scores"]
message_sim = message_sim.flatten() # sim is a matrix
# if there are no confidence scores, do not predict a label
if message_sim.size == 0:
return label, label_ranking
# rank the confidences
ranking_length = self.component_config[RANKING_LENGTH]
renormalize = (
self.component_config[RENORMALIZE_CONFIDENCES]
and self.component_config[MODEL_CONFIDENCE] == SOFTMAX
)
ranked_label_indices, message_sim = train_utils.rank_and_mask(
message_sim, ranking_length=ranking_length, renormalize=renormalize
)