diff --git a/changelog/4899.improvement.rst b/changelog/4899.improvement.rst
new file mode 100644
index 000000000000..98477379aca5
--- /dev/null
+++ b/changelog/4899.improvement.rst
@@ -0,0 +1,4 @@
+The ``intent_report.json`` created by ``rasa test`` now contains an extra field
+``confused_with`` for each intent. This is a dictionary containing the names of
+the most common false positives when this intent should be predicted, and the
+number of such false positives.
\ No newline at end of file
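For illustration, this is roughly how a report entry reads once the field is added. The intent names, metric values, and the `results/` output path below are made-up examples, not taken from the PR:

```python
import json

# Hypothetical intent_report.json entry after this change; all names,
# numbers, and the "results/" path are illustrative only.
with open("results/intent_report.json") as f:
    report = json.load(f)

print(report["greet"])
# {"precision": 0.9, "recall": 0.86, "f1-score": 0.88, "support": 42,
#  "confused_with": {"goodbye": 3, "affirm": 2}}
```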
""" + import sklearn.metrics + import sklearn.utils.multiclass # remove empty intent targets num_examples = len(intent_results) @@ -431,10 +469,14 @@ def evaluate_intents( intent_results, "intent_target", "intent_prediction" ) + cnf_matrix = sklearn.metrics.confusion_matrix(target_intents, predicted_intents) + labels = sklearn.utils.multiclass.unique_labels(target_intents, predicted_intents) + if output_directory: report, precision, f1, accuracy = get_evaluation_metrics( target_intents, predicted_intents, output_dict=True ) + report = _add_confused_intents_to_report(report, cnf_matrix, labels) report_filename = os.path.join(output_directory, "intent_report.json") @@ -463,16 +505,12 @@ def evaluate_intents( collect_nlu_errors(intent_results, errors_filename) if confmat_filename: - from sklearn.metrics import confusion_matrix - from sklearn.utils.multiclass import unique_labels import matplotlib.pyplot as plt if output_directory: confmat_filename = os.path.join(output_directory, confmat_filename) intent_hist_filename = os.path.join(output_directory, intent_hist_filename) - cnf_matrix = confusion_matrix(target_intents, predicted_intents) - labels = unique_labels(target_intents, predicted_intents) plot_confusion_matrix( cnf_matrix, classes=labels, diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py index 259e47569cbe..5e564cf7e15e 100644 --- a/tests/nlu/base/test_evaluation.py +++ b/tests/nlu/base/test_evaluation.py @@ -1,7 +1,10 @@ +from typing import Text + import asyncio import logging import pytest +from _pytest.tmpdir import TempdirFactory import rasa.utils.io from rasa.test import compare_nlu_models @@ -306,7 +309,13 @@ def test_intent_evaluation_report(tmpdir_factory): report = json.loads(rasa.utils.io.read_file(report_filename)) - greet_results = {"precision": 1.0, "recall": 1.0, "f1-score": 1.0, "support": 1} + greet_results = { + "precision": 1.0, + "recall": 1.0, + "f1-score": 1.0, + "support": 1, + "confused_with": {}, + } prediction = { "text": "hello", @@ -320,6 +329,65 @@ def test_intent_evaluation_report(tmpdir_factory): assert result["predictions"][0] == prediction +def test_intent_evaluation_report_large(tmpdir_factory: TempdirFactory): + path = tmpdir_factory.mktemp("evaluation") + report_folder = path / "reports" + report_filename = report_folder / "intent_report.json" + + rasa.utils.io.create_directory(str(report_folder)) + + def correct(label: Text) -> IntentEvaluationResult: + return IntentEvaluationResult(label, label, "", 1.0) + + def incorrect(label: Text, _label: Text) -> IntentEvaluationResult: + return IntentEvaluationResult(label, _label, "", 1.0) + + a_results = [correct("A")] * 10 + b_results = [correct("B")] * 7 + [incorrect("B", "C")] * 3 + c_results = [correct("C")] * 3 + [incorrect("C", "D")] + [incorrect("C", "E")] + d_results = [correct("D")] * 29 + [incorrect("D", "B")] * 3 + e_results = [incorrect("E", "C")] * 5 + [incorrect("E", "")] * 5 + + intent_results = a_results + b_results + c_results + d_results + e_results + + evaluate_intents( + intent_results, + report_folder, + successes=False, + errors=False, + confmat_filename=None, + intent_hist_filename=None, + ) + + report = json.loads(rasa.utils.io.read_file(str(report_filename))) + + a_results = { + "precision": 1.0, + "recall": 1.0, + "f1-score": 1.0, + "support": 10, + "confused_with": {}, + } + + e_results = { + "precision": 0.0, + "recall": 0.0, + "f1-score": 0.0, + "support": 10, + "confused_with": {"C": 5, "": 5}, + } + + c_confused_with = { + "D": 1, + "E": 1, + } 
diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py
index 259e47569cbe..5e564cf7e15e 100644
--- a/tests/nlu/base/test_evaluation.py
+++ b/tests/nlu/base/test_evaluation.py
@@ -1,7 +1,10 @@
+from typing import Text
+
 import asyncio
 import logging

 import pytest
+from _pytest.tmpdir import TempdirFactory

 import rasa.utils.io
 from rasa.test import compare_nlu_models
@@ -306,7 +309,13 @@ def test_intent_evaluation_report(tmpdir_factory):

     report = json.loads(rasa.utils.io.read_file(report_filename))

-    greet_results = {"precision": 1.0, "recall": 1.0, "f1-score": 1.0, "support": 1}
+    greet_results = {
+        "precision": 1.0,
+        "recall": 1.0,
+        "f1-score": 1.0,
+        "support": 1,
+        "confused_with": {},
+    }

     prediction = {
         "text": "hello",
@@ -320,6 +329,65 @@ def test_intent_evaluation_report(tmpdir_factory):
     assert result["predictions"][0] == prediction


+def test_intent_evaluation_report_large(tmpdir_factory: TempdirFactory):
+    path = tmpdir_factory.mktemp("evaluation")
+    report_folder = path / "reports"
+    report_filename = report_folder / "intent_report.json"
+
+    rasa.utils.io.create_directory(str(report_folder))
+
+    def correct(label: Text) -> IntentEvaluationResult:
+        return IntentEvaluationResult(label, label, "", 1.0)
+
+    def incorrect(label: Text, _label: Text) -> IntentEvaluationResult:
+        return IntentEvaluationResult(label, _label, "", 1.0)
+
+    a_results = [correct("A")] * 10
+    b_results = [correct("B")] * 7 + [incorrect("B", "C")] * 3
+    c_results = [correct("C")] * 3 + [incorrect("C", "D")] + [incorrect("C", "E")]
+    d_results = [correct("D")] * 29 + [incorrect("D", "B")] * 3
+    e_results = [incorrect("E", "C")] * 5 + [incorrect("E", "")] * 5
+
+    intent_results = a_results + b_results + c_results + d_results + e_results
+
+    evaluate_intents(
+        intent_results,
+        report_folder,
+        successes=False,
+        errors=False,
+        confmat_filename=None,
+        intent_hist_filename=None,
+    )
+
+    report = json.loads(rasa.utils.io.read_file(str(report_filename)))
+
+    a_results = {
+        "precision": 1.0,
+        "recall": 1.0,
+        "f1-score": 1.0,
+        "support": 10,
+        "confused_with": {},
+    }
+
+    e_results = {
+        "precision": 0.0,
+        "recall": 0.0,
+        "f1-score": 0.0,
+        "support": 10,
+        "confused_with": {"C": 5, "": 5},
+    }
+
+    c_confused_with = {
+        "D": 1,
+        "E": 1,
+    }
+
+    assert len(report.keys()) == 8
+    assert report["A"] == a_results
+    assert report["E"] == e_results
+    assert report["C"]["confused_with"] == c_confused_with
+
+
 def test_response_evaluation_report(tmpdir_factory):
     path = tmpdir_factory.mktemp("evaluation").strpath
     report_folder = os.path.join(path, "reports")
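As a quick sanity check of the fixture above (not part of the PR), the expected `confused_with` values can be re-derived directly from the (target, prediction) pairs with a plain `Counter`:

```python
from collections import Counter

# Re-derive the expected "confused_with" dicts from the test fixture:
# count each intent's predictions and drop the correct ones.
pairs = (
    [("A", "A")] * 10
    + [("B", "B")] * 7 + [("B", "C")] * 3
    + [("C", "C")] * 3 + [("C", "D")] + [("C", "E")]
    + [("D", "D")] * 29 + [("D", "B")] * 3
    + [("E", "C")] * 5 + [("E", "")] * 5
)

for intent in ["A", "B", "C", "D", "E"]:
    confused = Counter(
        predicted
        for target, predicted in pairs
        if target == intent and predicted != intent
    )
    print(intent, dict(confused))
# A {}
# B {'C': 3}
# C {'D': 1, 'E': 1}
# D {'B': 3}
# E {'C': 5, '': 5}
```

The output matches the assertions in `test_intent_evaluation_report_large`, including the empty-string prediction for "E", which the report keeps as a regular false-positive label.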