diff --git a/changelog/4899.improvement.rst b/changelog/4899.improvement.rst
new file mode 100644
index 000000000000..98477379aca5
--- /dev/null
+++ b/changelog/4899.improvement.rst
@@ -0,0 +1,4 @@
+The ``intent_report.json`` created by ``rasa test`` now contains an extra field
+``confused_with`` for each intent. This is a dictionary containing the names of
+the most common false positives when this intent should be predicted, and the
+number of such false positives.
\ No newline at end of file
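For illustration, this is roughly how a report entry reads once the field is added. The intent names, metric values, and the `results/` output path below are made-up examples, not taken from the PR:

```python
import json

# Hypothetical intent_report.json entry after this change; all names,
# numbers, and the "results/" path are illustrative only.
with open("results/intent_report.json") as f:
    report = json.load(f)

print(report["greet"])
# {"precision": 0.9, "recall": 0.86, "f1-score": 0.88, "support": 42,
#  "confused_with": {"goodbye": 3, "affirm": 2}}
```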
""" + import sklearn.metrics + import sklearn.utils.multiclass # remove empty intent targets num_examples = len(intent_results) @@ -431,10 +469,14 @@ def evaluate_intents( intent_results, "intent_target", "intent_prediction" ) + cnf_matrix = sklearn.metrics.confusion_matrix(target_intents, predicted_intents) + labels = sklearn.utils.multiclass.unique_labels(target_intents, predicted_intents) + if output_directory: report, precision, f1, accuracy = get_evaluation_metrics( target_intents, predicted_intents, output_dict=True ) + report = _add_confused_intents_to_report(report, cnf_matrix, labels) report_filename = os.path.join(output_directory, "intent_report.json") @@ -463,16 +505,12 @@ def evaluate_intents( collect_nlu_errors(intent_results, errors_filename) if confmat_filename: - from sklearn.metrics import confusion_matrix - from sklearn.utils.multiclass import unique_labels import matplotlib.pyplot as plt if output_directory: confmat_filename = os.path.join(output_directory, confmat_filename) intent_hist_filename = os.path.join(output_directory, intent_hist_filename) - cnf_matrix = confusion_matrix(target_intents, predicted_intents) - labels = unique_labels(target_intents, predicted_intents) plot_confusion_matrix( cnf_matrix, classes=labels, diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py index 259e47569cbe..5e564cf7e15e 100644 --- a/tests/nlu/base/test_evaluation.py +++ b/tests/nlu/base/test_evaluation.py @@ -1,7 +1,10 @@ +from typing import Text + import asyncio import logging import pytest +from _pytest.tmpdir import TempdirFactory import rasa.utils.io from rasa.test import compare_nlu_models @@ -306,7 +309,13 @@ def test_intent_evaluation_report(tmpdir_factory): report = json.loads(rasa.utils.io.read_file(report_filename)) - greet_results = {"precision": 1.0, "recall": 1.0, "f1-score": 1.0, "support": 1} + greet_results = { + "precision": 1.0, + "recall": 1.0, + "f1-score": 1.0, + "support": 1, + "confused_with": {}, + } prediction = { "text": "hello", @@ -320,6 +329,65 @@ def test_intent_evaluation_report(tmpdir_factory): assert result["predictions"][0] == prediction +def test_intent_evaluation_report_large(tmpdir_factory: TempdirFactory): + path = tmpdir_factory.mktemp("evaluation") + report_folder = path / "reports" + report_filename = report_folder / "intent_report.json" + + rasa.utils.io.create_directory(str(report_folder)) + + def correct(label: Text) -> IntentEvaluationResult: + return IntentEvaluationResult(label, label, "", 1.0) + + def incorrect(label: Text, _label: Text) -> IntentEvaluationResult: + return IntentEvaluationResult(label, _label, "", 1.0) + + a_results = [correct("A")] * 10 + b_results = [correct("B")] * 7 + [incorrect("B", "C")] * 3 + c_results = [correct("C")] * 3 + [incorrect("C", "D")] + [incorrect("C", "E")] + d_results = [correct("D")] * 29 + [incorrect("D", "B")] * 3 + e_results = [incorrect("E", "C")] * 5 + [incorrect("E", "")] * 5 + + intent_results = a_results + b_results + c_results + d_results + e_results + + evaluate_intents( + intent_results, + report_folder, + successes=False, + errors=False, + confmat_filename=None, + intent_hist_filename=None, + ) + + report = json.loads(rasa.utils.io.read_file(str(report_filename))) + + a_results = { + "precision": 1.0, + "recall": 1.0, + "f1-score": 1.0, + "support": 10, + "confused_with": {}, + } + + e_results = { + "precision": 0.0, + "recall": 0.0, + "f1-score": 0.0, + "support": 10, + "confused_with": {"C": 5, "": 5}, + } + + c_confused_with = { + "D": 1, + "E": 1, + } 
diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py
index 259e47569cbe..5e564cf7e15e 100644
--- a/tests/nlu/base/test_evaluation.py
+++ b/tests/nlu/base/test_evaluation.py
@@ -1,7 +1,10 @@
+from typing import Text
+
 import asyncio
 import logging

 import pytest
+from _pytest.tmpdir import TempdirFactory

 import rasa.utils.io
 from rasa.test import compare_nlu_models
@@ -306,7 +309,13 @@ def test_intent_evaluation_report(tmpdir_factory):

     report = json.loads(rasa.utils.io.read_file(report_filename))

-    greet_results = {"precision": 1.0, "recall": 1.0, "f1-score": 1.0, "support": 1}
+    greet_results = {
+        "precision": 1.0,
+        "recall": 1.0,
+        "f1-score": 1.0,
+        "support": 1,
+        "confused_with": {},
+    }

     prediction = {
         "text": "hello",
@@ -320,6 +329,65 @@ def test_intent_evaluation_report(tmpdir_factory):
     assert result["predictions"][0] == prediction


+def test_intent_evaluation_report_large(tmpdir_factory: TempdirFactory):
+    path = tmpdir_factory.mktemp("evaluation")
+    report_folder = path / "reports"
+    report_filename = report_folder / "intent_report.json"
+
+    rasa.utils.io.create_directory(str(report_folder))
+
+    def correct(label: Text) -> IntentEvaluationResult:
+        return IntentEvaluationResult(label, label, "", 1.0)
+
+    def incorrect(label: Text, _label: Text) -> IntentEvaluationResult:
+        return IntentEvaluationResult(label, _label, "", 1.0)
+
+    a_results = [correct("A")] * 10
+    b_results = [correct("B")] * 7 + [incorrect("B", "C")] * 3
+    c_results = [correct("C")] * 3 + [incorrect("C", "D")] + [incorrect("C", "E")]
+    d_results = [correct("D")] * 29 + [incorrect("D", "B")] * 3
+    e_results = [incorrect("E", "C")] * 5 + [incorrect("E", "")] * 5
+
+    intent_results = a_results + b_results + c_results + d_results + e_results
+
+    evaluate_intents(
+        intent_results,
+        report_folder,
+        successes=False,
+        errors=False,
+        confmat_filename=None,
+        intent_hist_filename=None,
+    )
+
+    report = json.loads(rasa.utils.io.read_file(str(report_filename)))
+
+    a_results = {
+        "precision": 1.0,
+        "recall": 1.0,
+        "f1-score": 1.0,
+        "support": 10,
+        "confused_with": {},
+    }
+
+    e_results = {
+        "precision": 0.0,
+        "recall": 0.0,
+        "f1-score": 0.0,
+        "support": 10,
+        "confused_with": {"C": 5, "": 5},
+    }
+
+    c_confused_with = {
+        "D": 1,
+        "E": 1,
+    }
+
+    assert len(report.keys()) == 8
+    assert report["A"] == a_results
+    assert report["E"] == e_results
+    assert report["C"]["confused_with"] == c_confused_with
+
+
 def test_response_evaluation_report(tmpdir_factory):
     path = tmpdir_factory.mktemp("evaluation").strpath
     report_folder = os.path.join(path, "reports")
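As a quick sanity check of the fixture above (not part of the PR), the expected `confused_with` values can be re-derived directly from the (target, prediction) pairs with a plain `Counter`:

```python
from collections import Counter

# Re-derive the expected "confused_with" dicts from the test fixture:
# count each intent's predictions and drop the correct ones.
pairs = (
    [("A", "A")] * 10
    + [("B", "B")] * 7 + [("B", "C")] * 3
    + [("C", "C")] * 3 + [("C", "D")] + [("C", "E")]
    + [("D", "D")] * 29 + [("D", "B")] * 3
    + [("E", "C")] * 5 + [("E", "")] * 5
)

for intent in ["A", "B", "C", "D", "E"]:
    confused = Counter(
        predicted
        for target, predicted in pairs
        if target == intent and predicted != intent
    )
    print(intent, dict(confused))
# A {}
# B {'C': 3}
# C {'D': 1, 'E': 1}
# D {'B': 3}
# E {'C': 5, '': 5}
```

The output matches the assertions in `test_intent_evaluation_report_large`, including the empty-string prediction for "E", which the report keeps as a regular false-positive label.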