feat: yield tables (#159)

* adding yield tables compatible with tabulate package * yield tables integrated in visualize.data_MC * adding tabulate as core dependency * model_utils.calculate_stdev returns list instead of awkward array
scikit-hep · Dec 7, 2020 · 36035e7 · 36035e7
1 parent fc984e4
commit 36035e7
Show file tree

Hide file tree

Showing 8 changed files with 187 additions and 9 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -30,6 +30,7 @@ install_requires =
     click
     awkward1
     scipy
+    tabulate
 
 [options.packages.find]
 where = src

diff --git a/src/cabinetry/__init__.py b/src/cabinetry/__init__.py
@@ -1,6 +1,7 @@
 from . import configuration  # NOQA
 from . import fit  # NOQA
 from . import route  # NOQA
+from . import tabulate  # NOQA
 from . import template_builder  # NOQA
 from . import template_postprocessor  # NOQA
 from . import model_utils  # NOQA

diff --git a/src/cabinetry/model_utils.py b/src/cabinetry/model_utils.py
@@ -178,7 +178,7 @@ def calculate_stdev(
     parameters: np.ndarray,
     uncertainty: np.ndarray,
     corr_mat: np.ndarray,
-) -> ak.highlevel.Array:
+) -> List[List[float]]:
     """Calculates the symmetrized yield standard deviation of a model.
 
     Args:
@@ -189,7 +189,7 @@ def calculate_stdev(
         corr_mat (np.ndarray): correlation matrix
 
     Returns:
-        ak.highlevel.Array: array of channels, each channel is an array of standard
+        List[List[float]]: list of channels, each channel is a list of standard
         deviations per bin
     """
     # indices where to split to separate all bins into regions
@@ -260,7 +260,7 @@ def calculate_stdev(
     # convert to standard deviation
     total_stdev = np.sqrt(total_variance)
     log.debug(f"total stdev is {total_stdev}")
-    return total_stdev
+    return ak.to_list(total_stdev)
 
 
 def unconstrained_parameter_count(model: pyhf.pdf.Model) -> int:

diff --git a/src/cabinetry/tabulate.py b/src/cabinetry/tabulate.py
@@ -0,0 +1,85 @@
+import logging
+from typing import Any, Dict, List
+
+import numpy as np
+import pyhf
+import tabulate
+
+
+log = logging.getLogger(__name__)
+
+
+def _header_name(channel_name: str, i_bin: int) -> str:
+    """Constructs the header name for a column in a yield table.
+
+    Args:
+        channel_name (str): name of the channel (phase space region)
+        i_bin (int): index of bin in channel
+
+    Returns:
+        str: the header name to be used for the column
+    """
+    if i_bin == 0:
+        header_name = f"{channel_name}\nbin {i_bin+1}"
+    else:
+        header_name = f"\nbin {i_bin+1}"
+    return header_name
+
+
+def _yields(
+    model: pyhf.pdf.Model,
+    model_yields: List[np.ndarray],
+    total_stdev_model: List[List[float]],
+    data: List[np.ndarray],
+) -> List[Dict[str, Any]]:
+    """Outputs and returns a yield table with predicted and observed yields per bin.
+
+    Args:
+        model (pyhf.pdf.Model): the model which the table corresponds to
+        model_yields (List[np.ndarray]): yields per channel, sample, and bin
+        total_stdev_model (List[List[float]]): total model standard deviation per
+            channel and bin
+        data (List[np.ndarray]): data yield per channel and bin
+
+    Returns:
+        List[Dict[str, Any]]: yield table for use with the ``tabulate`` package
+    """
+    table = []  # table containing all yields
+
+    # rows for each individual sample
+    for i_sam, sample_name in enumerate(model.config.samples):
+        sample_dict = {"sample": sample_name}  # one dict per sample
+        for i_chan, channel_name in enumerate(model.config.channels):
+            for i_bin in range(model.config.channel_nbins[channel_name]):
+                sample_dict.update(
+                    {
+                        _header_name(
+                            channel_name, i_bin
+                        ): f"{model_yields[i_chan][i_sam][i_bin]:.2f}"
+                    }
+                )
+        table.append(sample_dict)
+
+    # dicts for total model prediction and data
+    total_dict = {"sample": "total"}
+    data_dict = {"sample": "data"}
+    for i_chan, channel_name in enumerate(model.config.channels):
+        total_model = np.sum(model_yields[i_chan], axis=0)  # sum over samples
+        for i_bin in range(model.config.channel_nbins[channel_name]):
+            total_dict.update(
+                {
+                    _header_name(channel_name, i_bin): f"{total_model[i_bin]:.2f} "
+                    f"\u00B1 {total_stdev_model[i_chan][i_bin]:.2f}"
+                }
+            )
+            data_dict.update(
+                {_header_name(channel_name, i_bin): f"{data[i_chan][i_bin]:.2f}"}
+            )
+    table += [total_dict, data_dict]
+
+    log.info(
+        "yield table:\n"
+        + tabulate.tabulate(table, headers="keys", tablefmt="fancy_grid")
+    )
+
+    return table
diff --git a/src/cabinetry/visualize.py b/src/cabinetry/visualize.py
@@ -9,6 +9,7 @@
 from . import fit
 from . import histo
 from . import model_utils
+from . import tabulate
 from . import template_builder
 
 
@@ -116,6 +117,7 @@ def data_MC(
     figure_folder: Union[str, pathlib.Path] = "figures",
     fit_results: Optional[fit.FitResults] = None,
     log_scale: Optional[bool] = None,
+    include_table: bool = True,
     method: str = "matplotlib",
 ) -> None:
     """Draws pre- and post-fit data/MC histograms from a ``pyhf`` workspace.
@@ -130,6 +132,8 @@ def data_MC(
             defaults to None (then the pre-fit configuration is drawn)
         log_scale (Optional[bool], optional): whether to use logarithmic vertical axis,
             defaults to None (automatically determine whether to use linear/log scale)
+        include_table (bool, optional): whether to also output a yield table, defaults
+            to True
         method (str, optional): backend to use for plotting, defaults to "matplotlib"
 
     Raises:
@@ -168,9 +172,16 @@ def data_MC(
         model, param_values, param_uncertainty, corr_mat
     )
 
-    for i_chan, channel_name in enumerate(
-        model.config.channels
-    ):  # process channel by channel
+    if include_table:
+        # show yield table
+        if prefit:
+            log.info("generating pre-fit yield table")
+        else:
+            log.info("generating post-fit yield table")
+        tabulate._yields(model, model_yields, total_stdev_model, data)
+
+    # process channel by channel
+    for i_chan, channel_name in enumerate(model.config.channels):
         histogram_dict_list = []  # one dict per region/channel
 
         # get the region dictionary from the config for binning / variable name

diff --git a/tests/test_model_utils.py b/tests/test_model_utils.py
@@ -139,7 +139,7 @@ def test_calculate_stdev(example_spec, example_spec_multibin):
     total_stdev = model_utils.calculate_stdev(model, parameters, uncertainty, corr_mat)
     expected_stdev = [[8.056054, 1.670629], [2.775377]]
     for i_reg in range(2):
-        assert np.allclose(ak.to_list(total_stdev[i_reg]), expected_stdev[i_reg])
+        assert np.allclose(total_stdev[i_reg], expected_stdev[i_reg])
 
 
 def test_unconstrained_parameter_count(example_spec, example_spec_shapefactor):

diff --git a/tests/test_tabulate.py b/tests/test_tabulate.py
@@ -0,0 +1,60 @@
+import numpy as np
+import pyhf
+import pytest
+
+from cabinetry import tabulate
+
+
+@pytest.mark.parametrize(
+    "test_input, expected",
+    [
+        (("abc", 0), "abc\nbin 1"),
+        (("abc", 2), "\nbin 3"),
+    ],
+)
+def test__header_name(test_input, expected):
+    assert tabulate._header_name(*test_input) == expected
+
+
+def test__yields(example_spec_multibin, example_spec_with_background):
+    # multiple channels
+    model = pyhf.Workspace(example_spec_multibin).model()
+    yields = [np.asarray([[25.0, 5.0]]), np.asarray([[8.0]])]
+    total_stdev = [[5.0, 2.0], [1.0]]
+    data = [np.asarray([35, 8]), np.asarray([10])]
+
+    yield_table = tabulate._yields(model, yields, total_stdev, data)
+    assert yield_table == [
+        {
+            "sample": "Signal",
+            "region_1\nbin 1": "25.00",
+            "\nbin 2": "5.00",
+            "region_2\nbin 1": "8.00",
+        },
+        {
+            "sample": "total",
+            "region_1\nbin 1": "25.00 \u00B1 5.00",
+            "\nbin 2": "5.00 \u00B1 2.00",
+            "region_2\nbin 1": "8.00 \u00B1 1.00",
+        },
+        {
+            "sample": "data",
+            "region_1\nbin 1": "35.00",
+            "\nbin 2": "8.00",
+            "region_2\nbin 1": "10.00",
+        },
+    ]
+
+    # multiple samples
+    model = pyhf.Workspace(example_spec_with_background).model()
+    yields = [np.asarray([[150.0], [50.0]])]
+    total_stdev = [[8.60]]
+    data = [np.asarray([160])]
+
+    yield_table = tabulate._yields(model, yields, total_stdev, data)
+    assert yield_table == [
+        {"sample": "Background", "Signal Region\nbin 1": "150.00"},
+        {"sample": "Signal", "Signal Region\nbin 1": "50.00"},
+        {"sample": "total", "Signal Region\nbin 1": "200.00 \u00B1 8.60"},
+        {"sample": "data", "Signal Region\nbin 1": "160.00"},
+    ]
diff --git a/tests/test_visualize.py b/tests/test_visualize.py
@@ -120,7 +120,8 @@ def test_data_MC_from_histograms(mock_load, mock_draw, mock_stdev):
     "cabinetry.configuration.get_region_dict",
     return_value={"Name": "region", "Variable": "x"},
 )
-@mock.patch("cabinetry.model_utils.calculate_stdev", return_value=np.asarray([[0.3]]))
+@mock.patch("cabinetry.tabulate._yields")
+@mock.patch("cabinetry.model_utils.calculate_stdev", return_value=[[0.3]])
 @mock.patch(
     "cabinetry.model_utils.get_prefit_uncertainties",
     return_value=([0.04956657, 0.0]),
@@ -130,7 +131,14 @@ def test_data_MC_from_histograms(mock_load, mock_draw, mock_stdev):
     return_value=([1.0, 1.0]),
 )
 def test_data_MC(
-    mock_asimov, mock_unc, mock_stdev, mock_dict, mock_bins, mock_draw, example_spec
+    mock_asimov,
+    mock_unc,
+    mock_stdev,
+    mock_table,
+    mock_dict,
+    mock_bins,
+    mock_draw,
+    example_spec,
 ):
     config = {}
     figure_folder = "tmp"
@@ -155,6 +163,14 @@ def test_data_MC(
     )
     assert mock_stdev.call_args_list[0][1] == {}
 
+    # yield table
+    assert mock_table.call_count == 1
+    assert mock_table.call_args_list[0][0][0].spec == model_spec
+    assert mock_table.call_args_list[0][0][1] == [np.asarray([[51.839756]])]
+    assert mock_table.call_args_list[0][0][2] == [[0.3]]
+    assert mock_table.call_args_list[0][0][3] == [np.asarray([475])]
+    assert mock_table.call_args_list[0][1] == {}
+
     assert mock_dict.call_args_list == [[(config, "Signal Region"), {}]]
     assert mock_bins.call_args_list == [[({"Name": "region", "Variable": "x"},), {}]]
 
@@ -219,6 +235,10 @@ def test_data_MC(
     )
     assert mock_draw.call_args_list[1][1] == {"log_scale": False}
 
+    # no yield table
+    visualize.data_MC(config, example_spec, include_table=False)
+    assert mock_table.call_count == 2  # 2 calls from before
+
     # unknown plotting method
     with pytest.raises(NotImplementedError, match="unknown backend: unknown"):
         visualize.data_MC(