diff --git a/setup.cfg b/setup.cfg index 9277f095..0feb8470 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,6 +30,7 @@ install_requires = click awkward1 scipy + tabulate [options.packages.find] where = src diff --git a/src/cabinetry/__init__.py b/src/cabinetry/__init__.py index 1e7a4128..6a8ddd04 100644 --- a/src/cabinetry/__init__.py +++ b/src/cabinetry/__init__.py @@ -1,6 +1,7 @@ from . import configuration # NOQA from . import fit # NOQA from . import route # NOQA +from . import tabulate # NOQA from . import template_builder # NOQA from . import template_postprocessor # NOQA from . import model_utils # NOQA diff --git a/src/cabinetry/model_utils.py b/src/cabinetry/model_utils.py index c91b6a9b..e5798aad 100644 --- a/src/cabinetry/model_utils.py +++ b/src/cabinetry/model_utils.py @@ -178,7 +178,7 @@ def calculate_stdev( parameters: np.ndarray, uncertainty: np.ndarray, corr_mat: np.ndarray, -) -> ak.highlevel.Array: +) -> List[List[float]]: """Calculates the symmetrized yield standard deviation of a model. Args: @@ -189,7 +189,7 @@ def calculate_stdev( corr_mat (np.ndarray): correlation matrix Returns: - ak.highlevel.Array: array of channels, each channel is an array of standard + List[List[float]]: list of channels, each channel is a list of standard deviations per bin """ # indices where to split to separate all bins into regions @@ -260,7 +260,7 @@ def calculate_stdev( # convert to standard deviation total_stdev = np.sqrt(total_variance) log.debug(f"total stdev is {total_stdev}") - return total_stdev + return ak.to_list(total_stdev) def unconstrained_parameter_count(model: pyhf.pdf.Model) -> int: diff --git a/src/cabinetry/tabulate.py b/src/cabinetry/tabulate.py new file mode 100644 index 00000000..138aa918 --- /dev/null +++ b/src/cabinetry/tabulate.py @@ -0,0 +1,85 @@ +import logging +from typing import Any, Dict, List + +import numpy as np +import pyhf +import tabulate + + +log = logging.getLogger(__name__) + + +def _header_name(channel_name: str, i_bin: int) -> str: + """Constructs the header name for a column in a yield table. + + Args: + channel_name (str): name of the channel (phase space region) + i_bin (int): index of bin in channel + + Returns: + str: the header name to be used for the column + """ + if i_bin == 0: + header_name = f"{channel_name}\nbin {i_bin+1}" + else: + header_name = f"\nbin {i_bin+1}" + return header_name + + +def _yields( + model: pyhf.pdf.Model, + model_yields: List[np.ndarray], + total_stdev_model: List[List[float]], + data: List[np.ndarray], +) -> List[Dict[str, Any]]: + """Outputs and returns a yield table with predicted and observed yields per bin. + + Args: + model (pyhf.pdf.Model): the model which the table corresponds to + model_yields (List[np.ndarray]): yields per channel, sample, and bin + total_stdev_model (List[List[float]]): total model standard deviation per + channel and bin + data (List[np.ndarray]): data yield per channel and bin + + Returns: + List[Dict[str, Any]]: yield table for use with the ``tabulate`` package + """ + table = [] # table containing all yields + + # rows for each individual sample + for i_sam, sample_name in enumerate(model.config.samples): + sample_dict = {"sample": sample_name} # one dict per sample + for i_chan, channel_name in enumerate(model.config.channels): + for i_bin in range(model.config.channel_nbins[channel_name]): + sample_dict.update( + { + _header_name( + channel_name, i_bin + ): f"{model_yields[i_chan][i_sam][i_bin]:.2f}" + } + ) + table.append(sample_dict) + + # dicts for total model prediction and data + total_dict = {"sample": "total"} + data_dict = {"sample": "data"} + for i_chan, channel_name in enumerate(model.config.channels): + total_model = np.sum(model_yields[i_chan], axis=0) # sum over samples + for i_bin in range(model.config.channel_nbins[channel_name]): + total_dict.update( + { + _header_name(channel_name, i_bin): f"{total_model[i_bin]:.2f} " + f"\u00B1 {total_stdev_model[i_chan][i_bin]:.2f}" + } + ) + data_dict.update( + {_header_name(channel_name, i_bin): f"{data[i_chan][i_bin]:.2f}"} + ) + table += [total_dict, data_dict] + + log.info( + "yield table:\n" + + tabulate.tabulate(table, headers="keys", tablefmt="fancy_grid") + ) + + return table diff --git a/src/cabinetry/visualize.py b/src/cabinetry/visualize.py index 03488434..0606eb38 100644 --- a/src/cabinetry/visualize.py +++ b/src/cabinetry/visualize.py @@ -9,6 +9,7 @@ from . import fit from . import histo from . import model_utils +from . import tabulate from . import template_builder @@ -116,6 +117,7 @@ def data_MC( figure_folder: Union[str, pathlib.Path] = "figures", fit_results: Optional[fit.FitResults] = None, log_scale: Optional[bool] = None, + include_table: bool = True, method: str = "matplotlib", ) -> None: """Draws pre- and post-fit data/MC histograms from a ``pyhf`` workspace. @@ -130,6 +132,8 @@ def data_MC( defaults to None (then the pre-fit configuration is drawn) log_scale (Optional[bool], optional): whether to use logarithmic vertical axis, defaults to None (automatically determine whether to use linear/log scale) + include_table (bool, optional): whether to also output a yield table, defaults + to True method (str, optional): backend to use for plotting, defaults to "matplotlib" Raises: @@ -168,9 +172,16 @@ def data_MC( model, param_values, param_uncertainty, corr_mat ) - for i_chan, channel_name in enumerate( - model.config.channels - ): # process channel by channel + if include_table: + # show yield table + if prefit: + log.info("generating pre-fit yield table") + else: + log.info("generating post-fit yield table") + tabulate._yields(model, model_yields, total_stdev_model, data) + + # process channel by channel + for i_chan, channel_name in enumerate(model.config.channels): histogram_dict_list = [] # one dict per region/channel # get the region dictionary from the config for binning / variable name diff --git a/tests/test_model_utils.py b/tests/test_model_utils.py index 28ddab42..f2010561 100644 --- a/tests/test_model_utils.py +++ b/tests/test_model_utils.py @@ -139,7 +139,7 @@ def test_calculate_stdev(example_spec, example_spec_multibin): total_stdev = model_utils.calculate_stdev(model, parameters, uncertainty, corr_mat) expected_stdev = [[8.056054, 1.670629], [2.775377]] for i_reg in range(2): - assert np.allclose(ak.to_list(total_stdev[i_reg]), expected_stdev[i_reg]) + assert np.allclose(total_stdev[i_reg], expected_stdev[i_reg]) def test_unconstrained_parameter_count(example_spec, example_spec_shapefactor): diff --git a/tests/test_tabulate.py b/tests/test_tabulate.py new file mode 100644 index 00000000..8189bcc9 --- /dev/null +++ b/tests/test_tabulate.py @@ -0,0 +1,60 @@ +import numpy as np +import pyhf +import pytest + +from cabinetry import tabulate + + +@pytest.mark.parametrize( + "test_input, expected", + [ + (("abc", 0), "abc\nbin 1"), + (("abc", 2), "\nbin 3"), + ], +) +def test__header_name(test_input, expected): + assert tabulate._header_name(*test_input) == expected + + +def test__yields(example_spec_multibin, example_spec_with_background): + # multiple channels + model = pyhf.Workspace(example_spec_multibin).model() + yields = [np.asarray([[25.0, 5.0]]), np.asarray([[8.0]])] + total_stdev = [[5.0, 2.0], [1.0]] + data = [np.asarray([35, 8]), np.asarray([10])] + + yield_table = tabulate._yields(model, yields, total_stdev, data) + assert yield_table == [ + { + "sample": "Signal", + "region_1\nbin 1": "25.00", + "\nbin 2": "5.00", + "region_2\nbin 1": "8.00", + }, + { + "sample": "total", + "region_1\nbin 1": "25.00 \u00B1 5.00", + "\nbin 2": "5.00 \u00B1 2.00", + "region_2\nbin 1": "8.00 \u00B1 1.00", + }, + { + "sample": "data", + "region_1\nbin 1": "35.00", + "\nbin 2": "8.00", + "region_2\nbin 1": "10.00", + }, + ] + + # multiple samples + model = pyhf.Workspace(example_spec_with_background).model() + yields = [np.asarray([[150.0], [50.0]])] + total_stdev = [[8.60]] + data = [np.asarray([160])] + + yield_table = tabulate._yields(model, yields, total_stdev, data) + assert yield_table == [ + {"sample": "Background", "Signal Region\nbin 1": "150.00"}, + {"sample": "Signal", "Signal Region\nbin 1": "50.00"}, + {"sample": "total", "Signal Region\nbin 1": "200.00 \u00B1 8.60"}, + {"sample": "data", "Signal Region\nbin 1": "160.00"}, + ] diff --git a/tests/test_visualize.py b/tests/test_visualize.py index 95003779..920a6d29 100644 --- a/tests/test_visualize.py +++ b/tests/test_visualize.py @@ -120,7 +120,8 @@ def test_data_MC_from_histograms(mock_load, mock_draw, mock_stdev): "cabinetry.configuration.get_region_dict", return_value={"Name": "region", "Variable": "x"}, ) -@mock.patch("cabinetry.model_utils.calculate_stdev", return_value=np.asarray([[0.3]])) +@mock.patch("cabinetry.tabulate._yields") +@mock.patch("cabinetry.model_utils.calculate_stdev", return_value=[[0.3]]) @mock.patch( "cabinetry.model_utils.get_prefit_uncertainties", return_value=([0.04956657, 0.0]), @@ -130,7 +131,14 @@ def test_data_MC_from_histograms(mock_load, mock_draw, mock_stdev): return_value=([1.0, 1.0]), ) def test_data_MC( - mock_asimov, mock_unc, mock_stdev, mock_dict, mock_bins, mock_draw, example_spec + mock_asimov, + mock_unc, + mock_stdev, + mock_table, + mock_dict, + mock_bins, + mock_draw, + example_spec, ): config = {} figure_folder = "tmp" @@ -155,6 +163,14 @@ def test_data_MC( ) assert mock_stdev.call_args_list[0][1] == {} + # yield table + assert mock_table.call_count == 1 + assert mock_table.call_args_list[0][0][0].spec == model_spec + assert mock_table.call_args_list[0][0][1] == [np.asarray([[51.839756]])] + assert mock_table.call_args_list[0][0][2] == [[0.3]] + assert mock_table.call_args_list[0][0][3] == [np.asarray([475])] + assert mock_table.call_args_list[0][1] == {} + assert mock_dict.call_args_list == [[(config, "Signal Region"), {}]] assert mock_bins.call_args_list == [[({"Name": "region", "Variable": "x"},), {}]] @@ -219,6 +235,10 @@ def test_data_MC( ) assert mock_draw.call_args_list[1][1] == {"log_scale": False} + # no yield table + visualize.data_MC(config, example_spec, include_table=False) + assert mock_table.call_count == 2 # 2 calls from before + # unknown plotting method with pytest.raises(NotImplementedError, match="unknown backend: unknown"): visualize.data_MC(