Skip to content

Commit

Permalink
feat: yield tables (#159)
Browse files Browse the repository at this point in the history
* adding yield tables compatible with tabulate package
* yield tables integrated in visualize.data_MC
* adding tabulate as core dependency
* model_utils.calculate_stdev returns list instead of awkward array
  • Loading branch information
alexander-held authored Dec 7, 2020
1 parent fc984e4 commit 36035e7
Show file tree
Hide file tree
Showing 8 changed files with 187 additions and 9 deletions.
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ install_requires =
click
awkward1
scipy
tabulate

[options.packages.find]
where = src
Expand Down
1 change: 1 addition & 0 deletions src/cabinetry/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from . import configuration # NOQA
from . import fit # NOQA
from . import route # NOQA
from . import tabulate # NOQA
from . import template_builder # NOQA
from . import template_postprocessor # NOQA
from . import model_utils # NOQA
Expand Down
6 changes: 3 additions & 3 deletions src/cabinetry/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def calculate_stdev(
parameters: np.ndarray,
uncertainty: np.ndarray,
corr_mat: np.ndarray,
) -> ak.highlevel.Array:
) -> List[List[float]]:
"""Calculates the symmetrized yield standard deviation of a model.
Args:
Expand All @@ -189,7 +189,7 @@ def calculate_stdev(
corr_mat (np.ndarray): correlation matrix
Returns:
ak.highlevel.Array: array of channels, each channel is an array of standard
List[List[float]]: list of channels, each channel is a list of standard
deviations per bin
"""
# indices where to split to separate all bins into regions
Expand Down Expand Up @@ -260,7 +260,7 @@ def calculate_stdev(
# convert to standard deviation
total_stdev = np.sqrt(total_variance)
log.debug(f"total stdev is {total_stdev}")
return total_stdev
return ak.to_list(total_stdev)


def unconstrained_parameter_count(model: pyhf.pdf.Model) -> int:
Expand Down
85 changes: 85 additions & 0 deletions src/cabinetry/tabulate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import logging
from typing import Any, Dict, List

import numpy as np
import pyhf
import tabulate


log = logging.getLogger(__name__)


def _header_name(channel_name: str, i_bin: int) -> str:
"""Constructs the header name for a column in a yield table.
Args:
channel_name (str): name of the channel (phase space region)
i_bin (int): index of bin in channel
Returns:
str: the header name to be used for the column
"""
if i_bin == 0:
header_name = f"{channel_name}\nbin {i_bin+1}"
else:
header_name = f"\nbin {i_bin+1}"
return header_name


def _yields(
model: pyhf.pdf.Model,
model_yields: List[np.ndarray],
total_stdev_model: List[List[float]],
data: List[np.ndarray],
) -> List[Dict[str, Any]]:
"""Outputs and returns a yield table with predicted and observed yields per bin.
Args:
model (pyhf.pdf.Model): the model which the table corresponds to
model_yields (List[np.ndarray]): yields per channel, sample, and bin
total_stdev_model (List[List[float]]): total model standard deviation per
channel and bin
data (List[np.ndarray]): data yield per channel and bin
Returns:
List[Dict[str, Any]]: yield table for use with the ``tabulate`` package
"""
table = [] # table containing all yields

# rows for each individual sample
for i_sam, sample_name in enumerate(model.config.samples):
sample_dict = {"sample": sample_name} # one dict per sample
for i_chan, channel_name in enumerate(model.config.channels):
for i_bin in range(model.config.channel_nbins[channel_name]):
sample_dict.update(
{
_header_name(
channel_name, i_bin
): f"{model_yields[i_chan][i_sam][i_bin]:.2f}"
}
)
table.append(sample_dict)

# dicts for total model prediction and data
total_dict = {"sample": "total"}
data_dict = {"sample": "data"}
for i_chan, channel_name in enumerate(model.config.channels):
total_model = np.sum(model_yields[i_chan], axis=0) # sum over samples
for i_bin in range(model.config.channel_nbins[channel_name]):
total_dict.update(
{
_header_name(channel_name, i_bin): f"{total_model[i_bin]:.2f} "
f"\u00B1 {total_stdev_model[i_chan][i_bin]:.2f}"
}
)
data_dict.update(
{_header_name(channel_name, i_bin): f"{data[i_chan][i_bin]:.2f}"}
)
table += [total_dict, data_dict]

log.info(
"yield table:\n"
+ tabulate.tabulate(table, headers="keys", tablefmt="fancy_grid")
)

return table
17 changes: 14 additions & 3 deletions src/cabinetry/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from . import fit
from . import histo
from . import model_utils
from . import tabulate
from . import template_builder


Expand Down Expand Up @@ -116,6 +117,7 @@ def data_MC(
figure_folder: Union[str, pathlib.Path] = "figures",
fit_results: Optional[fit.FitResults] = None,
log_scale: Optional[bool] = None,
include_table: bool = True,
method: str = "matplotlib",
) -> None:
"""Draws pre- and post-fit data/MC histograms from a ``pyhf`` workspace.
Expand All @@ -130,6 +132,8 @@ def data_MC(
defaults to None (then the pre-fit configuration is drawn)
log_scale (Optional[bool], optional): whether to use logarithmic vertical axis,
defaults to None (automatically determine whether to use linear/log scale)
include_table (bool, optional): whether to also output a yield table, defaults
to True
method (str, optional): backend to use for plotting, defaults to "matplotlib"
Raises:
Expand Down Expand Up @@ -168,9 +172,16 @@ def data_MC(
model, param_values, param_uncertainty, corr_mat
)

for i_chan, channel_name in enumerate(
model.config.channels
): # process channel by channel
if include_table:
# show yield table
if prefit:
log.info("generating pre-fit yield table")
else:
log.info("generating post-fit yield table")
tabulate._yields(model, model_yields, total_stdev_model, data)

# process channel by channel
for i_chan, channel_name in enumerate(model.config.channels):
histogram_dict_list = [] # one dict per region/channel

# get the region dictionary from the config for binning / variable name
Expand Down
2 changes: 1 addition & 1 deletion tests/test_model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def test_calculate_stdev(example_spec, example_spec_multibin):
total_stdev = model_utils.calculate_stdev(model, parameters, uncertainty, corr_mat)
expected_stdev = [[8.056054, 1.670629], [2.775377]]
for i_reg in range(2):
assert np.allclose(ak.to_list(total_stdev[i_reg]), expected_stdev[i_reg])
assert np.allclose(total_stdev[i_reg], expected_stdev[i_reg])


def test_unconstrained_parameter_count(example_spec, example_spec_shapefactor):
Expand Down
60 changes: 60 additions & 0 deletions tests/test_tabulate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import numpy as np
import pyhf
import pytest

from cabinetry import tabulate


@pytest.mark.parametrize(
"test_input, expected",
[
(("abc", 0), "abc\nbin 1"),
(("abc", 2), "\nbin 3"),
],
)
def test__header_name(test_input, expected):
assert tabulate._header_name(*test_input) == expected


def test__yields(example_spec_multibin, example_spec_with_background):
# multiple channels
model = pyhf.Workspace(example_spec_multibin).model()
yields = [np.asarray([[25.0, 5.0]]), np.asarray([[8.0]])]
total_stdev = [[5.0, 2.0], [1.0]]
data = [np.asarray([35, 8]), np.asarray([10])]

yield_table = tabulate._yields(model, yields, total_stdev, data)
assert yield_table == [
{
"sample": "Signal",
"region_1\nbin 1": "25.00",
"\nbin 2": "5.00",
"region_2\nbin 1": "8.00",
},
{
"sample": "total",
"region_1\nbin 1": "25.00 \u00B1 5.00",
"\nbin 2": "5.00 \u00B1 2.00",
"region_2\nbin 1": "8.00 \u00B1 1.00",
},
{
"sample": "data",
"region_1\nbin 1": "35.00",
"\nbin 2": "8.00",
"region_2\nbin 1": "10.00",
},
]

# multiple samples
model = pyhf.Workspace(example_spec_with_background).model()
yields = [np.asarray([[150.0], [50.0]])]
total_stdev = [[8.60]]
data = [np.asarray([160])]

yield_table = tabulate._yields(model, yields, total_stdev, data)
assert yield_table == [
{"sample": "Background", "Signal Region\nbin 1": "150.00"},
{"sample": "Signal", "Signal Region\nbin 1": "50.00"},
{"sample": "total", "Signal Region\nbin 1": "200.00 \u00B1 8.60"},
{"sample": "data", "Signal Region\nbin 1": "160.00"},
]
24 changes: 22 additions & 2 deletions tests/test_visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,8 @@ def test_data_MC_from_histograms(mock_load, mock_draw, mock_stdev):
"cabinetry.configuration.get_region_dict",
return_value={"Name": "region", "Variable": "x"},
)
@mock.patch("cabinetry.model_utils.calculate_stdev", return_value=np.asarray([[0.3]]))
@mock.patch("cabinetry.tabulate._yields")
@mock.patch("cabinetry.model_utils.calculate_stdev", return_value=[[0.3]])
@mock.patch(
"cabinetry.model_utils.get_prefit_uncertainties",
return_value=([0.04956657, 0.0]),
Expand All @@ -130,7 +131,14 @@ def test_data_MC_from_histograms(mock_load, mock_draw, mock_stdev):
return_value=([1.0, 1.0]),
)
def test_data_MC(
mock_asimov, mock_unc, mock_stdev, mock_dict, mock_bins, mock_draw, example_spec
mock_asimov,
mock_unc,
mock_stdev,
mock_table,
mock_dict,
mock_bins,
mock_draw,
example_spec,
):
config = {}
figure_folder = "tmp"
Expand All @@ -155,6 +163,14 @@ def test_data_MC(
)
assert mock_stdev.call_args_list[0][1] == {}

# yield table
assert mock_table.call_count == 1
assert mock_table.call_args_list[0][0][0].spec == model_spec
assert mock_table.call_args_list[0][0][1] == [np.asarray([[51.839756]])]
assert mock_table.call_args_list[0][0][2] == [[0.3]]
assert mock_table.call_args_list[0][0][3] == [np.asarray([475])]
assert mock_table.call_args_list[0][1] == {}

assert mock_dict.call_args_list == [[(config, "Signal Region"), {}]]
assert mock_bins.call_args_list == [[({"Name": "region", "Variable": "x"},), {}]]

Expand Down Expand Up @@ -219,6 +235,10 @@ def test_data_MC(
)
assert mock_draw.call_args_list[1][1] == {"log_scale": False}

# no yield table
visualize.data_MC(config, example_spec, include_table=False)
assert mock_table.call_count == 2 # 2 calls from before

# unknown plotting method
with pytest.raises(NotImplementedError, match="unknown backend: unknown"):
visualize.data_MC(
Expand Down

0 comments on commit 36035e7

Please sign in to comment.