Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cherry pick hyperopt plots #2567

Merged
merged 2 commits into from
Sep 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion ludwig/utils/visualization_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import numpy as np
import pandas as pd
import ptitprince as pt
from packaging import version

from ludwig.constants import SPACE, TRAINING, VALIDATION
Expand Down Expand Up @@ -1369,7 +1370,27 @@ def hyperopt_float_plot(hyperopt_results_df, hp_name, metric, title, filename, l
def hyperopt_category_plot(hyperopt_results_df, hp_name, metric, title, filename, log_scale=True):
sns.set_style("whitegrid")
plt.figure()
seaborn_figure = sns.violinplot(x=hp_name, y=metric, data=hyperopt_results_df, fit_reg=False)

# Ensure that all parameter values have at least 2 trials, otherwise the Raincloud Plot will create awkward
# looking "flat clouds" in the cloud part of the plot (the "rain" part is ok with 1 trial). In this case,
# just use stripplots since they are categorical scatter plots.
parameter_to_trial_count = hyperopt_results_df[hp_name].value_counts()
parameter_to_trial_count = parameter_to_trial_count[parameter_to_trial_count < 2]

if len(parameter_to_trial_count) != 0:
seaborn_figure = sns.stripplot(x=hp_name, y=metric, data=hyperopt_results_df, size=5)
else:
seaborn_figure = pt.RainCloud(
x=hp_name,
y=metric,
data=hyperopt_results_df,
palette="Set2",
bw=0.2,
width_viol=0.7,
point_size=6,
cut=1,
)

seaborn_figure.set_title(title)
seaborn_figure.set(ylabel=metric)
sns.despine()
Expand All @@ -1387,6 +1408,10 @@ def hyperopt_pair_plot(hyperopt_results_df, metric, title, filename):
params.remove(metric)
num_param = len(params)

# Pair plot is empty if there's only 1 parameter, so skip creating a pair plot
if num_param == 1:
return

sns.set_style("white")
fig = plt.figure(figsize=(20, 20))
fig.suptitle(title)
Expand Down
3 changes: 2 additions & 1 deletion requirements_viz.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
matplotlib>=3.4; python_version > '3.6'
matplotlib>=3.0,<3.4; python_version <= '3.6'
seaborn>=0.7
seaborn>=0.7,<0.12
hiplot
ptitprince
73 changes: 48 additions & 25 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,28 +56,37 @@ def yaml_filename():


@pytest.fixture(scope="module")
def hyperopt_results():
"""This function generates hyperopt results."""
input_features = [
text_feature(name="utterance", encoder={"cell_type": "lstm", "reduce_output": "sum"}),
category_feature(encoder={"vocab_size": 2}, reduce_input="sum"),
]

output_features = [category_feature(decoder={"vocab_size": 2}, reduce_input="sum")]

csv_filename = uuid.uuid4().hex[:10].upper() + ".csv"
rel_path = generate_data(input_features, output_features, csv_filename)

config = {
INPUT_FEATURES: input_features,
OUTPUT_FEATURES: output_features,
COMBINER: {TYPE: "concat", "num_fc_layers": 2},
TRAINER: {EPOCHS: 2, "learning_rate": 0.001},
def hyperopt_results_single_parameter():
    """Run a hyperopt experiment whose search space has a single parameter.

    Starts from the sample config produced by ``_get_sample_config``, attaches
    a hyperopt section that searches only over ``trainer.learning_rate``, and
    runs ``hyperopt`` on the generated dataset.

    Returns:
        Absolute path of the ``results/hyperopt_test`` experiment directory.
    """
    config, rel_path = _get_sample_config()

    # The only searched parameter: a log-uniform learning rate.
    learning_rate_space = {
        "space": "loguniform",
        "lower": 0.0001,
        "upper": 0.01,
    }
    config[HYPEROPT] = {
        "parameters": {"trainer.learning_rate": learning_rate_space},
        "goal": "minimize",
        "output_feature": config[OUTPUT_FEATURES][0][NAME],
        "validation_metrics": "loss",
        "executor": {"type": "ray", "num_samples": 2},
        "search_alg": {"type": "variant_generator"},
    }

    output_directory = "results"
    experiment_name = "hyperopt_test"
    # Prevent resume from failure since this results in failures in other tests
    hyperopt(
        config,
        dataset=rel_path,
        output_directory=output_directory,
        experiment_name=experiment_name,
        resume=False,
    )
    return os.path.join(os.path.abspath(output_directory), experiment_name)

output_feature_name = output_features[0][NAME]

hyperopt_configs = {
@pytest.fixture(scope="module")
def hyperopt_results_multiple_parameters():
config, rel_path = _get_sample_config()
output_feature_name = config[OUTPUT_FEATURES][0][NAME]
config[HYPEROPT] = {
"parameters": {
"trainer.learning_rate": {
"space": "loguniform",
Expand All @@ -98,12 +107,8 @@ def hyperopt_results():
"type": "variant_generator",
},
}

# add hyperopt parameter space to the config
config[HYPEROPT] = hyperopt_configs

hyperopt(config, dataset=rel_path, output_directory="results", experiment_name="hyperopt_test")

# Prevent resume from failure since this results in failures in other tests
hyperopt(config, dataset=rel_path, output_directory="results", experiment_name="hyperopt_test", resume=False)
return os.path.join(os.path.abspath("results"), "hyperopt_test")


Expand Down Expand Up @@ -167,3 +172,21 @@ def _get_default_system_config():
"object_store_full_delay_ms": 100,
}
return system_config


def _get_sample_config():
    """Build a sample model config and generate a dataset to go with it.

    Returns:
        A ``(config, rel_path)`` tuple. ``config`` is a dict holding the input
        features, output features, combiner and trainer sections; ``rel_path``
        is whatever ``generate_data`` returns for the generated CSV file.
    """
    # Features are created in the same order as they appear in the config.
    utterance = text_feature(name="utterance", encoder={"cell_type": "lstm", "reduce_output": "sum"})
    category_in = category_feature(encoder={"vocab_size": 2}, reduce_input="sum")
    category_out = category_feature(decoder={"vocab_size": 2}, reduce_input="sum")
    input_features = [utterance, category_in]
    output_features = [category_out]

    # Random, upper-cased 10-character hex stem for the CSV file name.
    csv_filename = f"{uuid.uuid4().hex[:10].upper()}.csv"
    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        INPUT_FEATURES: input_features,
        OUTPUT_FEATURES: output_features,
        COMBINER: {TYPE: "concat", "num_fc_layers": 2},
        TRAINER: {EPOCHS: 2, "learning_rate": 0.001},
    }
    return config, rel_path
41 changes: 37 additions & 4 deletions tests/integration_tests/test_visualization_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -844,10 +844,17 @@ def test_frequency_vs_f1_vis_api(experiment_to_use):


@pytest.mark.distributed
def test_hyperopt_report_vis_api(hyperopt_results, tmpdir):
def test_hyperopt_report_vis_api(hyperopt_results_multiple_parameters, tmpdir):
vis_dir = os.path.join(tmpdir, "visualizations")

visualize.hyperopt_report(os.path.join(hyperopt_results, HYPEROPT_STATISTICS_FILE_NAME), output_directory=vis_dir)
# Ensure visualizations directory is empty before creating plots
if os.path.exists(vis_dir):
for f in os.listdir(vis_dir):
os.remove(os.path.join(vis_dir, f))

visualize.hyperopt_report(
os.path.join(hyperopt_results_multiple_parameters, HYPEROPT_STATISTICS_FILE_NAME), output_directory=vis_dir
)

# test for creation of output directory
assert os.path.isdir(vis_dir)
Expand All @@ -857,13 +864,39 @@ def test_hyperopt_report_vis_api(hyperopt_results, tmpdir):


@pytest.mark.distributed
def test_hyperopt_hiplot_vis_api(hyperopt_results, tmpdir):
def test_hyperopt_hiplot_vis_api(hyperopt_results_multiple_parameters, tmpdir):
vis_dir = os.path.join(tmpdir, "visualizations")

visualize.hyperopt_hiplot(os.path.join(hyperopt_results, HYPEROPT_STATISTICS_FILE_NAME), output_directory=vis_dir)
# Ensure visualizations directory is empty before creating plots
if os.path.exists(vis_dir):
for f in os.listdir(vis_dir):
os.remove(os.path.join(vis_dir, f))

visualize.hyperopt_hiplot(
os.path.join(hyperopt_results_multiple_parameters, HYPEROPT_STATISTICS_FILE_NAME), output_directory=vis_dir
)

# test for creation of output directory
assert os.path.isdir(vis_dir)

# test for generated html page
assert os.path.isfile(os.path.join(vis_dir, "hyperopt_hiplot.html"))


@pytest.mark.distributed
def test_hyperopt_report_vis_api_no_pairplot(hyperopt_results_single_parameter, tmpdir):
    """Check that a single-parameter hyperopt report yields exactly one output file."""
    vis_dir = os.path.join(tmpdir, "visualizations")

    # Ensure visualizations directory is empty before creating plots
    if os.path.exists(vis_dir):
        for stale_name in os.listdir(vis_dir):
            os.remove(os.path.join(vis_dir, stale_name))

    stats_path = os.path.join(hyperopt_results_single_parameter, HYPEROPT_STATISTICS_FILE_NAME)
    visualize.hyperopt_report(stats_path, output_directory=vis_dir)

    # Only create plot for single parameter and skip pairplot creation
    generated_files = glob.glob(os.path.join(vis_dir, "*"))
    assert len(generated_files) == 1