Update dependencies and package organization. #52

Merged (3 commits, Dec 11, 2023).
4 changes: 2 additions & 2 deletions Dockerfile
@@ -1,6 +1,6 @@
-FROM nvcr.io/nvidia/pytorch:23.01-py3
+FROM nvcr.io/nvidia/pytorch:23.09-py3
 
-RUN pip install git+https://github.com/stanford-futuredata/stk.git@main
+RUN pip install stanford-stk==0.0.6
 
 RUN pip install flash-attn
14 changes: 11 additions & 3 deletions README.md
@@ -12,11 +12,19 @@ MegaBlocks dMoEs outperform MoEs trained with [Tutel](https://github.com/microso
 
 # :building_construction: Installation
 
-Note: this assumes you have `numpy` and `torch` installed
+NOTE: This assumes you have `numpy` and `torch` installed.
 
-**Training models with Megatron-LM:** We recommend using NGC's [`nvcr.io/nvidia/pytorch:23.01-py3`](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) PyTorch container. The [Dockerfile](Dockerfile) builds on this image with additional dependencies. To build the image, run `docker build . -t megablocks-dev` and then `bash docker.sh` to launch the container. Once inside the container, install MegaBlocks with `pip install .`. See [Usage](#steam_locomotive-usage) for instructions on training MoEs with MegaBlocks + Megatron-LM.
+**Training models with Megatron-LM:** We recommend using NGC's [`nvcr.io/nvidia/pytorch:23.09-py3`](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) PyTorch container. The [Dockerfile](Dockerfile) builds on this image with additional dependencies. To build the image, run `docker build . -t megablocks-dev` and then `bash docker.sh` to launch the container. Once inside the container, install MegaBlocks with `pip install .`. See [Usage](#steam_locomotive-usage) for instructions on training MoEs with MegaBlocks + Megatron-LM.
 
-**Using MegaBlocks in other packages:** To install the MegaBlocks package for use in other frameworks, run `pip install megablocks`.
+**Using MegaBlocks in other packages:** To install the MegaBlocks package for use in other frameworks, run `pip install megablocks`. For example, [Mixtral-8x7B](https://mistral.ai/news/mixtral-of-experts/) can be run with [vLLM](https://github.com/vllm-project/vllm) + MegaBlocks with this installation method.
 
+**Extras:** MegaBlocks has optional dependencies that enable additional features.
+
+Installing `megablocks[quant]` enables configurable quantization of saved activations in the dMoE layer to save memory during training. The degree of quantization is controlled via the `quantize_inputs_num_bits`, `quantize_rematerialize_num_bits` and `quantize_scatter_num_bits` [arguments](https://github.com/stanford-futuredata/megablocks/blob/main/megablocks/layers/arguments.py).
+
+Installing `megablocks[gg]` enables dMoE computation with grouped GEMM. This feature is enabled by setting the `grouped_mlp` argument to the dMoE layer. This is currently our recommended path for Hopper-generation GPUs.
+
+MegaBlocks can be installed with all dependencies via the `megablocks[all]` package.
 
 # :steam_locomotive: Usage
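To make the new extras concrete, here is a minimal, illustrative sketch of how the quantization and grouped-GEMM options described above might be passed to a dMoE configuration. Only the `quantize_*_num_bits` and `grouped_mlp` names come from the PR text; the other constructor fields, their values, and the `dmoe.dMoE(args)` call are assumptions about the API rather than a verified signature.

```python
# Illustrative sketch only: the quantize_* and grouped_mlp names follow the
# README text above; the remaining fields and the dMoE constructor call are
# assumptions (see megablocks/layers/arguments.py for the real definitions).
from megablocks.layers.arguments import Arguments
from megablocks.layers import dmoe

args = Arguments(
    hidden_size=1024,        # assumed model width
    ffn_hidden_size=4096,    # assumed expert MLP width
    moe_num_experts=8,       # assumed number of experts
    moe_top_k=2,             # assumed router top-k
    # megablocks[quant]: quantize saved activations to reduce training memory.
    quantize_inputs_num_bits=8,
    quantize_rematerialize_num_bits=8,
    quantize_scatter_num_bits=8,
    # megablocks[gg]: route expert MLP computation through grouped GEMM.
    grouped_mlp=True,
)

layer = dmoe.dMoE(args)  # hypothetical construction of the dMoE layer
```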
3 changes: 2 additions & 1 deletion megablocks/layers/dmoe_test.py
@@ -2,6 +2,7 @@
 from functools import partial
 
 from absl.testing import parameterized
+from megablocks import grouped_gemm_util as gg
 from megablocks import turbo_util as turbo
 from megablocks.layers.arguments import Arguments
 from megablocks.layers import dmoe
@@ -74,7 +75,7 @@ def test_modules(
 
 _FORWARD_TESTS_GROUPED_MLP = tuple([
     p + (-1, -1, True) for p in _FORWARD_TESTS_NO_QUANTIZE
-])
+]) if gg.grouped_gemm_is_available() else ()
 
 # quantization tests; assorted small sizes, systematic bitwidths
 _FORWARD_TESTS_QUANTIZE_HIDDEN = (
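The test change above collapses the grouped-MLP parameter tuple to `()` when the optional grouped_gemm backend is unavailable, so those cases are simply not generated. A self-contained sketch of the same pattern is below; `backend_available` is a hypothetical stand-in for `gg.grouped_gemm_is_available()`, and the test cases are made up for illustration.

```python
# Sketch of the conditional-parameterization pattern: when the optional
# backend is missing, the extra cases collapse to an empty tuple and the
# absl parameterized runner only sees the base cases.
from absl.testing import absltest, parameterized


def backend_available():
    # Hypothetical stand-in for gg.grouped_gemm_is_available().
    try:
        import grouped_gemm  # noqa: F401
        return True
    except ImportError:
        return False


_BASE_CASES = ((8, 16, False), (16, 32, False))
_GROUPED_CASES = tuple(
    p[:2] + (True,) for p in _BASE_CASES
) if backend_available() else ()


class ForwardTest(parameterized.TestCase):

    # Grouped cases are appended to the base cases, so the decorator always
    # receives at least one parameterization even without the backend.
    @parameterized.parameters(*(_BASE_CASES + _GROUPED_CASES))
    def test_forward(self, batch_size, seq_len, grouped_mlp):
        self.assertGreater(batch_size * seq_len, 0)


if __name__ == '__main__':
    absltest.main()
```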
8 changes: 2 additions & 6 deletions requirements.txt
@@ -1,6 +1,2 @@
-torch
-numpy
-absl-py
-stanford-stk @ git+https://github.com/stanford-futuredata/stk.git@main
-grouped_gemm @ git+https://github.com/tgale96/grouped_gemm@main
-mosaicml-turbo==0.0.4
+stanford-stk>=0.0.6
+triton==2.1.0
17 changes: 12 additions & 5 deletions setup.py
@@ -21,15 +21,22 @@
 ]
 
 install_requires=[
-    'stanford-stk @ git+https://github.com/stanford-futuredata/stk.git@main',
-    'grouped_gemm @ git+https://github.com/tgale96/grouped_gemm@main',
-    'mosaicml-turbo==0.0.4',
+    "triton==2.1.0",
+    "stanford-stk>=0.0.6",
 ]
 
 extra_deps = {}
 
-extra_deps['dev'] = [
-    'absl-py',
+extra_deps["gg"] = [
+    "grouped_gemm",
+]
+
+extra_deps["quant"] = [
+    "mosaicml-turbo==0.0.4",
+]
+
+extra_deps["dev"] = [
+    "absl-py",
 ]
 
 extra_deps['all'] = set(dep for deps in extra_deps.values() for dep in deps)

Review comments

On `"triton==2.1.0",`:

Contributor: Does this need a strict pin? It might cause problems down the road if someone else (e.g. torch) starts upgrading. Could we lower-bound it?

Contributor (author): I'm concerned about Triton breaking us with a newer release. We could make it >=2.1.0, but that's a little risky.

On `"mosaicml-turbo==0.0.4",`:

Contributor: Suggested change: "mosaicml-turbo==0.0.4" -> "mosaicml-turbo==0.0.5". @dblalock can we bump to 0.0.5?

Contributor (author): Let's do this in a separate PR.

Contributor: +1, I'd rather not touch anything right now.
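For context, this is roughly how the reorganized dependency groups plug into setuptools. Only the `install_requires` and `extra_deps` entries mirror the diff above; the `setup()` metadata (name, version, packages) is a placeholder sketch, not the project's actual setup.py.

```python
# Illustrative skeleton: dependency lists mirror the diff above, while the
# package metadata is assumed for the sake of a runnable example.
from setuptools import setup, find_packages

install_requires = [
    "triton==2.1.0",
    "stanford-stk>=0.0.6",
]

extra_deps = {}
extra_deps["gg"] = ["grouped_gemm"]
extra_deps["quant"] = ["mosaicml-turbo==0.0.4"]
extra_deps["dev"] = ["absl-py"]

# "all" aggregates every optional group into a single extra.
extra_deps["all"] = list(set(dep for deps in extra_deps.values() for dep in deps))

setup(
    name="megablocks",
    version="0.0.0",  # placeholder version
    packages=find_packages(),
    install_requires=install_requires,
    extras_require=extra_deps,
)
```

A user then opts into a group at install time, for example `pip install megablocks[gg]` or `pip install megablocks[all]`.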