Unverified commit ccc7c9b8, authored by Adam J. Stewart and committed by GitHub

py-horovod: fix compilation of ~cuda (#15719)

* py-horovod: fix compilation of ~cuda

* Rewrite py-horovod with only 3 variants

* Add upstream patch to work around compilation issue
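
For reference, a sketch of how an install spec maps from the old boolean variants to the three consolidated ones (hypothetical command lines, in Spack spec syntax):

    Before: spack install py-horovod+pytorch+mpi+cuda gpu_allreduce=nccl gpu_broadcast=nccl
    After:  spack install py-horovod frameworks=pytorch controllers=mpi tensor_ops=nccl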
parent 6747ecde
New file var/spack/repos/builtin/packages/py-horovod/fma.patch (the upstream patch, applied for 0.19.0-0.19.1):
From 717e72f91f02d1dc3c859719ef1d804b10f88017 Mon Sep 17 00:00:00 2001
From: Nicolas V Castet <nvcastet@us.ibm.com>
Date: Mon, 30 Mar 2020 12:47:50 -0500
Subject: [PATCH] Add extra preprocessor guard for FMA optimization
Fixes #1832
Signed-off-by: Nicolas V Castet <nvcastet@us.ibm.com>
---
horovod/common/ops/adasum/adasum.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/horovod/common/ops/adasum/adasum.h b/horovod/common/ops/adasum/adasum.h
index 0330f5850..876f7f12b 100644
--- a/horovod/common/ops/adasum/adasum.h
+++ b/horovod/common/ops/adasum/adasum.h
@@ -19,7 +19,7 @@
#include <cstring>
#include <float.h>
-#if __AVX__ && __F16C__
+#if __AVX__ && __F16C__ && __FMA__
#include <emmintrin.h>
#include <immintrin.h>
#endif
@@ -104,7 +104,7 @@ template <typename Communicator_type> class Adasum {
int count, double& dotProduct,
double& anormsq, double& bnormsq,
int layerid) {
-#if __AVX__ && __F16C__
+#if __AVX__ && __F16C__ && __FMA__
if (horovod_datatype == DataType::HOROVOD_FLOAT16) {
ComputeDotAndNormSqrdsfp16((uint16_t*)a, (uint16_t*)b, count, dotProduct,
anormsq, bnormsq, layerid);
@@ -125,7 +125,7 @@ template <typename Communicator_type> class Adasum {
double acoeff, void* __restrict__ a,
double bcoeff, void* __restrict__ b,
int layerid) {
-#if __AVX__ && __F16C__
+#if __AVX__ && __F16C__ && __FMA__
if (horovod_datatype == DataType::HOROVOD_FLOAT16) {
ScaledAddfp16(count, acoeff, (uint16_t*)a, bcoeff, (uint16_t*)b, layerid);
} else
@@ -425,7 +425,7 @@ template <typename Communicator_type> class Adasum {
}
-#if __AVX__ && __F16C__
+#if __AVX__ && __F16C__ && __FMA__
inline void ComputeDotAndNormSqrdsfp16(const uint16_t* __restrict__ a,
const uint16_t* __restrict__ b,
int len, double& dotProduct,
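
The failure the patch addresses (horovod/horovod#1832) is reproducible in isolation: a compiler can have AVX and F16C enabled without FMA, while the guarded code calls FMA intrinsics such as _mm256_fmadd_ps. A minimal standalone sketch of the corrected guard (hypothetical dot8 helper, not Horovod code):

    // sketch.cpp
    // c++ -mavx -mf16c sketch.cpp        -> FMA path skipped, fallback builds
    // c++ -mavx -mf16c -mfma sketch.cpp  -> FMA path compiles
    #include <immintrin.h>

    #if __AVX__ && __F16C__ && __FMA__
    // _mm256_fmadd_ps is an FMA intrinsic: without -mfma the compiler rejects
    // it even though __AVX__ and __F16C__ are defined, which is exactly the
    // build failure the extra __FMA__ guard avoids.
    float dot8(const float* a, const float* b) {
      __m256 prod = _mm256_fmadd_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b),
                                    _mm256_setzero_ps());
      float tmp[8];
      _mm256_storeu_ps(tmp, prod);
      float sum = 0.0f;
      for (int i = 0; i < 8; ++i) sum += tmp[i];
      return sum;
    }
    #else
    // Portable fallback taken when any of AVX/F16C/FMA is unavailable.
    float dot8(const float* a, const float* b) {
      float sum = 0.0f;
      for (int i = 0; i < 8; ++i) sum += a[i] * b[i];
      return sum;
    }
    #endif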
diff --git a/var/spack/repos/builtin/packages/py-horovod/package.py b/var/spack/repos/builtin/packages/py-horovod/package.py
--- a/var/spack/repos/builtin/packages/py-horovod/package.py
+++ b/var/spack/repos/builtin/packages/py-horovod/package.py
@@ -25,96 +25,106 @@ class PyHorovod(PythonPackage):
     version('0.16.3', tag='v0.16.3', submodules=True)
     version('0.16.2', tag='v0.16.2', submodules=True)

-    # Deep learning frameworks
-    variant('pytorch', default=True, description='Enables PyTorch')
-    variant('tensorflow', default=False, description='Enables TensorFlow')
-    variant('mxnet', default=False, description='Enables Apache MXNet')
-
-    # Distributed support
-    variant('gloo', default=False, description='Enables features related to distributed support')
-    variant('mpi', default=True, description='Enables MPI build')
-
-    # GPU support
-    variant('cuda', default=True, description='Enables CUDA build')
-    variant('gpu_allreduce', default='mpi',
-            description='Backend to use for GPU_ALLREDUCE',
-            values=('mpi', 'nccl'), multi=False)
-    variant('gpu_allgather', default='mpi',
-            description='Backend to use for GPU_ALLGATHER',
-            values=('mpi',), multi=False)
-    variant('gpu_broadcast', default='mpi',
-            description='Backend to use for GPU_BROADCAST',
-            values=('mpi', 'nccl'), multi=False)
+    # https://github.com/horovod/horovod/blob/master/docs/install.rst
+    variant('frameworks', default='pytorch',
+            description='Deep learning frameworks to build support for',
+            values=('tensorflow', 'pytorch', 'mxnet', 'keras', 'spark'),
+            multi=True)
+    variant('controllers', default='mpi',
+            description='Controllers to coordinate work between processes',
+            values=('mpi', 'gloo'), multi=True)
+    variant('tensor_ops', default='nccl',
+            description='Framework to use for GPU/CPU operations',
+            values=('nccl', 'mpi', 'gloo', 'ccl'), multi=False)
+
+    # DDL support is deprecated

     # Required dependencies
     depends_on('py-setuptools', type='build')
     depends_on('py-cloudpickle', type=('build', 'run'))
     depends_on('py-psutil', type=('build', 'run'))
     depends_on('py-pyyaml', type=('build', 'run'))
     depends_on('py-six', type=('build', 'run'))

-    # Deep learning frameworks
-    depends_on('py-torch@0.4.0:', type=('build', 'run'), when='+pytorch')
-    depends_on('py-torch+cuda', type=('build', 'run'), when='+pytorch+cuda')
-    depends_on('py-cffi@1.4.0:', type=('build', 'run'), when='+pytorch')
-    depends_on('py-tensorflow@1.1.0:', type=('build', 'link', 'run'), when='+tensorflow')
-    depends_on('mxnet@1.4.0:+python', type=('build', 'link', 'run'), when='+mxnet')
-    depends_on('mxnet+cuda', type=('build', 'link', 'run'), when='+mxnet+cuda')
+    # Framework dependencies
+    depends_on('py-tensorflow@1.1.0:', type=('build', 'link', 'run'), when='frameworks=tensorflow')
+    depends_on('py-torch@0.4.0:', type=('build', 'run'), when='frameworks=pytorch')
+    depends_on('py-torchvision', type=('build', 'run'), when='frameworks=pytorch')
+    depends_on('py-cffi@1.4.0:', type=('build', 'run'), when='frameworks=pytorch')
+    depends_on('mxnet@1.4.1:+python', type=('build', 'link', 'run'), when='frameworks=mxnet')
+    depends_on('py-keras@2.0.8,2.1.2:', type=('build', 'run'), when='frameworks=keras')
+    depends_on('py-h5py@2.9:', type=('build', 'run'), when='frameworks=spark')
+    depends_on('py-numpy', type=('build', 'run'), when='frameworks=spark')
+    depends_on('py-petastorm@0.8.2', type=('build', 'run'), when='frameworks=spark')
+    depends_on('py-pyarrow@0.15.0:', type=('build', 'run'), when='frameworks=spark')
+    depends_on('py-pyspark@2.3.2:', type=('build', 'run'), when='frameworks=spark')

-    # Distributed support
+    # Controller dependencies
+    depends_on('mpi', when='controllers=mpi')
     # There does not appear to be a way to use an external Gloo installation
-    depends_on('cmake', type='build', when='+gloo')
-    depends_on('mpi', when='+mpi')
-    depends_on('mpi', when='gpu_allreduce=mpi')
-    depends_on('mpi', when='gpu_allgather=mpi')
-    depends_on('mpi', when='gpu_broadcast=mpi')
+    depends_on('cmake', type='build', when='controllers=gloo')

-    # GPU support
-    depends_on('cuda', when='+cuda')
-    depends_on('nccl@2.0:', when='gpu_allreduce=nccl')
-    depends_on('nccl@2.0:', when='gpu_broadcast=nccl')
+    # Tensor Operations dependencies
+    depends_on('nccl', when='tensor_ops=nccl')
+    depends_on('mpi', when='tensor_ops=mpi')
+    # There does not appear to be a way to use an external Gloo installation
+    depends_on('cmake', type='build', when='tensor_ops=gloo')

     # Test dependencies
     depends_on('py-mock', type='test')
     depends_on('py-pytest', type='test')
     depends_on('py-pytest-forked', type='test')

-    conflicts('+gloo', when='platform=darwin', msg='Gloo cannot be compiled on MacOS')
-    conflicts('~gloo~mpi', msg='One of Gloo or MPI are required for Horovod to run')
-    conflicts('~pytorch~tensorflow~mxnet', msg='At least one deep learning backend is required')
+    conflicts('controllers=gloo', when='platform=darwin', msg='Gloo cannot be compiled on MacOS')
+
+    # https://github.com/horovod/horovod/pull/1835
+    patch('fma.patch', when='@0.19.0:0.19.1')

     def setup_build_environment(self, env):
-        # Deep learning frameworks
-        if '~pytorch' in self.spec:
-            env.set('HOROVOD_WITHOUT_PYTORCH', 1)
-        if '~tensorflow' in self.spec:
+        # Frameworks
+        if 'frameworks=tensorflow' in self.spec:
+            env.set('HOROVOD_WITH_TENSORFLOW', 1)
+        else:
             env.set('HOROVOD_WITHOUT_TENSORFLOW', 1)
-        if '~mxnet' in self.spec:
+        if 'frameworks=pytorch' in self.spec:
+            env.set('HOROVOD_WITH_PYTORCH', 1)
+        else:
+            env.set('HOROVOD_WITHOUT_PYTORCH', 1)
+        if 'frameworks=mxnet' in self.spec:
+            env.set('HOROVOD_WITH_MXNET', 1)
+        else:
             env.set('HOROVOD_WITHOUT_MXNET', 1)

-        # Distributed support
-        if '~gloo' in self.spec:
-            env.set('HOROVOD_WITHOUT_GLOO', 1)
-        if '+mpi' in self.spec:
+        # Controllers
+        if 'controllers=mpi' in self.spec:
             env.set('HOROVOD_WITH_MPI', 1)
         else:
             env.set('HOROVOD_WITHOUT_MPI', 1)
+        if 'controllers=gloo' in self.spec:
+            env.set('HOROVOD_WITH_GLOO', 1)
+        else:
+            env.set('HOROVOD_WITHOUT_GLOO', 1)

-        # GPU support
-        if '+cuda' in self.spec:
+        # Tensor Operations
+        if 'tensor_ops=nccl' in self.spec:
+            env.set('HOROVOD_GPU', 'CUDA')
             env.set('HOROVOD_CUDA_HOME', self.spec['cuda'].prefix)
             env.set('HOROVOD_CUDA_INCLUDE',
                     self.spec['cuda'].headers.directories[0])
             env.set('HOROVOD_CUDA_LIB', self.spec['cuda'].libs.directories[0])
-        if '^nccl' in self.spec:
             env.set('HOROVOD_NCCL_HOME', self.spec['nccl'].prefix)
             env.set('HOROVOD_NCCL_INCLUDE',
                     self.spec['nccl'].headers.directories[0])
             env.set('HOROVOD_NCCL_LIB', self.spec['nccl'].libs.directories[0])
-        env.set('HOROVOD_GPU_ALLREDUCE',
-                self.spec.variants['gpu_allreduce'].value.upper())
-        env.set('HOROVOD_GPU_ALLGATHER',
-                self.spec.variants['gpu_allgather'].value.upper())
-        env.set('HOROVOD_GPU_BROADCAST',
-                self.spec.variants['gpu_broadcast'].value.upper())
-        env.set('HOROVOD_ALLOW_MIXED_GPU_IMPL', 1)
+            env.set('HOROVOD_GPU_ALLREDUCE', 'NCCL')
+            env.set('HOROVOD_GPU_BROADCAST', 'NCCL')
+        else:
+            env.set('HOROVOD_CPU_OPERATIONS',
+                    self.spec.variants['tensor_ops'].value.upper())
+
+    @run_after('install')
+    @on_package_attributes(run_tests=True)
+    def install_test(self):
+        horovodrun = Executable(self.prefix.bin.horovodrun)
+        horovodrun('--check-build')
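
As a usage sketch (hypothetical command line), installing with tests enabled exercises the new install_test, which runs horovodrun --check-build after installation to report which frameworks and controllers were actually compiled in:

    spack install --test=root py-horovod frameworks=pytorch controllers=mpi tensor_ops=nccl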