From 54488b1016b8e104df3ec2556e21409a6bbd57bf Mon Sep 17 00:00:00 2001 From: Fynn Petersen-Frey <10599762+fyfrey@users.noreply.github.com> Date: Sat, 20 Jul 2024 21:59:27 +0200 Subject: [PATCH] feat(ml): improved ARM-NN support (#11233) --- .../docs/features/ml-hardware-acceleration.md | 1 + docs/docs/install/environment-variables.md | 27 +++++---- machine-learning/Dockerfile | 4 +- machine-learning/ann/ann.cpp | 59 ++++++++++++++----- machine-learning/ann/ann.py | 2 + machine-learning/app/config.py | 2 + machine-learning/app/sessions/ann.py | 3 +- machine-learning/app/test_main.py | 4 +- 8 files changed, 70 insertions(+), 32 deletions(-) diff --git a/docs/docs/features/ml-hardware-acceleration.md b/docs/docs/features/ml-hardware-acceleration.md index 2bcb5ee8e8..9f2d33cc35 100644 --- a/docs/docs/features/ml-hardware-acceleration.md +++ b/docs/docs/features/ml-hardware-acceleration.md @@ -32,6 +32,7 @@ You do not need to redo any machine learning jobs after enabling hardware accele - Where and how you can get this file depends on device and vendor, but typically, the device vendor also supplies these - The `hwaccel.ml.yml` file assumes the path to it is `/usr/lib/libmali.so`, so update accordingly if it is elsewhere - The `hwaccel.ml.yml` file assumes an additional file `/lib/firmware/mali_csffw.bin`, so update accordingly if your device's driver does not require this file +- Optional: Configure your `.env` file, see [environment variables](/docs/install/environment-variables) for ARM NN specific settings #### CUDA diff --git a/docs/docs/install/environment-variables.md b/docs/docs/install/environment-variables.md index e13282b45e..4f09818d8f 100644 --- a/docs/docs/install/environment-variables.md +++ b/docs/docs/install/environment-variables.md @@ -156,18 +156,21 @@ Redis (Sentinel) URL example JSON before encoding: ## Machine Learning -| Variable | Description | Default | Containers | -| :----------------------------------------------- | :------------------------------------------------------------------- | :-----------------------------------: | :--------------- | -| `MACHINE_LEARNING_MODEL_TTL` | Inactivity time (s) before a model is unloaded (disabled if \<= 0) | `300` | machine learning | -| `MACHINE_LEARNING_MODEL_TTL_POLL_S` | Interval (s) between checks for the model TTL (disabled if \<= 0) | `10` | machine learning | -| `MACHINE_LEARNING_CACHE_FOLDER` | Directory where models are downloaded | `/cache` | machine learning | -| `MACHINE_LEARNING_REQUEST_THREADS`\*1 | Thread count of the request thread pool (disabled if \<= 0) | number of CPU cores | machine learning | -| `MACHINE_LEARNING_MODEL_INTER_OP_THREADS` | Number of parallel model operations | `1` | machine learning | -| `MACHINE_LEARNING_MODEL_INTRA_OP_THREADS` | Number of threads for each model operation | `2` | machine learning | -| `MACHINE_LEARNING_WORKERS`\*2 | Number of worker processes to spawn | `1` | machine learning | -| `MACHINE_LEARNING_WORKER_TIMEOUT` | Maximum time (s) of unresponsiveness before a worker is killed | `120` (`300` if using OpenVINO image) | machine learning | -| `MACHINE_LEARNING_PRELOAD__CLIP` | Name of a CLIP model to be preloaded and kept in cache | | machine learning | -| `MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION` | Name of a facial recognition model to be preloaded and kept in cache | | machine learning | +| Variable | Description | Default | Containers | +| :----------------------------------------------- | :-------------------------------------------------------------------------------------------------- | :-----------------------------------: | :--------------- | +| `MACHINE_LEARNING_MODEL_TTL` | Inactivity time (s) before a model is unloaded (disabled if \<= 0) | `300` | machine learning | +| `MACHINE_LEARNING_MODEL_TTL_POLL_S` | Interval (s) between checks for the model TTL (disabled if \<= 0) | `10` | machine learning | +| `MACHINE_LEARNING_CACHE_FOLDER` | Directory where models are downloaded | `/cache` | machine learning | +| `MACHINE_LEARNING_REQUEST_THREADS`\*1 | Thread count of the request thread pool (disabled if \<= 0) | number of CPU cores | machine learning | +| `MACHINE_LEARNING_MODEL_INTER_OP_THREADS` | Number of parallel model operations | `1` | machine learning | +| `MACHINE_LEARNING_MODEL_INTRA_OP_THREADS` | Number of threads for each model operation | `2` | machine learning | +| `MACHINE_LEARNING_WORKERS`\*2 | Number of worker processes to spawn | `1` | machine learning | +| `MACHINE_LEARNING_WORKER_TIMEOUT` | Maximum time (s) of unresponsiveness before a worker is killed | `120` (`300` if using OpenVINO image) | machine learning | +| `MACHINE_LEARNING_PRELOAD__CLIP` | Name of a CLIP model to be preloaded and kept in cache | | machine learning | +| `MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION` | Name of a facial recognition model to be preloaded and kept in cache | | machine learning | +| `MACHINE_LEARNING_ANN` | Enable ARM-NN hardware acceleration if supported | `True` | machine learning | +| `MACHINE_LEARNING_ANN_FP16_TURBO` | Execute operations in FP16 precision: increasing speed, reducing precision (applies only to ARM-NN) | `False` | machine learning | +| `MACHINE_LEARNING_ANN_TUNING_LEVEL` | ARM-NN GPU tuning level (1: rapid, 2: normal, 3: exhaustive) | `2` | machine learning | \*1: It is recommended to begin with this parameter when changing the concurrency levels of the machine learning service and then tune the other ones. diff --git a/machine-learning/Dockerfile b/machine-learning/Dockerfile index 92d4850f38..bd4f0de870 100644 --- a/machine-learning/Dockerfile +++ b/machine-learning/Dockerfile @@ -13,7 +13,7 @@ FROM builder-cpu as builder-armnn ENV ARMNN_PATH=/opt/armnn COPY ann /opt/ann RUN mkdir /opt/armnn && \ - curl -SL "https://github.com/ARM-software/armnn/releases/download/v23.11/ArmNN-linux-aarch64.tar.gz" | tar -zx -C /opt/armnn && \ + curl -SL "https://github.com/ARM-software/armnn/releases/download/v24.05/ArmNN-linux-aarch64.tar.gz" | tar -zx -C /opt/armnn && \ cd /opt/ann && \ sh build.sh @@ -54,7 +54,7 @@ FROM prod-cpu as prod-armnn ENV LD_LIBRARY_PATH=/opt/armnn -RUN apt-get update && apt-get install -y --no-install-recommends ocl-icd-libopencl1 mesa-opencl-icd && \ +RUN apt-get update && apt-get install -y --no-install-recommends ocl-icd-libopencl1 mesa-opencl-icd libgomp1 && \ rm -rf /var/lib/apt/lists/* && \ mkdir --parents /etc/OpenCL/vendors && \ echo "/usr/lib/libmali.so" > /etc/OpenCL/vendors/mali.icd && \ diff --git a/machine-learning/ann/ann.cpp b/machine-learning/ann/ann.cpp index d0010f690b..5771759508 100644 --- a/machine-learning/ann/ann.cpp +++ b/machine-learning/ann/ann.cpp @@ -48,21 +48,22 @@ public: bool saveCachedNetwork, const char *cachedNetworkPath) { - INetworkPtr network = loadModel(modelPath); - IOptimizedNetworkPtr optNet = OptimizeNetwork(network.get(), fastMath, fp16, saveCachedNetwork, cachedNetworkPath); - const IOInfos infos = getIOInfos(optNet.get()); - NetworkId netId; - mutex.lock(); - Status status = runtime->LoadNetwork(netId, std::move(optNet)); - mutex.unlock(); - if (status != Status::Success) + NetworkId netId = -2; + while (netId == -2) { - return -1; + try + { + netId = loadInternal(modelPath, fastMath, fp16, saveCachedNetwork, cachedNetworkPath); + } + catch (InvalidArgumentException e) + { + // fp16 models do not support the forced fp16-turbo (runtime fp32->fp16 conversion) + if (fp16) + fp16 = false; + else + netId = -1; + } } - spinLock.lock(); - ioInfos[netId] = infos; - mutexes.emplace(netId, std::make_unique()); - spinLock.unlock(); return netId; } @@ -117,6 +118,8 @@ public: Ann(int tuningLevel, const char *tuningFile) { IRuntime::CreationOptions runtimeOptions; + runtimeOptions.m_ProfilingOptions.m_EnableProfiling = false; + runtimeOptions.m_ProfilingOptions.m_TimelineEnabled = false; BackendOptions backendOptions{"GpuAcc", { {"TuningLevel", tuningLevel}, @@ -133,6 +136,30 @@ public: }; private: + int loadInternal(const char *modelPath, + bool fastMath, + bool fp16, + bool saveCachedNetwork, + const char *cachedNetworkPath) + { + NetworkId netId = -1; + INetworkPtr network = loadModel(modelPath); + IOptimizedNetworkPtr optNet = OptimizeNetwork(network.get(), fastMath, fp16, saveCachedNetwork, cachedNetworkPath); + const IOInfos infos = getIOInfos(optNet.get()); + mutex.lock(); + Status status = runtime->LoadNetwork(netId, std::move(optNet)); + mutex.unlock(); + if (status != Status::Success) + { + return -1; + } + spinLock.lock(); + ioInfos[netId] = infos; + mutexes.emplace(netId, std::make_unique()); + spinLock.unlock(); + return netId; + } + INetworkPtr loadModel(const char *modelPath) { const auto path = std::string(modelPath); @@ -172,6 +199,8 @@ private: options.SetReduceFp32ToFp16(fp16); options.SetShapeInferenceMethod(shapeInferenceMethod); options.SetAllowExpandedDims(allowExpandedDims); + options.SetDebugToFileEnabled(false); + options.SetProfilingEnabled(false); BackendOptions gpuAcc("GpuAcc", {{"FastMathEnabled", fastMath}}); if (cachedNetworkPath) @@ -232,8 +261,8 @@ private: IRuntime *runtime; std::map ioInfos; std::map> mutexes; // mutex per network to not execute the same the same network concurrently - std::mutex mutex; // global mutex for load/unload calls to the runtime - SpinLock spinLock; // fast spin lock to guard access to the ioInfos and mutexes maps + std::mutex mutex; // global mutex for load/unload calls to the runtime + SpinLock spinLock; // fast spin lock to guard access to the ioInfos and mutexes maps }; extern "C" void *init(int logLevel, int tuningLevel, const char *tuningFile) diff --git a/machine-learning/ann/ann.py b/machine-learning/ann/ann.py index d3cb8bc821..a6667d50fb 100644 --- a/machine-learning/ann/ann.py +++ b/machine-learning/ann/ann.py @@ -120,6 +120,8 @@ class Ann(metaclass=_Singleton): save_cached_network, cached_network_path.encode() if cached_network_path is not None else None, ) + if net_id < 0: + raise ValueError("Cannot load model!") self.input_shapes[net_id] = tuple( self.shape(net_id, input=True, index=i) for i in range(self.tensors(net_id, input=True)) diff --git a/machine-learning/app/config.py b/machine-learning/app/config.py index 9b98eecded..af2d0aa4b9 100644 --- a/machine-learning/app/config.py +++ b/machine-learning/app/config.py @@ -30,6 +30,8 @@ class Settings(BaseSettings): model_inter_op_threads: int = 0 model_intra_op_threads: int = 0 ann: bool = True + ann_fp16_turbo: bool = False + ann_tuning_level: int = 2 preload: PreloadModelData | None = None class Config: diff --git a/machine-learning/app/sessions/ann.py b/machine-learning/app/sessions/ann.py index 618d6e9929..1882cdf70a 100644 --- a/machine-learning/app/sessions/ann.py +++ b/machine-learning/app/sessions/ann.py @@ -20,12 +20,13 @@ class AnnSession: def __init__(self, model_path: Path, cache_dir: Path = settings.cache_folder) -> None: self.model_path = model_path self.cache_dir = cache_dir - self.ann = Ann(tuning_level=3, tuning_file=(cache_dir / "gpu-tuning.ann").as_posix()) + self.ann = Ann(tuning_level=settings.ann_tuning_level, tuning_file=(cache_dir / "gpu-tuning.ann").as_posix()) log.info("Loading ANN model %s ...", model_path) self.model = self.ann.load( model_path.as_posix(), cached_network_path=model_path.with_suffix(".anncache").as_posix(), + fp16=settings.ann_fp16_turbo, ) log.info("Loaded ANN model with ID %d", self.model) diff --git a/machine-learning/app/test_main.py b/machine-learning/app/test_main.py index 2c81421e52..79bdb1321b 100644 --- a/machine-learning/app/test_main.py +++ b/machine-learning/app/test_main.py @@ -268,9 +268,9 @@ class TestAnnSession: AnnSession(model_path, cache_dir) - ann_session.assert_called_once_with(tuning_level=3, tuning_file=(cache_dir / "gpu-tuning.ann").as_posix()) + ann_session.assert_called_once_with(tuning_level=2, tuning_file=(cache_dir / "gpu-tuning.ann").as_posix()) ann_session.return_value.load.assert_called_once_with( - model_path.as_posix(), cached_network_path=model_path.with_suffix(".anncache").as_posix() + model_path.as_posix(), cached_network_path=model_path.with_suffix(".anncache").as_posix(), fp16=False ) info.assert_has_calls( [