mirror of
https://github.com/immich-app/immich.git
synced 2024-12-31 00:43:56 -05:00
feat(ml): improved ARM-NN support (#11233)
This commit is contained in:
parent
7c3326b662
commit
54488b1016
8 changed files with 70 additions and 32 deletions
|
@ -32,6 +32,7 @@ You do not need to redo any machine learning jobs after enabling hardware accele
|
|||
- Where and how you can get this file depends on device and vendor, but typically, the device vendor also supplies these
|
||||
- The `hwaccel.ml.yml` file assumes the path to it is `/usr/lib/libmali.so`, so update accordingly if it is elsewhere
|
||||
- The `hwaccel.ml.yml` file assumes an additional file `/lib/firmware/mali_csffw.bin`, so update accordingly if your device's driver does not require this file
|
||||
- Optional: Configure your `.env` file, see [environment variables](/docs/install/environment-variables) for ARM NN specific settings
|
||||
|
||||
#### CUDA
|
||||
|
||||
|
|
|
@ -156,18 +156,21 @@ Redis (Sentinel) URL example JSON before encoding:
|
|||
|
||||
## Machine Learning
|
||||
|
||||
| Variable | Description | Default | Containers |
|
||||
| :----------------------------------------------- | :------------------------------------------------------------------- | :-----------------------------------: | :--------------- |
|
||||
| `MACHINE_LEARNING_MODEL_TTL` | Inactivity time (s) before a model is unloaded (disabled if \<= 0) | `300` | machine learning |
|
||||
| `MACHINE_LEARNING_MODEL_TTL_POLL_S` | Interval (s) between checks for the model TTL (disabled if \<= 0) | `10` | machine learning |
|
||||
| `MACHINE_LEARNING_CACHE_FOLDER` | Directory where models are downloaded | `/cache` | machine learning |
|
||||
| `MACHINE_LEARNING_REQUEST_THREADS`<sup>\*1</sup> | Thread count of the request thread pool (disabled if \<= 0) | number of CPU cores | machine learning |
|
||||
| `MACHINE_LEARNING_MODEL_INTER_OP_THREADS` | Number of parallel model operations | `1` | machine learning |
|
||||
| `MACHINE_LEARNING_MODEL_INTRA_OP_THREADS` | Number of threads for each model operation | `2` | machine learning |
|
||||
| `MACHINE_LEARNING_WORKERS`<sup>\*2</sup> | Number of worker processes to spawn | `1` | machine learning |
|
||||
| `MACHINE_LEARNING_WORKER_TIMEOUT` | Maximum time (s) of unresponsiveness before a worker is killed | `120` (`300` if using OpenVINO image) | machine learning |
|
||||
| `MACHINE_LEARNING_PRELOAD__CLIP` | Name of a CLIP model to be preloaded and kept in cache | | machine learning |
|
||||
| `MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION` | Name of a facial recognition model to be preloaded and kept in cache | | machine learning |
|
||||
| Variable | Description | Default | Containers |
|
||||
| :----------------------------------------------- | :-------------------------------------------------------------------------------------------------- | :-----------------------------------: | :--------------- |
|
||||
| `MACHINE_LEARNING_MODEL_TTL` | Inactivity time (s) before a model is unloaded (disabled if \<= 0) | `300` | machine learning |
|
||||
| `MACHINE_LEARNING_MODEL_TTL_POLL_S` | Interval (s) between checks for the model TTL (disabled if \<= 0) | `10` | machine learning |
|
||||
| `MACHINE_LEARNING_CACHE_FOLDER` | Directory where models are downloaded | `/cache` | machine learning |
|
||||
| `MACHINE_LEARNING_REQUEST_THREADS`<sup>\*1</sup> | Thread count of the request thread pool (disabled if \<= 0) | number of CPU cores | machine learning |
|
||||
| `MACHINE_LEARNING_MODEL_INTER_OP_THREADS` | Number of parallel model operations | `1` | machine learning |
|
||||
| `MACHINE_LEARNING_MODEL_INTRA_OP_THREADS` | Number of threads for each model operation | `2` | machine learning |
|
||||
| `MACHINE_LEARNING_WORKERS`<sup>\*2</sup> | Number of worker processes to spawn | `1` | machine learning |
|
||||
| `MACHINE_LEARNING_WORKER_TIMEOUT` | Maximum time (s) of unresponsiveness before a worker is killed | `120` (`300` if using OpenVINO image) | machine learning |
|
||||
| `MACHINE_LEARNING_PRELOAD__CLIP` | Name of a CLIP model to be preloaded and kept in cache | | machine learning |
|
||||
| `MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION` | Name of a facial recognition model to be preloaded and kept in cache | | machine learning |
|
||||
| `MACHINE_LEARNING_ANN` | Enable ARM-NN hardware acceleration if supported | `True` | machine learning |
|
||||
| `MACHINE_LEARNING_ANN_FP16_TURBO` | Execute operations in FP16 precision: increasing speed, reducing precision (applies only to ARM-NN) | `False` | machine learning |
|
||||
| `MACHINE_LEARNING_ANN_TUNING_LEVEL` | ARM-NN GPU tuning level (1: rapid, 2: normal, 3: exhaustive) | `2` | machine learning |
|
||||
|
||||
\*1: It is recommended to begin with this parameter when changing the concurrency levels of the machine learning service and then tune the other ones.
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ FROM builder-cpu as builder-armnn
|
|||
ENV ARMNN_PATH=/opt/armnn
|
||||
COPY ann /opt/ann
|
||||
RUN mkdir /opt/armnn && \
|
||||
curl -SL "https://github.com/ARM-software/armnn/releases/download/v23.11/ArmNN-linux-aarch64.tar.gz" | tar -zx -C /opt/armnn && \
|
||||
curl -SL "https://github.com/ARM-software/armnn/releases/download/v24.05/ArmNN-linux-aarch64.tar.gz" | tar -zx -C /opt/armnn && \
|
||||
cd /opt/ann && \
|
||||
sh build.sh
|
||||
|
||||
|
@ -54,7 +54,7 @@ FROM prod-cpu as prod-armnn
|
|||
|
||||
ENV LD_LIBRARY_PATH=/opt/armnn
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends ocl-icd-libopencl1 mesa-opencl-icd && \
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends ocl-icd-libopencl1 mesa-opencl-icd libgomp1 && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
mkdir --parents /etc/OpenCL/vendors && \
|
||||
echo "/usr/lib/libmali.so" > /etc/OpenCL/vendors/mali.icd && \
|
||||
|
|
|
@ -48,21 +48,22 @@ public:
|
|||
bool saveCachedNetwork,
|
||||
const char *cachedNetworkPath)
|
||||
{
|
||||
INetworkPtr network = loadModel(modelPath);
|
||||
IOptimizedNetworkPtr optNet = OptimizeNetwork(network.get(), fastMath, fp16, saveCachedNetwork, cachedNetworkPath);
|
||||
const IOInfos infos = getIOInfos(optNet.get());
|
||||
NetworkId netId;
|
||||
mutex.lock();
|
||||
Status status = runtime->LoadNetwork(netId, std::move(optNet));
|
||||
mutex.unlock();
|
||||
if (status != Status::Success)
|
||||
NetworkId netId = -2;
|
||||
while (netId == -2)
|
||||
{
|
||||
return -1;
|
||||
try
|
||||
{
|
||||
netId = loadInternal(modelPath, fastMath, fp16, saveCachedNetwork, cachedNetworkPath);
|
||||
}
|
||||
catch (InvalidArgumentException e)
|
||||
{
|
||||
// fp16 models do not support the forced fp16-turbo (runtime fp32->fp16 conversion)
|
||||
if (fp16)
|
||||
fp16 = false;
|
||||
else
|
||||
netId = -1;
|
||||
}
|
||||
}
|
||||
spinLock.lock();
|
||||
ioInfos[netId] = infos;
|
||||
mutexes.emplace(netId, std::make_unique<std::mutex>());
|
||||
spinLock.unlock();
|
||||
return netId;
|
||||
}
|
||||
|
||||
|
@ -117,6 +118,8 @@ public:
|
|||
Ann(int tuningLevel, const char *tuningFile)
|
||||
{
|
||||
IRuntime::CreationOptions runtimeOptions;
|
||||
runtimeOptions.m_ProfilingOptions.m_EnableProfiling = false;
|
||||
runtimeOptions.m_ProfilingOptions.m_TimelineEnabled = false;
|
||||
BackendOptions backendOptions{"GpuAcc",
|
||||
{
|
||||
{"TuningLevel", tuningLevel},
|
||||
|
@ -133,6 +136,30 @@ public:
|
|||
};
|
||||
|
||||
private:
|
||||
int loadInternal(const char *modelPath,
|
||||
bool fastMath,
|
||||
bool fp16,
|
||||
bool saveCachedNetwork,
|
||||
const char *cachedNetworkPath)
|
||||
{
|
||||
NetworkId netId = -1;
|
||||
INetworkPtr network = loadModel(modelPath);
|
||||
IOptimizedNetworkPtr optNet = OptimizeNetwork(network.get(), fastMath, fp16, saveCachedNetwork, cachedNetworkPath);
|
||||
const IOInfos infos = getIOInfos(optNet.get());
|
||||
mutex.lock();
|
||||
Status status = runtime->LoadNetwork(netId, std::move(optNet));
|
||||
mutex.unlock();
|
||||
if (status != Status::Success)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
spinLock.lock();
|
||||
ioInfos[netId] = infos;
|
||||
mutexes.emplace(netId, std::make_unique<std::mutex>());
|
||||
spinLock.unlock();
|
||||
return netId;
|
||||
}
|
||||
|
||||
INetworkPtr loadModel(const char *modelPath)
|
||||
{
|
||||
const auto path = std::string(modelPath);
|
||||
|
@ -172,6 +199,8 @@ private:
|
|||
options.SetReduceFp32ToFp16(fp16);
|
||||
options.SetShapeInferenceMethod(shapeInferenceMethod);
|
||||
options.SetAllowExpandedDims(allowExpandedDims);
|
||||
options.SetDebugToFileEnabled(false);
|
||||
options.SetProfilingEnabled(false);
|
||||
|
||||
BackendOptions gpuAcc("GpuAcc", {{"FastMathEnabled", fastMath}});
|
||||
if (cachedNetworkPath)
|
||||
|
@ -232,8 +261,8 @@ private:
|
|||
IRuntime *runtime;
|
||||
std::map<NetworkId, IOInfos> ioInfos;
|
||||
std::map<NetworkId, std::unique_ptr<std::mutex>> mutexes; // mutex per network to not execute the same the same network concurrently
|
||||
std::mutex mutex; // global mutex for load/unload calls to the runtime
|
||||
SpinLock spinLock; // fast spin lock to guard access to the ioInfos and mutexes maps
|
||||
std::mutex mutex; // global mutex for load/unload calls to the runtime
|
||||
SpinLock spinLock; // fast spin lock to guard access to the ioInfos and mutexes maps
|
||||
};
|
||||
|
||||
extern "C" void *init(int logLevel, int tuningLevel, const char *tuningFile)
|
||||
|
|
|
@ -120,6 +120,8 @@ class Ann(metaclass=_Singleton):
|
|||
save_cached_network,
|
||||
cached_network_path.encode() if cached_network_path is not None else None,
|
||||
)
|
||||
if net_id < 0:
|
||||
raise ValueError("Cannot load model!")
|
||||
|
||||
self.input_shapes[net_id] = tuple(
|
||||
self.shape(net_id, input=True, index=i) for i in range(self.tensors(net_id, input=True))
|
||||
|
|
|
@ -30,6 +30,8 @@ class Settings(BaseSettings):
|
|||
model_inter_op_threads: int = 0
|
||||
model_intra_op_threads: int = 0
|
||||
ann: bool = True
|
||||
ann_fp16_turbo: bool = False
|
||||
ann_tuning_level: int = 2
|
||||
preload: PreloadModelData | None = None
|
||||
|
||||
class Config:
|
||||
|
|
|
@ -20,12 +20,13 @@ class AnnSession:
|
|||
def __init__(self, model_path: Path, cache_dir: Path = settings.cache_folder) -> None:
|
||||
self.model_path = model_path
|
||||
self.cache_dir = cache_dir
|
||||
self.ann = Ann(tuning_level=3, tuning_file=(cache_dir / "gpu-tuning.ann").as_posix())
|
||||
self.ann = Ann(tuning_level=settings.ann_tuning_level, tuning_file=(cache_dir / "gpu-tuning.ann").as_posix())
|
||||
|
||||
log.info("Loading ANN model %s ...", model_path)
|
||||
self.model = self.ann.load(
|
||||
model_path.as_posix(),
|
||||
cached_network_path=model_path.with_suffix(".anncache").as_posix(),
|
||||
fp16=settings.ann_fp16_turbo,
|
||||
)
|
||||
log.info("Loaded ANN model with ID %d", self.model)
|
||||
|
||||
|
|
|
@ -268,9 +268,9 @@ class TestAnnSession:
|
|||
|
||||
AnnSession(model_path, cache_dir)
|
||||
|
||||
ann_session.assert_called_once_with(tuning_level=3, tuning_file=(cache_dir / "gpu-tuning.ann").as_posix())
|
||||
ann_session.assert_called_once_with(tuning_level=2, tuning_file=(cache_dir / "gpu-tuning.ann").as_posix())
|
||||
ann_session.return_value.load.assert_called_once_with(
|
||||
model_path.as_posix(), cached_network_path=model_path.with_suffix(".anncache").as_posix()
|
||||
model_path.as_posix(), cached_network_path=model_path.with_suffix(".anncache").as_posix(), fp16=False
|
||||
)
|
||||
info.assert_has_calls(
|
||||
[
|
||||
|
|
Loading…
Reference in a new issue