
feat(ml): ML on Rockchip NPUs (#15241)

This commit is contained in:
Yoni Yang 2025-03-18 00:04:08 +08:00 committed by GitHub
parent 1e184a70f1
commit 14c3b99c0f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
43 changed files with 2417 additions and 4726 deletions


@ -49,7 +49,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
suffix: ["", "-cuda", "-openvino", "-armnn"]
suffix: ["", "-cuda", "-openvino", "-armnn","-rknn"]
steps:
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
@ -129,6 +129,9 @@ jobs:
runner: ubuntu-24.04-arm
device: armnn
suffix: -armnn
- platforms: linux/arm64
device: rknn
suffix: -rknn
steps:
- name: Prepare
@ -454,4 +457,4 @@ jobs:
run: exit 1
- name: All jobs passed or skipped
if: ${{ !(contains(needs.*.result, 'failure')) }}
run: echo "All jobs passed or skipped" && echo "${{ toJSON(needs.*.result) }}"
run: echo "All jobs passed or skipped" && echo "${{ toJSON(needs.*.result) }}"


@ -95,12 +95,12 @@ services:
image: immich-machine-learning-dev:latest
# extends:
# file: hwaccel.ml.yml
# service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference
# service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl, rknn] for accelerated inference
build:
context: ../machine-learning
dockerfile: Dockerfile
args:
- DEVICE=cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference
- DEVICE=cpu # set to one of [armnn, cuda, openvino, openvino-wsl, rknn] for accelerated inference
ports:
- 3003:3003
volumes:


@ -38,12 +38,12 @@ services:
image: immich-machine-learning:latest
# extends:
# file: hwaccel.ml.yml
# service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference
# service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl, rknn] for accelerated inference
build:
context: ../machine-learning
dockerfile: Dockerfile
args:
- DEVICE=cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference
- DEVICE=cpu # set to one of [armnn, cuda, openvino, openvino-wsl, rknn] for accelerated inference
ports:
- 3003:3003
volumes:
@ -77,22 +77,12 @@ services:
- 5432:5432
healthcheck:
test: >-
pg_isready --dbname="$${POSTGRES_DB}" --username="$${POSTGRES_USER}" || exit 1;
Chksum="$$(psql --dbname="$${POSTGRES_DB}" --username="$${POSTGRES_USER}" --tuples-only --no-align
--command='SELECT COALESCE(SUM(checksum_failures), 0) FROM pg_stat_database')";
echo "checksum failure count is $$Chksum";
[ "$$Chksum" = '0' ] || exit 1
pg_isready --dbname="$${POSTGRES_DB}" --username="$${POSTGRES_USER}" || exit 1; Chksum="$$(psql --dbname="$${POSTGRES_DB}" --username="$${POSTGRES_USER}" --tuples-only --no-align --command='SELECT COALESCE(SUM(checksum_failures), 0) FROM pg_stat_database')"; echo "checksum failure count is $$Chksum"; [ "$$Chksum" = '0' ] || exit 1
interval: 5m
start_interval: 30s
start_period: 5m
command: >-
postgres
-c shared_preload_libraries=vectors.so
-c 'search_path="$$user", public, vectors'
-c logging_collector=on
-c max_wal_size=2GB
-c shared_buffers=512MB
-c wal_compression=on
postgres -c shared_preload_libraries=vectors.so -c 'search_path="$$user", public, vectors' -c logging_collector=on -c max_wal_size=2GB -c shared_buffers=512MB -c wal_compression=on
restart: always
# set IMMICH_TELEMETRY_INCLUDE=all in .env to enable metrics
@ -109,7 +99,7 @@ services:
# add data source for http://immich-prometheus:9090 to get started
immich-grafana:
container_name: immich_grafana
command: ['./run.sh', '-disable-reporting']
command: [ './run.sh', '-disable-reporting' ]
ports:
- 3000:3000
image: grafana/grafana:11.5.2-ubuntu@sha256:8b5858c447e06fd7a89006b562ba7bba7c4d5813600c7982374c41852adefaeb


@ -33,12 +33,12 @@ services:
immich-machine-learning:
container_name: immich_machine_learning
# For hardware acceleration, add one of -[armnn, cuda, openvino] to the image tag.
# For hardware acceleration, add one of -[armnn, cuda, openvino, rknn] to the image tag.
# Example tag: ${IMMICH_VERSION:-release}-cuda
image: ghcr.io/immich-app/immich-machine-learning:${IMMICH_VERSION:-release}
# extends: # uncomment this section for hardware acceleration - see https://immich.app/docs/features/ml-hardware-acceleration
# file: hwaccel.ml.yml
# service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference - use the `-wsl` version for WSL2 where applicable
# service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl, rknn] for accelerated inference - use the `-wsl` version for WSL2 where applicable
volumes:
- model-cache:/cache
env_file:
@ -67,22 +67,12 @@ services:
- ${DB_DATA_LOCATION}:/var/lib/postgresql/data
healthcheck:
test: >-
pg_isready --dbname="$${POSTGRES_DB}" --username="$${POSTGRES_USER}" || exit 1;
Chksum="$$(psql --dbname="$${POSTGRES_DB}" --username="$${POSTGRES_USER}" --tuples-only --no-align
--command='SELECT COALESCE(SUM(checksum_failures), 0) FROM pg_stat_database')";
echo "checksum failure count is $$Chksum";
[ "$$Chksum" = '0' ] || exit 1
pg_isready --dbname="$${POSTGRES_DB}" --username="$${POSTGRES_USER}" || exit 1; Chksum="$$(psql --dbname="$${POSTGRES_DB}" --username="$${POSTGRES_USER}" --tuples-only --no-align --command='SELECT COALESCE(SUM(checksum_failures), 0) FROM pg_stat_database')"; echo "checksum failure count is $$Chksum"; [ "$$Chksum" = '0' ] || exit 1
interval: 5m
start_interval: 30s
start_period: 5m
command: >-
postgres
-c shared_preload_libraries=vectors.so
-c 'search_path="$$user", public, vectors'
-c logging_collector=on
-c max_wal_size=2GB
-c shared_buffers=512MB
-c wal_compression=on
postgres -c shared_preload_libraries=vectors.so -c 'search_path="$$user", public, vectors' -c logging_collector=on -c max_wal_size=2GB -c shared_buffers=512MB -c wal_compression=on
restart: always
volumes:


@ -13,6 +13,13 @@ services:
volumes:
- /lib/firmware/mali_csffw.bin:/lib/firmware/mali_csffw.bin:ro # Mali firmware for your chipset (not always required depending on the driver)
- /usr/lib/libmali.so:/usr/lib/libmali.so:ro # Mali driver for your chipset (always required)
rknn:
security_opt:
- systempaths=unconfined
- apparmor=unconfined
devices:
- /dev/dri:/dev/dri
cpu: {}


@ -12,6 +12,7 @@ You do not need to redo any machine learning jobs after enabling hardware accele
- ARM NN (Mali)
- CUDA (NVIDIA GPUs with [compute capability](https://developer.nvidia.com/cuda-gpus) 5.2 or higher)
- OpenVINO (Intel GPUs such as Iris Xe and Arc)
- RKNN (Rockchip)
## Limitations
@ -19,6 +20,7 @@ You do not need to redo any machine learning jobs after enabling hardware accele
- Only Linux and Windows (through WSL2) servers are supported.
- ARM NN is only supported on devices with Mali GPUs. Other Arm devices are not supported.
- Some models may not be compatible with certain backends. CUDA is the most reliable.
- Search latency isn't improved by ARM NN due to model compatibility issues preventing its use. However, smart search jobs do make use of ARM NN.
## Prerequisites
@ -33,6 +35,7 @@ You do not need to redo any machine learning jobs after enabling hardware accele
- The `hwaccel.ml.yml` file assumes the path to it is `/usr/lib/libmali.so`, so update accordingly if it is elsewhere
- The `hwaccel.ml.yml` file assumes an additional file `/lib/firmware/mali_csffw.bin`, so update accordingly if your device's driver does not require this file
- Optional: Configure your `.env` file, see [environment variables](/docs/install/environment-variables) for ARM NN specific settings
- In particular, setting `MACHINE_LEARNING_ANN_FP16_TURBO` can significantly improve performance at the cost of very slightly lower accuracy
#### CUDA
@ -47,6 +50,16 @@ You do not need to redo any machine learning jobs after enabling hardware accele
- Ensure the server's kernel version is new enough to use the device for hardware acceleration.
- Expect higher RAM usage when using OpenVINO compared to CPU processing.
#### RKNN
- You must have a supported Rockchip SoC: only RK3566, RK3568, RK3576 and RK3588 are currently supported (the SoC check is sketched after this list).
- Make sure you have the appropriate Linux kernel driver installed
- This is usually pre-installed on the device vendor's Linux images
- RKNPU driver V0.9.8 or later must be available on the host server
- You may confirm this by running `cat /sys/kernel/debug/rknpu/version` to check the version
- Optional: Configure your `.env` file, see [environment variables](/docs/install/environment-variables) for RKNN specific settings
- In particular, setting `MACHINE_LEARNING_RKNN_THREADS` to 2 or 3 can _dramatically_ improve performance for RK3576 and RK3588 compared to the default of 1, at the expense of multiplying the amount of RAM each model uses by that amount.
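
The new `rknnpool.py` in this commit decides availability by matching the device tree compatible string against the supported SoC list. A minimal standalone sketch of that check, which you could run on the host to confirm support (it assumes the standard `/proc/device-tree/compatible` path used in the diff below):

```python
from pathlib import Path

RKNN_SUPPORTED_SOCS = ["rk3566", "rk3568", "rk3576", "rk3588"]


def get_soc(device_tree_path: str = "/proc/device-tree/compatible") -> str | None:
    # The compatible string embeds the SoC name, e.g. "rockchip,rk3588".
    compatible = Path(device_tree_path).read_text()
    return next((soc for soc in RKNN_SUPPORTED_SOCS if soc in compatible), None)


if __name__ == "__main__":
    soc = get_soc()
    print(f"Supported Rockchip SoC detected: {soc}" if soc else "No supported Rockchip SoC found")
```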
## Setup
1. If you do not already have it, download the latest [`hwaccel.ml.yml`][hw-file] file and ensure it's in the same folder as the `docker-compose.yml`.
@ -127,3 +140,12 @@ Note that you should increase job concurrencies to increase overall utilization
- If you encounter an error when a model is running, try a different model to see if the issue is model-specific.
- You may want to increase concurrency past the default for higher utilization. However, keep in mind that this will also increase VRAM consumption.
- Larger models benefit more from hardware acceleration, if you have the VRAM for them.
- Compared to ARM NN, RKNPU has:
- Wider model support (including for search, which ARM NN does not accelerate)
- Less heat generation
- Very slightly lower accuracy (RKNPU always uses FP16, while ARM NN by default uses higher precision FP32 unless `MACHINE_LEARNING_ANN_FP16_TURBO` is enabled)
- Varying speed (tested on RK3588):
- If `MACHINE_LEARNING_RKNN_THREADS` is at the default of 1, RKNPU will have substantially lower throughput for ML jobs than ARM NN in most cases, but similar latency (such as when searching)
- If `MACHINE_LEARNING_RKNN_THREADS` is set to 3, it will be somewhat faster than ARM NN at FP32, but somewhat slower than ARM NN if `MACHINE_LEARNING_ANN_FP16_TURBO` is enabled
- When other tasks also use the GPU (like transcoding), RKNPU has a significant advantage over ARM NN as it uses the otherwise idle NPU instead of competing for GPU usage
- Lower RAM usage if `MACHINE_LEARNING_RKNN_THREADS` is at the default of 1, but significantly higher if greater than 1 (which is necessary for it to fully utilize the NPU and hence be comparable in speed to ARM NN)


@ -170,6 +170,8 @@ Redis (Sentinel) URL example JSON before encoding:
| `MACHINE_LEARNING_MAX_BATCH_SIZE__FACIAL_RECOGNITION` | Set the maximum number of faces that will be processed at once by the facial recognition model | None (`1` if using OpenVINO) | machine learning |
| `MACHINE_LEARNING_PING_TIMEOUT` | How long (ms) to wait for a PING response when checking if an ML server is available | `2000` | server |
| `MACHINE_LEARNING_AVAILABILITY_BACKOFF_TIME` | How long to ignore ML servers that are offline before trying again | `30000` | server |
| `MACHINE_LEARNING_RKNN` | Enable RKNN hardware acceleration if supported | `True` | machine learning |
| `MACHINE_LEARNING_RKNN_THREADS` | How many RKNN runtime threads to spin up for inference | `1` | machine learning |
\*1: It is recommended to begin with this parameter when changing the concurrency levels of the machine learning service and then tune the other ones.


@ -1,5 +1,24 @@
*.zip
*.onnx
*.rknn
*.npy
*_attr__value
*.weight
*.bias
onnx__*
*in_proj_bias
*.proj
*.latent
*.pos_embed
vocab.txt
export/immich_model_exporter/models/**/README.md
tokenizer.json
tokenizer_config.json
special_tokens_map.json
preprocess_cfg.json
config.json
merges.txt
vocab.json
upload/
venv/
__pycache__/


@ -15,6 +15,8 @@ RUN mkdir /opt/armnn && \
cd /opt/ann && \
sh build.sh
FROM builder-cpu AS builder-rknn
FROM builder-${DEVICE} AS builder
ARG DEVICE
@ -77,6 +79,10 @@ COPY --from=builder-armnn \
/opt/ann/build.sh \
/opt/armnn/
FROM prod-cpu AS prod-rknn
ADD --checksum=sha256:73993ed4b440460825f21611731564503cc1d5a0c123746477da6cd574f34885 https://github.com/airockchip/rknn-toolkit2/raw/refs/tags/v2.3.0/rknpu2/runtime/Linux/librknn_api/aarch64/librknnrt.so /usr/lib/
FROM prod-${DEVICE} AS prod
ARG DEVICE
@ -123,4 +129,4 @@ ENV IMMICH_SOURCE_URL=https://github.com/immich-app/immich/commit/${BUILD_SOURCE
ENTRYPOINT ["tini", "--"]
CMD ["./start.sh"]
HEALTHCHECK CMD python3 healthcheck.py
HEALTHCHECK CMD python3 healthcheck.py


@ -64,6 +64,8 @@ class Settings(BaseSettings):
ann: bool = True
ann_fp16_turbo: bool = False
ann_tuning_level: int = 2
rknn: bool = True
rknn_threads: int = 1
preload: PreloadModelData | None = None
max_batch_size: MaxBatchSize | None = None
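
These two fields presumably map to the `MACHINE_LEARNING_RKNN` and `MACHINE_LEARNING_RKNN_THREADS` variables documented earlier. A standalone pydantic-settings sketch of that wiring (the `MACHINE_LEARNING_` env prefix is an assumption inferred from the variable names; Immich's real `Settings` class configures its environment handling elsewhere):

```python
from pydantic_settings import BaseSettings, SettingsConfigDict


class MlSettings(BaseSettings):
    # Assumed prefix, inferred from the documented variable names.
    model_config = SettingsConfigDict(env_prefix="MACHINE_LEARNING_")

    rknn: bool = True        # MACHINE_LEARNING_RKNN
    rknn_threads: int = 1    # MACHINE_LEARNING_RKNN_THREADS


# With MACHINE_LEARNING_RKNN_THREADS=3 in the environment:
# MlSettings().rknn_threads == 3
```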


@ -136,6 +136,12 @@ def ann_session() -> Iterator[mock.Mock]:
yield mocked
@pytest.fixture(scope="function")
def rknn_session() -> Iterator[mock.Mock]:
with mock.patch("app.sessions.rknn.RknnPoolExecutor") as mocked:
yield mocked
@pytest.fixture(scope="function")
def rmtree() -> Iterator[mock.Mock]:
with mock.patch("app.models.base.rmtree", autospec=True) as mocked:


@ -226,9 +226,9 @@ async def load(model: InferenceModel) -> InferenceModel:
except FileNotFoundError as e:
if model.model_format == ModelFormat.ONNX:
raise e
log.exception(e)
log.warning(
f"{model.model_format.upper()} is available, but model '{model.model_name}' does not support it."
f"{model.model_format.upper()} is available, but model '{model.model_name}' does not support it.",
exc_info=e,
)
model.model_format = ModelFormat.ONNX
model.load()


@ -8,6 +8,7 @@ from typing import Any, ClassVar
from huggingface_hub import snapshot_download
import ann.ann
import app.sessions.rknn as rknn
from app.sessions.ort import OrtSession
from ..config import clean_name, log, settings
@ -66,12 +67,17 @@ class InferenceModel(ABC):
pass
def _download(self) -> None:
ignore_patterns = [] if self.model_format == ModelFormat.ARMNN else ["*.armnn"]
ignored_patterns: dict[ModelFormat, list[str]] = {
ModelFormat.ONNX: ["*.armnn", "*.rknn"],
ModelFormat.ARMNN: ["*.rknn"],
ModelFormat.RKNN: ["*.armnn"],
}
snapshot_download(
f"immich-app/{clean_name(self.model_name)}",
cache_dir=self.cache_dir,
local_dir=self.cache_dir,
ignore_patterns=ignore_patterns,
ignore_patterns=ignored_patterns.get(self.model_format, []),
)
def _load(self) -> ModelSession:
@ -108,17 +114,25 @@ class InferenceModel(ABC):
session: ModelSession = AnnSession(model_path)
case ".onnx":
session = OrtSession(model_path)
case ".rknn":
session = rknn.RknnSession(model_path)
case _:
raise ValueError(f"Unsupported model file type: {model_path.suffix}")
return session
def model_path_for_format(self, model_format: ModelFormat) -> Path:
model_path_prefix = rknn.model_prefix if model_format == ModelFormat.RKNN else None
if model_path_prefix:
return self.model_dir / model_path_prefix / f"model.{model_format}"
return self.model_dir / f"model.{model_format}"
@property
def model_dir(self) -> Path:
return self.cache_dir / self.model_type.value
@property
def model_path(self) -> Path:
return self.model_dir / f"model.{self.model_format}"
return self.model_path_for_format(self.model_format)
@property
def model_task(self) -> ModelTask:
@ -155,4 +169,9 @@ class InferenceModel(ABC):
@property
def _model_format_default(self) -> ModelFormat:
return ModelFormat.ARMNN if ann.ann.is_available and settings.ann else ModelFormat.ONNX
if rknn.is_available:
return ModelFormat.RKNN
elif ann.ann.is_available and settings.ann:
return ModelFormat.ARMNN
else:
return ModelFormat.ONNX
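
The practical effect: ONNX and ARM NN weights stay at the top of the model directory, while RKNN weights resolve under a per-SoC `rknpu/<soc>/` prefix. A rough illustration (the cache layout shown is hypothetical, not taken from the diff):

```python
from pathlib import Path

# Assuming soc_name == "rk3588", so model_prefix == Path("rknpu") / "rk3588".
model_dir = Path("/cache/clip/ViT-B-32__openai/textual")   # hypothetical model_dir
onnx_path = model_dir / "model.onnx"                        # ModelFormat.ONNX
armnn_path = model_dir / "model.armnn"                      # ModelFormat.ARMNN
rknn_path = model_dir / "rknpu" / "rk3588" / "model.rknn"   # ModelFormat.RKNN adds the SoC prefix
```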


@ -44,6 +44,18 @@ _OPENCLIP_MODELS = {
"nllb-clip-base-siglip__v1",
"nllb-clip-large-siglip__mrl",
"nllb-clip-large-siglip__v1",
"ViT-B-16-SigLIP2__webli",
"ViT-B-32-SigLIP2-256__webli",
"ViT-L-16-SigLIP2-256__webli",
"ViT-L-16-SigLIP2-384__webli",
"ViT-L-16-SigLIP2-512__webli",
"ViT-SO400M-14-SigLIP2-378__webli",
"ViT-SO400M-14-SigLIP2__webli",
"ViT-SO400M-16-SigLIP2-256__webli",
"ViT-SO400M-16-SigLIP2-384__webli",
"ViT-SO400M-16-SigLIP2-512__webli",
"ViT-gopt-16-SigLIP2-256__webli",
"ViT-gopt-16-SigLIP2-384__webli",
}
@ -65,6 +77,9 @@ _INSIGHTFACE_MODELS = {
SUPPORTED_PROVIDERS = ["CUDAExecutionProvider", "OpenVINOExecutionProvider", "CPUExecutionProvider"]
RKNN_SUPPORTED_SOCS = ["rk3566", "rk3568", "rk3576", "rk3588"]
RKNN_COREMASK_SUPPORTED_SOCS = ["rk3576", "rk3588"]
def get_model_source(model_name: str) -> ModelSource | None:
cleaned_name = clean_name(model_name)


@ -31,7 +31,7 @@ class FaceRecognizer(InferenceModel):
self._add_batch_axis(self.model_path)
session = self._make_session(self.model_path)
self.model = ArcFaceONNX(
self.model_path.with_suffix(".onnx").as_posix(),
self.model_path_for_format(ModelFormat.ONNX).as_posix(),
session=session,
)
return session


@ -35,6 +35,7 @@ class ModelType(StrEnum):
class ModelFormat(StrEnum):
ARMNN = "armnn"
ONNX = "onnx"
RKNN = "rknn"
class ModelSource(StrEnum):


@ -0,0 +1,76 @@
from __future__ import annotations
from pathlib import Path
from typing import Any, NamedTuple
import numpy as np
from numpy.typing import NDArray
from app.config import log, settings
from app.schemas import SessionNode
from .rknnpool import RknnPoolExecutor, is_available, soc_name
is_available = is_available and settings.rknn
model_prefix = Path("rknpu") / soc_name if is_available and soc_name is not None else None
def run_inference(rknn_lite: Any, input: list[NDArray[np.float32]]) -> list[NDArray[np.float32]]:
outputs: list[NDArray[np.float32]] = rknn_lite.inference(inputs=input, data_format="nchw")
return outputs
input_output_mapping: dict[str, dict[str, Any]] = {
"detection": {
"input": {"norm_tensor:0": (1, 3, 640, 640)},
"output": {
"norm_tensor:1": (12800, 1),
"norm_tensor:2": (3200, 1),
"norm_tensor:3": (800, 1),
"norm_tensor:4": (12800, 4),
"norm_tensor:5": (3200, 4),
"norm_tensor:6": (800, 4),
"norm_tensor:7": (12800, 10),
"norm_tensor:8": (3200, 10),
"norm_tensor:9": (800, 10),
},
},
"recognition": {"input": {"norm_tensor:0": (1, 3, 112, 112)}, "output": {"norm_tensor:1": (1, 512)}},
}
class RknnSession:
def __init__(self, model_path: Path) -> None:
self.model_type = "detection" if "detection" in model_path.parts else "recognition"
self.tpe = settings.rknn_threads
log.info(f"Loading RKNN model from {model_path} with {self.tpe} threads.")
self.rknnpool = RknnPoolExecutor(model_path=model_path.as_posix(), tpes=self.tpe, func=run_inference)
log.info(f"Loaded RKNN model from {model_path} with {self.tpe} threads.")
def get_inputs(self) -> list[SessionNode]:
return [RknnNode(name=k, shape=v) for k, v in input_output_mapping[self.model_type]["input"].items()]
def get_outputs(self) -> list[SessionNode]:
return [RknnNode(name=k, shape=v) for k, v in input_output_mapping[self.model_type]["output"].items()]
def run(
self,
output_names: list[str] | None,
input_feed: dict[str, NDArray[np.float32]] | dict[str, NDArray[np.int32]],
run_options: Any = None,
) -> list[NDArray[np.float32]]:
input_data: list[NDArray[np.float32]] = [np.ascontiguousarray(v) for v in input_feed.values()]
self.rknnpool.put(input_data)
res = self.rknnpool.get()
if res is None:
raise RuntimeError("RKNN inference failed!")
return res
class RknnNode(NamedTuple):
name: str | None
shape: tuple[int, ...]
__all__ = ["RknnSession", "RknnNode", "is_available", "soc_name", "model_prefix"]
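
A rough usage sketch for the new session class (not part of the commit; the model path is hypothetical, and a converted `model.rknn` plus the RKNN runtime are assumed to be present):

```python
from pathlib import Path

import numpy as np

from app.sessions.rknn import RknnSession

# "recognition" in the path selects the (1, 3, 112, 112) input mapping defined above.
session = RknnSession(Path("/cache/facial-recognition/buffalo_l/recognition/rknpu/rk3588/model.rknn"))
face = np.random.rand(1, 3, 112, 112).astype(np.float32)
outputs = session.run(None, {"norm_tensor:0": face})
print(outputs[0].shape)  # expected (1, 512) per the output mapping above
```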


@ -0,0 +1,91 @@
# This code is from leafqycc/rknn-multi-threaded
# Following Apache License 2.0
import logging
from concurrent.futures import Future, ThreadPoolExecutor
from pathlib import Path
from queue import Queue
from typing import Callable
import numpy as np
from numpy.typing import NDArray
from app.config import log
from app.models.constants import RKNN_COREMASK_SUPPORTED_SOCS, RKNN_SUPPORTED_SOCS
def get_soc(device_tree_path: Path | str) -> str | None:
try:
with Path(device_tree_path).open() as f:
device_compatible_str = f.read()
for soc in RKNN_SUPPORTED_SOCS:
if soc in device_compatible_str:
return soc
log.warning("Device is not supported for RKNN")
except OSError as e:
log.warning(f"Could not read {device_tree_path}. Reason: %s", e)
return None
soc_name = None
is_available = False
try:
from rknnlite.api import RKNNLite
soc_name = get_soc("/proc/device-tree/compatible")
is_available = soc_name is not None
except ImportError:
log.debug("RKNN is not available")
def init_rknn(model_path: str) -> "RKNNLite":
if not is_available:
raise RuntimeError("rknn is not available!")
rknn_lite = RKNNLite()
rknn_lite.rknn_log.logger.setLevel(logging.ERROR)
ret = rknn_lite.load_rknn(model_path)
if ret != 0:
raise RuntimeError("Failed to load RKNN model")
if soc_name in RKNN_COREMASK_SUPPORTED_SOCS:
ret = rknn_lite.init_runtime(core_mask=RKNNLite.NPU_CORE_AUTO)
else:
ret = rknn_lite.init_runtime() # Please do not set this parameter on other platforms.
if ret != 0:
raise RuntimeError("Failed to inititalize RKNN runtime environment")
return rknn_lite
class RknnPoolExecutor:
def __init__(
self,
model_path: str,
tpes: int,
func: Callable[["RKNNLite", list[NDArray[np.float32]]], list[NDArray[np.float32]]],
) -> None:
self.tpes = tpes
self.queue: Queue[Future[list[NDArray[np.float32]]]] = Queue()
self.rknn_pool = [init_rknn(model_path) for _ in range(tpes)]
self.pool = ThreadPoolExecutor(max_workers=tpes)
self.func = func
self.num = 0
def put(self, inputs: list[NDArray[np.float32]]) -> None:
self.queue.put(self.pool.submit(self.func, self.rknn_pool[self.num % self.tpes], inputs))
self.num += 1
def get(self) -> list[NDArray[np.float32]] | None:
if self.queue.empty():
return None
fut = self.queue.get()
return fut.result()
def release(self) -> None:
self.pool.shutdown()
for rknn_lite in self.rknn_pool:
rknn_lite.release()
def __del__(self) -> None:
self.release()
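
The pool round-robins submissions across `tpes` independently loaded `RKNNLite` instances (`self.rknn_pool[self.num % self.tpes]`), which is why raising `MACHINE_LEARNING_RKNN_THREADS` multiplies per-model RAM usage. A minimal usage sketch (hypothetical model path; assumes a supported SoC and the `rknnlite` runtime):

```python
import numpy as np

from app.sessions.rknn import run_inference
from app.sessions.rknnpool import RknnPoolExecutor

pool = RknnPoolExecutor(model_path="/models/recognition/model.rknn", tpes=2, func=run_inference)
try:
    pool.put([np.random.rand(1, 3, 112, 112).astype(np.float32)])  # submit one batch
    result = pool.get()  # blocks on the matching Future; returns None if nothing was queued
finally:
    pool.release()  # shut down the executor and release every RKNNLite instance
```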


@ -25,6 +25,7 @@ from app.models.facial_recognition.detection import FaceDetector
from app.models.facial_recognition.recognition import FaceRecognizer
from app.sessions.ann import AnnSession
from app.sessions.ort import OrtSession
from app.sessions.rknn import RknnSession, run_inference
from .config import Settings, settings
from .models.base import InferenceModel
@ -69,6 +70,14 @@ class TestBase:
assert encoder.model_format == ModelFormat.ARMNN
def test_sets_default_model_format_to_rknn_if_available(self, mocker: MockerFixture) -> None:
mocker.patch.object(settings, "rknn", True)
mocker.patch("app.sessions.rknn.is_available", True)
encoder = OpenClipTextualEncoder("ViT-B-32__openai")
assert encoder.model_format == ModelFormat.RKNN
def test_casts_cache_dir_string_to_path(self) -> None:
cache_dir = "/test_cache"
encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir=cache_dir)
@ -125,7 +134,7 @@ class TestBase:
"immich-app/ViT-B-32__openai",
cache_dir=encoder.cache_dir,
local_dir=encoder.cache_dir,
ignore_patterns=["*.armnn"],
ignore_patterns=["*.armnn", "*.rknn"],
)
def test_download_downloads_armnn_if_preferred_format(self, snapshot_download: mock.Mock) -> None:
@ -136,7 +145,18 @@ class TestBase:
"immich-app/ViT-B-32__openai",
cache_dir=encoder.cache_dir,
local_dir=encoder.cache_dir,
ignore_patterns=[],
ignore_patterns=["*.rknn"],
)
def test_download_downloads_rknn_if_preferred_format(self, snapshot_download: mock.Mock) -> None:
encoder = OpenClipTextualEncoder("ViT-B-32__openai", model_format=ModelFormat.RKNN)
encoder.download()
snapshot_download.assert_called_once_with(
"immich-app/ViT-B-32__openai",
cache_dir=encoder.cache_dir,
local_dir=encoder.cache_dir,
ignore_patterns=["*.armnn"],
)
def test_throws_exception_if_model_path_does_not_exist(
@ -328,6 +348,33 @@ class TestAnnSession:
np_spy.assert_has_calls([mock.call(input1), mock.call(input2)])
class TestRknnSession:
def test_creates_rknn_session(self, rknn_session: mock.Mock, info: mock.Mock, mocker: MockerFixture) -> None:
model_path = mock.MagicMock(spec=Path)
tpe = 1
mocker.patch("app.sessions.rknn.soc_name", "rk3566")
mocker.patch("app.sessions.rknn.is_available", True)
RknnSession(model_path)
rknn_session.assert_called_once_with(model_path=model_path.as_posix(), tpes=tpe, func=run_inference)
info.assert_has_calls([mock.call(f"Loaded RKNN model from {model_path} with {tpe} threads.")])
def test_run_rknn(self, rknn_session: mock.Mock, mocker: MockerFixture) -> None:
rknn_session.return_value.load.return_value = 123
np_spy = mocker.spy(np, "ascontiguousarray")
mocker.patch("app.sessions.rknn.soc_name", "rk3566")
session = RknnSession(Path("ViT-B-32__openai"))
[input1, input2] = [np.random.rand(1, 3, 224, 224).astype(np.float32) for _ in range(2)]
input_feed = {"input.1": input1, "input.2": input2}
session.run(None, input_feed)
rknn_session.return_value.put.assert_called_once_with([input1, input2])
assert np_spy.call_count == 2
np_spy.assert_has_calls([mock.call(input1), mock.call(input2)])
class TestCLIP:
embedding = np.random.rand(512).astype(np.float32)
cache_dir = Path("test_cache")
@ -829,9 +876,7 @@ class TestLoad:
mock_model.clear_cache.assert_not_called()
mock_model.load.assert_not_called()
async def test_falls_back_to_onnx_if_other_format_does_not_exist(
self, exception: mock.Mock, warning: mock.Mock
) -> None:
async def test_falls_back_to_onnx_if_other_format_does_not_exist(self, warning: mock.Mock) -> None:
mock_model = mock.Mock(spec=InferenceModel)
mock_model.model_name = "test_model_name"
mock_model.model_type = ModelType.VISUAL
@ -846,8 +891,9 @@ class TestLoad:
mock_model.clear_cache.assert_not_called()
assert mock_model.load.call_count == 2
exception.assert_called_once_with(error)
warning.assert_called_once_with("ARMNN is available, but model 'test_model_name' does not support it.")
warning.assert_called_once_with(
"ARMNN is available, but model 'test_model_name' does not support it.", exc_info=error
)
mock_model.model_format = ModelFormat.ONNX


@ -0,0 +1 @@
3.12


@ -1,20 +0,0 @@
FROM mambaorg/micromamba:bookworm-slim@sha256:e3797091302382ea841498bc93a7b0a50f7c1448333d5e946d2d1608d0c5f43d AS builder
ENV TRANSFORMERS_CACHE=/cache \
PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PATH="/opt/venv/bin:$PATH" \
PYTHONPATH=/usr/src
COPY --chown=$MAMBA_USER:$MAMBA_USER conda-lock.yml /tmp/conda-lock.yml
RUN micromamba install -y -n base -f /tmp/conda-lock.yml && \
micromamba remove -y -n base cxx-compiler && \
micromamba clean --all --yes
WORKDIR /usr/src/app
COPY --chown=$MAMBA_USER:$MAMBA_USER start.sh .
COPY --chown=$MAMBA_USER:$MAMBA_USER app .
ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"]
CMD ["./start.sh"]

File diff suppressed because it is too large.


@ -1,15 +0,0 @@
name: base
channels:
- conda-forge
platforms:
- linux-64
- linux-aarch64
dependencies:
- black
- conda-lock
- mypy
- pytest
- pytest-cov
- pytest-mock
- ruff
category: dev


@ -1,25 +0,0 @@
name: base
channels:
- conda-forge
- nvidia
- pytorch
platforms:
- linux-64
dependencies:
- cxx-compiler
- onnx==1.*
- onnxruntime==1.*
- open-clip-torch==2.*
- orjson==3.*
- pip
- python==3.11.*
- pytorch>=2.3
- rich==13.*
- safetensors==0.*
- setuptools==68.*
- torchvision
- transformers==4.*
- pip:
- multilingual-clip
- onnxsim
category: main


@ -0,0 +1,98 @@
from pathlib import Path
import typer
from tenacity import retry, stop_after_attempt, wait_fixed
from typing_extensions import Annotated
from .exporters.constants import DELETE_PATTERNS, SOURCE_TO_METADATA, ModelSource
from .exporters.onnx import export as onnx_export
from .exporters.rknn import export as rknn_export
app = typer.Typer(pretty_exceptions_show_locals=False)
def generate_readme(model_name: str, model_source: ModelSource) -> str:
(name, link, type) = SOURCE_TO_METADATA[model_source]
match model_source:
case ModelSource.MCLIP:
tags = ["immich", "clip", "multilingual"]
case ModelSource.OPENCLIP:
tags = ["immich", "clip"]
lowered = model_name.lower()
if "xlm" in lowered or "nllb" in lowered:
tags.append("multilingual")
case ModelSource.INSIGHTFACE:
tags = ["immich", "facial-recognition"]
case _:
raise ValueError(f"Unsupported model source {model_source}")
return f"""---
tags:
{" - " + "\n - ".join(tags)}
---
# Model Description
This repo contains ONNX exports for the associated {type} model by {name}. See the [{name}]({link}) repo for more info.
This repo is specifically intended for use with [Immich](https://immich.app/), a self-hosted photo library.
"""
@app.command()
def main(
model_name: str,
model_source: ModelSource,
output_dir: Path = Path("./models"),
no_cache: bool = False,
hf_organization: str = "immich-app",
hf_auth_token: Annotated[str | None, typer.Option(envvar="HF_AUTH_TOKEN")] = None,
) -> None:
hf_model_name = model_name.split("/")[-1]
hf_model_name = hf_model_name.replace("xlm-roberta-large", "XLM-Roberta-Large")
hf_model_name = hf_model_name.replace("xlm-roberta-base", "XLM-Roberta-Base")
output_dir = output_dir / hf_model_name
match model_source:
case ModelSource.MCLIP | ModelSource.OPENCLIP:
output_dir.mkdir(parents=True, exist_ok=True)
onnx_export(model_name, model_source, output_dir, no_cache=no_cache)
case ModelSource.INSIGHTFACE:
from huggingface_hub import snapshot_download
# TODO: start from insightface dump instead of downloading from HF
snapshot_download(f"immich-app/{hf_model_name}", local_dir=output_dir)
case _:
raise ValueError(f"Unsupported model source {model_source}")
try:
rknn_export(output_dir, no_cache=no_cache)
except Exception as e:
print(f"Failed to export model {model_name} to rknn: {e}")
(output_dir / "rknpu").unlink(missing_ok=True)
readme_path = output_dir / "README.md"
if no_cache or not readme_path.exists():
with open(readme_path, "w") as f:
f.write(generate_readme(model_name, model_source))
if hf_auth_token is not None:
from huggingface_hub import create_repo, upload_folder
repo_id = f"{hf_organization}/{hf_model_name}"
@retry(stop=stop_after_attempt(5), wait=wait_fixed(5))
def upload_model() -> None:
create_repo(repo_id, exist_ok=True, token=hf_auth_token)
upload_folder(
repo_id=repo_id,
folder_path=output_dir,
# remote repo files to be deleted before uploading
# deletion is in the same commit as the upload, so it's atomic
delete_patterns=DELETE_PATTERNS,
token=hf_auth_token,
)
upload_model()
if __name__ == "__main__":
typer.run(main)


@ -0,0 +1,42 @@
from enum import StrEnum
from typing import NamedTuple
class ModelSource(StrEnum):
INSIGHTFACE = "insightface"
MCLIP = "mclip"
OPENCLIP = "openclip"
class SourceMetadata(NamedTuple):
name: str
link: str
type: str
SOURCE_TO_METADATA = {
ModelSource.MCLIP: SourceMetadata("M-CLIP", "https://huggingface.co/M-CLIP", "CLIP"),
ModelSource.OPENCLIP: SourceMetadata("OpenCLIP", "https://github.com/mlfoundations/open_clip", "CLIP"),
ModelSource.INSIGHTFACE: SourceMetadata(
"InsightFace", "https://github.com/deepinsight/insightface/tree/master", "facial recognition"
),
}
RKNN_SOCS = ["rk3566", "rk3568", "rk3576", "rk3588"]
# glob to delete old UUID blobs when reuploading models
_uuid_char = "[a-fA-F0-9]"
_uuid_glob = _uuid_char * 8 + "-" + _uuid_char * 4 + "-" + _uuid_char * 4 + "-" + _uuid_char * 4 + "-" + _uuid_char * 12
DELETE_PATTERNS = [
"**/*onnx*",
"**/Constant*",
"**/*.weight",
"**/*.bias",
"**/*.proj",
"**/*in_proj_bias",
"**/*.npy",
"**/*.latent",
"**/*.pos_embed",
f"**/{_uuid_glob}",
]


@ -0,0 +1,20 @@
from pathlib import Path
from ..constants import ModelSource
from .models import mclip, openclip
def export(
model_name: str, model_source: ModelSource, output_dir: Path, opset_version: int = 19, no_cache: bool = False
) -> None:
visual_dir = output_dir / "visual"
textual_dir = output_dir / "textual"
match model_source:
case ModelSource.MCLIP:
mclip.to_onnx(model_name, opset_version, visual_dir, textual_dir, no_cache=no_cache)
case ModelSource.OPENCLIP:
name, _, pretrained = model_name.partition("__")
config = openclip.OpenCLIPModelConfig(name, pretrained)
openclip.to_onnx(config, opset_version, visual_dir, textual_dir, no_cache=no_cache)
case _:
raise ValueError(f"Unsupported model source {model_source}")


@ -1,11 +1,6 @@
import os
import tempfile
import warnings
from pathlib import Path
import torch
from multilingual_clip.pt_multilingual_clip import MultilingualCLIP
from transformers import AutoTokenizer
from typing import Any
from .openclip import OpenCLIPModelConfig
from .openclip import to_onnx as openclip_to_onnx
@ -21,25 +16,40 @@ _MCLIP_TO_OPENCLIP = {
def to_onnx(
model_name: str,
opset_version: int,
output_dir_visual: Path | str,
output_dir_textual: Path | str,
no_cache: bool = False,
) -> tuple[Path, Path]:
textual_path = get_model_path(output_dir_textual)
with tempfile.TemporaryDirectory() as tmpdir:
model = MultilingualCLIP.from_pretrained(model_name, cache_dir=os.environ.get("CACHE_DIR", tmpdir))
if no_cache or not textual_path.exists():
import torch
from multilingual_clip.pt_multilingual_clip import MultilingualCLIP
from transformers import AutoTokenizer
torch.backends.mha.set_fastpath_enabled(False)
model = MultilingualCLIP.from_pretrained(model_name)
AutoTokenizer.from_pretrained(model_name).save_pretrained(output_dir_textual)
model.eval()
for param in model.parameters():
param.requires_grad_(False)
export_text_encoder(model, textual_path)
visual_path, _ = openclip_to_onnx(_MCLIP_TO_OPENCLIP[model_name], output_dir_visual)
assert visual_path is not None, "Visual model export failed"
_export_text_encoder(model, textual_path, opset_version)
else:
print(f"Model {textual_path} already exists, skipping")
visual_path, _ = openclip_to_onnx(
_MCLIP_TO_OPENCLIP[model_name], opset_version, output_dir_visual, no_cache=no_cache
)
assert visual_path is not None, "Visual model export failed"
return visual_path, textual_path
def export_text_encoder(model: MultilingualCLIP, output_path: Path | str) -> None:
def _export_text_encoder(model: Any, output_path: Path | str, opset_version: int) -> None:
import torch
from multilingual_clip.pt_multilingual_clip import MultilingualCLIP
output_path = Path(output_path)
def forward(self: MultilingualCLIP, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
@ -61,7 +71,7 @@ def export_text_encoder(model: MultilingualCLIP, output_path: Path | str) -> Non
output_path.as_posix(),
input_names=["input_ids", "attention_mask"],
output_names=["embedding"],
opset_version=17,
opset_version=opset_version,
# dynamic_axes={
# "input_ids": {0: "batch_size", 1: "sequence_length"},
# "attention_mask": {0: "batch_size", 1: "sequence_length"},


@ -0,0 +1,153 @@
import warnings
from dataclasses import dataclass
from functools import cached_property
from pathlib import Path
from typing import Any
from .util import get_model_path, save_config
@dataclass
class OpenCLIPModelConfig:
name: str
pretrained: str
@cached_property
def model_config(self) -> dict[str, Any]:
import open_clip
config: dict[str, Any] | None = open_clip.get_model_config(self.name)
if config is None:
raise ValueError(f"Unknown model {self.name}")
return config
@property
def image_size(self) -> int:
image_size: int = self.model_config["vision_cfg"]["image_size"]
return image_size
@property
def sequence_length(self) -> int:
context_length: int = self.model_config["text_cfg"].get("context_length", 77)
return context_length
def to_onnx(
model_cfg: OpenCLIPModelConfig,
opset_version: int,
output_dir_visual: Path | str | None = None,
output_dir_textual: Path | str | None = None,
no_cache: bool = False,
) -> tuple[Path | None, Path | None]:
visual_path = None
textual_path = None
if output_dir_visual is not None:
output_dir_visual = Path(output_dir_visual)
visual_path = get_model_path(output_dir_visual)
if output_dir_textual is not None:
output_dir_textual = Path(output_dir_textual)
textual_path = get_model_path(output_dir_textual)
if not no_cache and (
(textual_path is None or textual_path.exists()) and (visual_path is None or visual_path.exists())
):
print(f"Models {textual_path} and {visual_path} already exist, skipping")
return visual_path, textual_path
import open_clip
import torch
from transformers import AutoTokenizer
torch.backends.mha.set_fastpath_enabled(False)
model = open_clip.create_model(
model_cfg.name,
pretrained=model_cfg.pretrained,
jit=False,
require_pretrained=True,
)
text_vision_cfg = open_clip.get_model_config(model_cfg.name)
model.eval()
for param in model.parameters():
param.requires_grad_(False)
if visual_path is not None and output_dir_visual is not None:
if no_cache or not visual_path.exists():
save_config(
open_clip.get_model_preprocess_cfg(model),
output_dir_visual / "preprocess_cfg.json",
)
save_config(text_vision_cfg, output_dir_visual.parent / "config.json")
_export_image_encoder(model, model_cfg, visual_path, opset_version)
else:
print(f"Model {visual_path} already exists, skipping")
if textual_path is not None and output_dir_textual is not None:
if no_cache or not textual_path.exists():
tokenizer_name = text_vision_cfg["text_cfg"].get("hf_tokenizer_name", "openai/clip-vit-base-patch32")
AutoTokenizer.from_pretrained(tokenizer_name).save_pretrained(output_dir_textual)
_export_text_encoder(model, model_cfg, textual_path, opset_version)
else:
print(f"Model {textual_path} already exists, skipping")
return visual_path, textual_path
def _export_image_encoder(
model: Any, model_cfg: OpenCLIPModelConfig, output_path: Path | str, opset_version: int
) -> None:
import torch
output_path = Path(output_path)
def encode_image(image: torch.Tensor) -> torch.Tensor:
output = model.encode_image(image, normalize=True)
assert isinstance(output, torch.Tensor)
return output
model.forward = encode_image
args = (torch.randn(1, 3, model_cfg.image_size, model_cfg.image_size),)
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
torch.onnx.export(
model,
args,
output_path.as_posix(),
input_names=["image"],
output_names=["embedding"],
opset_version=opset_version,
# dynamic_axes={"image": {0: "batch_size"}},
)
def _export_text_encoder(
model: Any, model_cfg: OpenCLIPModelConfig, output_path: Path | str, opset_version: int
) -> None:
import torch
output_path = Path(output_path)
def encode_text(text: torch.Tensor) -> torch.Tensor:
output = model.encode_text(text, normalize=True)
assert isinstance(output, torch.Tensor)
return output
model.forward = encode_text
args = (torch.ones(1, model_cfg.sequence_length, dtype=torch.int32),)
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
torch.onnx.export(
model,
args,
output_path.as_posix(),
input_names=["text"],
output_names=["embedding"],
opset_version=opset_version,
# dynamic_axes={"text": {0: "batch_size"}},
)


@ -0,0 +1,96 @@
from pathlib import Path
from .constants import RKNN_SOCS
def _export_platform(
model_dir: Path,
target_platform: str,
inputs: list[str] | None = None,
input_size_list: list[list[int]] | None = None,
fuse_matmul_softmax_matmul_to_sdpa: bool = True,
no_cache: bool = False,
) -> None:
from rknn.api import RKNN
input_path = model_dir / "model.onnx"
output_path = model_dir / "rknpu" / target_platform / "model.rknn"
if not no_cache and output_path.exists():
print(f"Model {input_path} already exists at {output_path}, skipping")
return
print(f"Exporting model {input_path} to {output_path}")
rknn = RKNN(verbose=False)
rknn.config(
target_platform=target_platform,
disable_rules=["fuse_matmul_softmax_matmul_to_sdpa"] if not fuse_matmul_softmax_matmul_to_sdpa else [],
enable_flash_attention=False,
model_pruning=True,
)
ret = rknn.load_onnx(model=input_path.as_posix(), inputs=inputs, input_size_list=input_size_list)
if ret != 0:
raise RuntimeError("Load failed!")
ret = rknn.build(do_quantization=False)
if ret != 0:
raise RuntimeError("Build failed!")
output_path.parent.mkdir(parents=True, exist_ok=True)
ret = rknn.export_rknn(output_path.as_posix())
if ret != 0:
raise RuntimeError("Export rknn model failed!")
def _export_platforms(
model_dir: Path,
inputs: list[str] | None = None,
input_size_list: list[list[int]] | None = None,
no_cache: bool = False,
) -> None:
fuse_matmul_softmax_matmul_to_sdpa = True
for soc in RKNN_SOCS:
try:
_export_platform(
model_dir,
soc,
inputs=inputs,
input_size_list=input_size_list,
fuse_matmul_softmax_matmul_to_sdpa=fuse_matmul_softmax_matmul_to_sdpa,
no_cache=no_cache,
)
except Exception as e:
print(f"Failed to export model for {soc}: {e}")
if "inputs or 'outputs' must be set" in str(e):
print("Retrying without fuse_matmul_softmax_matmul_to_sdpa")
fuse_matmul_softmax_matmul_to_sdpa = False
_export_platform(
model_dir,
soc,
inputs=inputs,
input_size_list=input_size_list,
fuse_matmul_softmax_matmul_to_sdpa=fuse_matmul_softmax_matmul_to_sdpa,
no_cache=no_cache,
)
def export(model_dir: Path, no_cache: bool = False) -> None:
textual = model_dir / "textual"
visual = model_dir / "visual"
detection = model_dir / "detection"
recognition = model_dir / "recognition"
if textual.is_dir():
_export_platforms(textual, no_cache=no_cache)
if visual.is_dir():
_export_platforms(visual, no_cache=no_cache)
if detection.is_dir():
_export_platforms(detection, inputs=["input.1"], input_size_list=[[1, 3, 640, 640]], no_cache=no_cache)
if recognition.is_dir():
_export_platforms(recognition, inputs=["input.1"], input_size_list=[[1, 3, 112, 112]], no_cache=no_cache)
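
For every supported SoC the exporter writes `rknpu/<soc>/model.rknn` alongside the existing `model.onnx`. A sketch of calling it directly (the module path is assumed from the imports in `export.py` above; `rknn-toolkit2` must be installed and the ONNX exports must already exist):

```python
from pathlib import Path

from immich_model_exporter.exporters.rknn import export

# Hypothetical layout: models/ViT-B-32__openai/{textual,visual}/model.onnx already exported.
export(Path("models/ViT-B-32__openai"), no_cache=False)
# -> models/ViT-B-32__openai/textual/rknpu/rk3566/model.rknn ... rk3588/model.rknn,
#    and likewise under visual/.
```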


@ -0,0 +1,88 @@
import subprocess
from exporters.constants import ModelSource
mclip = [
"M-CLIP/LABSE-Vit-L-14",
"M-CLIP/XLM-Roberta-Large-Vit-B-16Plus",
"M-CLIP/XLM-Roberta-Large-Vit-B-32",
"M-CLIP/XLM-Roberta-Large-Vit-L-14",
]
openclip = [
"RN101__openai",
"RN101__yfcc15m",
"RN50__cc12m",
"RN50__openai",
"RN50__yfcc15m",
"RN50x16__openai",
"RN50x4__openai",
"RN50x64__openai",
"ViT-B-16-SigLIP-256__webli",
"ViT-B-16-SigLIP-384__webli",
"ViT-B-16-SigLIP-512__webli",
"ViT-B-16-SigLIP-i18n-256__webli",
"ViT-B-16-SigLIP2__webli",
"ViT-B-16-SigLIP__webli",
"ViT-B-16-plus-240__laion400m_e31",
"ViT-B-16-plus-240__laion400m_e32",
"ViT-B-16__laion400m_e31",
"ViT-B-16__laion400m_e32",
"ViT-B-16__openai",
"ViT-B-32-SigLIP2-256__webli",
"ViT-B-32__laion2b-s34b-b79k",
"ViT-B-32__laion2b_e16",
"ViT-B-32__laion400m_e31",
"ViT-B-32__laion400m_e32",
"ViT-B-32__openai",
"ViT-H-14-378-quickgelu__dfn5b",
"ViT-H-14-quickgelu__dfn5b",
"ViT-H-14__laion2b-s32b-b79k",
"ViT-L-14-336__openai",
"ViT-L-14-quickgelu__dfn2b",
"ViT-L-14__laion2b-s32b-b82k",
"ViT-L-14__laion400m_e31",
"ViT-L-14__laion400m_e32",
"ViT-L-14__openai",
"ViT-L-16-SigLIP-256__webli",
"ViT-L-16-SigLIP-384__webli",
"ViT-L-16-SigLIP2-256__webli",
"ViT-L-16-SigLIP2-384__webli",
"ViT-L-16-SigLIP2-512__webli",
"ViT-SO400M-14-SigLIP-384__webli",
"ViT-SO400M-14-SigLIP2-378__webli",
"ViT-SO400M-14-SigLIP2__webli",
"ViT-SO400M-16-SigLIP2-256__webli",
"ViT-SO400M-16-SigLIP2-384__webli",
"ViT-SO400M-16-SigLIP2-512__webli",
"ViT-gopt-16-SigLIP2-256__webli",
"ViT-gopt-16-SigLIP2-384__webli",
"nllb-clip-base-siglip__mrl",
"nllb-clip-base-siglip__v1",
"nllb-clip-large-siglip__mrl",
"nllb-clip-large-siglip__v1",
"xlm-roberta-base-ViT-B-32__laion5b_s13b_b90k",
"xlm-roberta-large-ViT-H-14__frozen_laion5b_s13b_b90k",
]
insightface = [
"antelopev2",
"buffalo_l",
"buffalo_m",
"buffalo_s",
]
def export_models(models: list[str], source: ModelSource) -> None:
for model in models:
try:
print(f"Exporting model {model}")
subprocess.check_call(["python", "-m", "immich_model_exporter.export", model, source])
except Exception as e:
print(f"Failed to export model {model}: {e}")
if __name__ == "__main__":
export_models(mclip, ModelSource.MCLIP)
export_models(openclip, ModelSource.OPENCLIP)
export_models(insightface, ModelSource.INSIGHTFACE)


@ -1,114 +0,0 @@
import os
import tempfile
import warnings
from dataclasses import dataclass, field
from pathlib import Path
import open_clip
import torch
from transformers import AutoTokenizer
from .util import get_model_path, save_config
@dataclass
class OpenCLIPModelConfig:
name: str
pretrained: str
image_size: int = field(init=False)
sequence_length: int = field(init=False)
def __post_init__(self) -> None:
open_clip_cfg = open_clip.get_model_config(self.name)
if open_clip_cfg is None:
raise ValueError(f"Unknown model {self.name}")
self.image_size = open_clip_cfg["vision_cfg"]["image_size"]
self.sequence_length = open_clip_cfg["text_cfg"].get("context_length", 77)
def to_onnx(
model_cfg: OpenCLIPModelConfig,
output_dir_visual: Path | str | None = None,
output_dir_textual: Path | str | None = None,
) -> tuple[Path | None, Path | None]:
visual_path = None
textual_path = None
with tempfile.TemporaryDirectory() as tmpdir:
model = open_clip.create_model(
model_cfg.name,
pretrained=model_cfg.pretrained,
jit=False,
cache_dir=os.environ.get("CACHE_DIR", tmpdir),
require_pretrained=True,
)
text_vision_cfg = open_clip.get_model_config(model_cfg.name)
model.eval()
for param in model.parameters():
param.requires_grad_(False)
if output_dir_visual is not None:
output_dir_visual = Path(output_dir_visual)
visual_path = get_model_path(output_dir_visual)
save_config(open_clip.get_model_preprocess_cfg(model), output_dir_visual / "preprocess_cfg.json")
save_config(text_vision_cfg, output_dir_visual.parent / "config.json")
export_image_encoder(model, model_cfg, visual_path)
if output_dir_textual is not None:
output_dir_textual = Path(output_dir_textual)
textual_path = get_model_path(output_dir_textual)
tokenizer_name = text_vision_cfg["text_cfg"].get("hf_tokenizer_name", "openai/clip-vit-base-patch32")
AutoTokenizer.from_pretrained(tokenizer_name).save_pretrained(output_dir_textual)
export_text_encoder(model, model_cfg, textual_path)
return visual_path, textual_path
def export_image_encoder(model: open_clip.CLIP, model_cfg: OpenCLIPModelConfig, output_path: Path | str) -> None:
output_path = Path(output_path)
def encode_image(image: torch.Tensor) -> torch.Tensor:
output = model.encode_image(image, normalize=True)
assert isinstance(output, torch.Tensor)
return output
args = (torch.randn(1, 3, model_cfg.image_size, model_cfg.image_size),)
traced = torch.jit.trace(encode_image, args) # type: ignore[no-untyped-call]
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
torch.onnx.export(
traced,
args,
output_path.as_posix(),
input_names=["image"],
output_names=["embedding"],
opset_version=17,
# dynamic_axes={"image": {0: "batch_size"}},
)
def export_text_encoder(model: open_clip.CLIP, model_cfg: OpenCLIPModelConfig, output_path: Path | str) -> None:
output_path = Path(output_path)
def encode_text(text: torch.Tensor) -> torch.Tensor:
output = model.encode_text(text, normalize=True)
assert isinstance(output, torch.Tensor)
return output
args = (torch.ones(1, model_cfg.sequence_length, dtype=torch.int32),)
traced = torch.jit.trace(encode_text, args) # type: ignore[no-untyped-call]
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
torch.onnx.export(
traced,
args,
output_path.as_posix(),
input_names=["text"],
output_names=["embedding"],
opset_version=17,
# dynamic_axes={"text": {0: "batch_size"}},
)


@ -1,49 +0,0 @@
from pathlib import Path
import onnx
import onnxruntime as ort
import onnxsim
def save_onnx(model: onnx.ModelProto, output_path: Path | str) -> None:
try:
onnx.save(model, output_path)
except ValueError as e:
if "The proto size is larger than the 2 GB limit." in str(e):
onnx.save(model, output_path, save_as_external_data=True, size_threshold=1_000_000)
else:
raise e
def optimize_onnxsim(model_path: Path | str, output_path: Path | str) -> None:
model_path = Path(model_path)
output_path = Path(output_path)
model = onnx.load(model_path.as_posix())
model, check = onnxsim.simplify(model)
assert check, "Simplified ONNX model could not be validated"
for file in model_path.parent.iterdir():
if file.name.startswith("Constant") or "onnx" in file.name or file.suffix == ".weight":
file.unlink()
save_onnx(model, output_path)
def optimize_ort(
model_path: Path | str,
output_path: Path | str,
level: ort.GraphOptimizationLevel = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
) -> None:
model_path = Path(model_path)
output_path = Path(output_path)
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = level
sess_options.optimized_model_filepath = output_path.as_posix()
ort.InferenceSession(model_path.as_posix(), providers=["CPUExecutionProvider"], sess_options=sess_options)
def optimize(model_path: Path | str) -> None:
model_path = Path(model_path)
optimize_ort(model_path, model_path)
optimize_onnxsim(model_path, model_path)


@ -0,0 +1,67 @@
[project]
name = "immich_model_exporter"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10, <4.0"
dependencies = [
"huggingface-hub>=0.29.3",
"multilingual-clip>=1.0.10",
"onnx>=1.14.1",
"onnxruntime>=1.16.0",
"open-clip-torch>=2.31.0",
"typer>=0.15.2",
"rknn-toolkit2>=2.3.0",
"transformers>=4.49.0",
"tenacity>=9.0.0",
]
[dependency-groups]
dev = ["black>=23.3.0", "mypy>=1.3.0", "ruff>=0.0.272"]
[tool.uv]
override-dependencies = [
"onnx>=1.16.0,<2",
"onnxruntime>=1.18.2,<2",
"torch>=2.4",
"torchvision>=0.21",
]
[tool.uv.sources]
torch = [{ index = "pytorch-cpu" }]
torchvision = [{ index = "pytorch-cpu" }]
[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true
[tool.hatch.build.targets.sdist]
include = ["immich_model_exporter"]
[tool.hatch.build.targets.wheel]
include = ["immich_model_exporter"]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.mypy]
python_version = "3.12"
follow_imports = "silent"
warn_redundant_casts = true
disallow_any_generics = true
check_untyped_defs = true
disallow_untyped_defs = true
ignore_missing_imports = true
[tool.ruff]
line-length = 120
target-version = "py312"
[tool.ruff.lint]
select = ["E", "F", "I"]
[tool.black]
line-length = 120
target-version = ['py312']


@ -1,113 +0,0 @@
import gc
import os
from pathlib import Path
from tempfile import TemporaryDirectory
import torch
from huggingface_hub import create_repo, upload_folder
from models import mclip, openclip
from models.optimize import optimize
from rich.progress import Progress
models = [
"M-CLIP/LABSE-Vit-L-14",
"M-CLIP/XLM-Roberta-Large-Vit-B-16Plus",
"M-CLIP/XLM-Roberta-Large-Vit-B-32",
"M-CLIP/XLM-Roberta-Large-Vit-L-14",
"RN101::openai",
"RN101::yfcc15m",
"RN50::cc12m",
"RN50::openai",
"RN50::yfcc15m",
"RN50x16::openai",
"RN50x4::openai",
"RN50x64::openai",
"ViT-B-16-SigLIP-256::webli",
"ViT-B-16-SigLIP-384::webli",
"ViT-B-16-SigLIP-512::webli",
"ViT-B-16-SigLIP-i18n-256::webli",
"ViT-B-16-SigLIP::webli",
"ViT-B-16-plus-240::laion400m_e31",
"ViT-B-16-plus-240::laion400m_e32",
"ViT-B-16::laion400m_e31",
"ViT-B-16::laion400m_e32",
"ViT-B-16::openai",
"ViT-B-32::laion2b-s34b-b79k",
"ViT-B-32::laion2b_e16",
"ViT-B-32::laion400m_e31",
"ViT-B-32::laion400m_e32",
"ViT-B-32::openai",
"ViT-H-14-378-quickgelu::dfn5b",
"ViT-H-14-quickgelu::dfn5b",
"ViT-H-14::laion2b-s32b-b79k",
"ViT-L-14-336::openai",
"ViT-L-14-quickgelu::dfn2b",
"ViT-L-14::laion2b-s32b-b82k",
"ViT-L-14::laion400m_e31",
"ViT-L-14::laion400m_e32",
"ViT-L-14::openai",
"ViT-L-16-SigLIP-256::webli",
"ViT-L-16-SigLIP-384::webli",
"ViT-SO400M-14-SigLIP-384::webli",
"ViT-g-14::laion2b-s12b-b42k",
"nllb-clip-base-siglip::mrl",
"nllb-clip-base-siglip::v1",
"nllb-clip-large-siglip::mrl",
"nllb-clip-large-siglip::v1",
"xlm-roberta-base-ViT-B-32::laion5b_s13b_b90k",
"xlm-roberta-large-ViT-H-14::frozen_laion5b_s13b_b90k",
]
# glob to delete old UUID blobs when reuploading models
uuid_char = "[a-fA-F0-9]"
uuid_glob = uuid_char * 8 + "-" + uuid_char * 4 + "-" + uuid_char * 4 + "-" + uuid_char * 4 + "-" + uuid_char * 12
# remote repo files to be deleted before uploading
# deletion is in the same commit as the upload, so it's atomic
delete_patterns = ["**/*onnx*", "**/Constant*", "**/*.weight", "**/*.bias", f"**/{uuid_glob}"]
with Progress() as progress:
task = progress.add_task("[green]Exporting models...", total=len(models))
token = os.environ.get("HF_AUTH_TOKEN")
torch.backends.mha.set_fastpath_enabled(False)
with TemporaryDirectory() as tmp:
tmpdir = Path(tmp)
for model in models:
model_name = model.split("/")[-1].replace("::", "__")
hf_model_name = model_name.replace("xlm-roberta-large", "XLM-Roberta-Large")
hf_model_name = model_name.replace("xlm-roberta-base", "XLM-Roberta-Base")
config_path = tmpdir / model_name / "config.json"
def export() -> None:
progress.update(task, description=f"[green]Exporting {hf_model_name}")
visual_dir = tmpdir / hf_model_name / "visual"
textual_dir = tmpdir / hf_model_name / "textual"
if model.startswith("M-CLIP"):
visual_path, textual_path = mclip.to_onnx(model, visual_dir, textual_dir)
else:
name, _, pretrained = model_name.partition("__")
config = openclip.OpenCLIPModelConfig(name, pretrained)
visual_path, textual_path = openclip.to_onnx(config, visual_dir, textual_dir)
progress.update(task, description=f"[green]Optimizing {hf_model_name} (visual)")
optimize(visual_path)
progress.update(task, description=f"[green]Optimizing {hf_model_name} (textual)")
optimize(textual_path)
gc.collect()
def upload() -> None:
progress.update(task, description=f"[yellow]Uploading {hf_model_name}")
repo_id = f"immich-app/{hf_model_name}"
create_repo(repo_id, exist_ok=True)
upload_folder(
repo_id=repo_id,
folder_path=tmpdir / hf_model_name,
delete_patterns=delete_patterns,
token=token,
)
export()
if token is not None:
upload()
progress.update(task, advance=1)

machine-learning/export/uv.lock (generated, new file, 1395 lines)

File diff suppressed because it is too large.


@ -51,6 +51,7 @@ cpu = ["onnxruntime>=1.15.0,<2"]
cuda = ["onnxruntime-gpu>=1.17.0,<2"]
openvino = ["onnxruntime-openvino>=1.17.1,<1.19.0"]
armnn = ["onnxruntime>=1.15.0,<2"]
rknn = ["onnxruntime>=1.15.0,<2", "rknn-toolkit-lite2>=2.3.0,<3"]
[tool.uv]
compile-bytecode = true


@ -1109,6 +1109,10 @@ cuda = [
openvino = [
{ name = "onnxruntime-openvino" },
]
rknn = [
{ name = "onnxruntime" },
{ name = "rknn-toolkit-lite2" },
]
[package.dev-dependencies]
dev = [
@ -1162,6 +1166,7 @@ requires-dist = [
{ name = "insightface", specifier = ">=0.7.3,<1.0" },
{ name = "onnxruntime", marker = "extra == 'armnn'", specifier = ">=1.15.0,<2" },
{ name = "onnxruntime", marker = "extra == 'cpu'", specifier = ">=1.15.0,<2" },
{ name = "onnxruntime", marker = "extra == 'rknn'", specifier = ">=1.15.0,<2" },
{ name = "onnxruntime-gpu", marker = "extra == 'cuda'", specifier = ">=1.17.0,<2", index = "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/" },
{ name = "onnxruntime-openvino", marker = "extra == 'openvino'", specifier = ">=1.17.1,<1.19.0" },
{ name = "opencv-python-headless", specifier = ">=4.7.0.72,<5.0" },
@ -1171,10 +1176,11 @@ requires-dist = [
{ name = "pydantic-settings", specifier = ">=2.5.2,<3" },
{ name = "python-multipart", specifier = ">=0.0.6,<1.0" },
{ name = "rich", specifier = ">=13.4.2" },
{ name = "rknn-toolkit-lite2", marker = "extra == 'rknn'", specifier = ">=2.3.0,<3" },
{ name = "tokenizers", specifier = ">=0.15.0,<1.0" },
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.22.0,<1.0" },
]
provides-extras = ["cpu", "cuda", "openvino", "armnn"]
provides-extras = ["cpu", "cuda", "openvino", "armnn", "rknn"]
[package.metadata.requires-dev]
dev = [
@ -2131,6 +2137,77 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90", size = 242424 },
]
[[package]]
name = "rknn-toolkit-lite2"
version = "2.3.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy" },
{ name = "psutil" },
{ name = "ruamel-yaml" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/ed/77/6af374a4a8cd2aee762a1fb8a3050dcf3f129134bbdc4bb6bed755c4325b/rknn_toolkit_lite2-2.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b6733689bd09a262bcb6ba4744e690dd4b37ebeac4ed427cf45242c4b4ce9a4", size = 559372 },
{ url = "https://files.pythonhosted.org/packages/9b/0c/76ff1eb09d09ce4394a6959d2343a321d28dd9e604348ffdafceafdc344c/rknn_toolkit_lite2-2.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3e4fefe355dc34a155680e4bcb9e4abb37ebc271f045ec9e0a4a3a018bc5beb", size = 569149 },
{ url = "https://files.pythonhosted.org/packages/0d/6e/8679562028051b02312212defc6e8c07248953f10dd7ad506e941b575bf3/rknn_toolkit_lite2-2.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37394371d1561f470c553f39869d7c35ff93405dffe3d0d72babf297a2b0aee9", size = 527457 },
]
[[package]]
name = "ruamel-yaml"
version = "0.18.10"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "ruamel-yaml-clib", marker = "python_full_version < '3.13' and platform_python_implementation == 'CPython'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ea/46/f44d8be06b85bc7c4d8c95d658be2b68f27711f279bf9dd0612a5e4794f5/ruamel.yaml-0.18.10.tar.gz", hash = "sha256:20c86ab29ac2153f80a428e1254a8adf686d3383df04490514ca3b79a362db58", size = 143447 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c2/36/dfc1ebc0081e6d39924a2cc53654497f967a084a436bb64402dfce4254d9/ruamel.yaml-0.18.10-py3-none-any.whl", hash = "sha256:30f22513ab2301b3d2b577adc121c6471f28734d3d9728581245f1e76468b4f1", size = 117729 },
]
[[package]]
name = "ruamel-yaml-clib"
version = "0.2.12"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/20/84/80203abff8ea4993a87d823a5f632e4d92831ef75d404c9fc78d0176d2b5/ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f", size = 225315 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/70/57/40a958e863e299f0c74ef32a3bde9f2d1ea8d69669368c0c502a0997f57f/ruamel.yaml.clib-0.2.12-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:11f891336688faf5156a36293a9c362bdc7c88f03a8a027c2c1d8e0bcde998e5", size = 131301 },
{ url = "https://files.pythonhosted.org/packages/98/a8/29a3eb437b12b95f50a6bcc3d7d7214301c6c529d8fdc227247fa84162b5/ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:a606ef75a60ecf3d924613892cc603b154178ee25abb3055db5062da811fd969", size = 633728 },
{ url = "https://files.pythonhosted.org/packages/35/6d/ae05a87a3ad540259c3ad88d71275cbd1c0f2d30ae04c65dcbfb6dcd4b9f/ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd5415dded15c3822597455bc02bcd66e81ef8b7a48cb71a33628fc9fdde39df", size = 722230 },
{ url = "https://files.pythonhosted.org/packages/7f/b7/20c6f3c0b656fe609675d69bc135c03aac9e3865912444be6339207b6648/ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f66efbc1caa63c088dead1c4170d148eabc9b80d95fb75b6c92ac0aad2437d76", size = 686712 },
{ url = "https://files.pythonhosted.org/packages/cd/11/d12dbf683471f888d354dac59593873c2b45feb193c5e3e0f2ebf85e68b9/ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:22353049ba4181685023b25b5b51a574bce33e7f51c759371a7422dcae5402a6", size = 663936 },
{ url = "https://files.pythonhosted.org/packages/72/14/4c268f5077db5c83f743ee1daeb236269fa8577133a5cfa49f8b382baf13/ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:932205970b9f9991b34f55136be327501903f7c66830e9760a8ffb15b07f05cd", size = 696580 },
{ url = "https://files.pythonhosted.org/packages/30/fc/8cd12f189c6405a4c1cf37bd633aa740a9538c8e40497c231072d0fef5cf/ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a52d48f4e7bf9005e8f0a89209bf9a73f7190ddf0489eee5eb51377385f59f2a", size = 663393 },
{ url = "https://files.pythonhosted.org/packages/80/29/c0a017b704aaf3cbf704989785cd9c5d5b8ccec2dae6ac0c53833c84e677/ruamel.yaml.clib-0.2.12-cp310-cp310-win32.whl", hash = "sha256:3eac5a91891ceb88138c113f9db04f3cebdae277f5d44eaa3651a4f573e6a5da", size = 100326 },
{ url = "https://files.pythonhosted.org/packages/3a/65/fa39d74db4e2d0cd252355732d966a460a41cd01c6353b820a0952432839/ruamel.yaml.clib-0.2.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab007f2f5a87bd08ab1499bdf96f3d5c6ad4dcfa364884cb4549aa0154b13a28", size = 118079 },
{ url = "https://files.pythonhosted.org/packages/fb/8f/683c6ad562f558cbc4f7c029abcd9599148c51c54b5ef0f24f2638da9fbb/ruamel.yaml.clib-0.2.12-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:4a6679521a58256a90b0d89e03992c15144c5f3858f40d7c18886023d7943db6", size = 132224 },
{ url = "https://files.pythonhosted.org/packages/3c/d2/b79b7d695e2f21da020bd44c782490578f300dd44f0a4c57a92575758a76/ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d84318609196d6bd6da0edfa25cedfbabd8dbde5140a0a23af29ad4b8f91fb1e", size = 641480 },
{ url = "https://files.pythonhosted.org/packages/68/6e/264c50ce2a31473a9fdbf4fa66ca9b2b17c7455b31ef585462343818bd6c/ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb43a269eb827806502c7c8efb7ae7e9e9d0573257a46e8e952f4d4caba4f31e", size = 739068 },
{ url = "https://files.pythonhosted.org/packages/86/29/88c2567bc893c84d88b4c48027367c3562ae69121d568e8a3f3a8d363f4d/ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52", size = 703012 },
{ url = "https://files.pythonhosted.org/packages/11/46/879763c619b5470820f0cd6ca97d134771e502776bc2b844d2adb6e37753/ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cf12567a7b565cbf65d438dec6cfbe2917d3c1bdddfce84a9930b7d35ea59642", size = 704352 },
{ url = "https://files.pythonhosted.org/packages/02/80/ece7e6034256a4186bbe50dee28cd032d816974941a6abf6a9d65e4228a7/ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7dd5adc8b930b12c8fc5b99e2d535a09889941aa0d0bd06f4749e9a9397c71d2", size = 737344 },
{ url = "https://files.pythonhosted.org/packages/f0/ca/e4106ac7e80efbabdf4bf91d3d32fc424e41418458251712f5672eada9ce/ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1492a6051dab8d912fc2adeef0e8c72216b24d57bd896ea607cb90bb0c4981d3", size = 714498 },
{ url = "https://files.pythonhosted.org/packages/67/58/b1f60a1d591b771298ffa0428237afb092c7f29ae23bad93420b1eb10703/ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4", size = 100205 },
{ url = "https://files.pythonhosted.org/packages/b4/4f/b52f634c9548a9291a70dfce26ca7ebce388235c93588a1068028ea23fcc/ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb", size = 118185 },
{ url = "https://files.pythonhosted.org/packages/48/41/e7a405afbdc26af961678474a55373e1b323605a4f5e2ddd4a80ea80f628/ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632", size = 133433 },
{ url = "https://files.pythonhosted.org/packages/ec/b0/b850385604334c2ce90e3ee1013bd911aedf058a934905863a6ea95e9eb4/ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:943f32bc9dedb3abff9879edc134901df92cfce2c3d5c9348f172f62eb2d771d", size = 647362 },
{ url = "https://files.pythonhosted.org/packages/44/d0/3f68a86e006448fb6c005aee66565b9eb89014a70c491d70c08de597f8e4/ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c3829bb364fdb8e0332c9931ecf57d9be3519241323c5274bd82f709cebc0c", size = 754118 },
{ url = "https://files.pythonhosted.org/packages/52/a9/d39f3c5ada0a3bb2870d7db41901125dbe2434fa4f12ca8c5b83a42d7c53/ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd", size = 706497 },
{ url = "https://files.pythonhosted.org/packages/b0/fa/097e38135dadd9ac25aecf2a54be17ddf6e4c23e43d538492a90ab3d71c6/ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bf165fef1f223beae7333275156ab2022cffe255dcc51c27f066b4370da81e31", size = 698042 },
{ url = "https://files.pythonhosted.org/packages/ec/d5/a659ca6f503b9379b930f13bc6b130c9f176469b73b9834296822a83a132/ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32621c177bbf782ca5a18ba4d7af0f1082a3f6e517ac2a18b3974d4edf349680", size = 745831 },
{ url = "https://files.pythonhosted.org/packages/db/5d/36619b61ffa2429eeaefaab4f3374666adf36ad8ac6330d855848d7d36fd/ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b82a7c94a498853aa0b272fd5bc67f29008da798d4f93a2f9f289feb8426a58d", size = 715692 },
{ url = "https://files.pythonhosted.org/packages/b1/82/85cb92f15a4231c89b95dfe08b09eb6adca929ef7df7e17ab59902b6f589/ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5", size = 98777 },
{ url = "https://files.pythonhosted.org/packages/d7/8f/c3654f6f1ddb75daf3922c3d8fc6005b1ab56671ad56ffb874d908bfa668/ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4", size = 115523 },
{ url = "https://files.pythonhosted.org/packages/29/00/4864119668d71a5fa45678f380b5923ff410701565821925c69780356ffa/ruamel.yaml.clib-0.2.12-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4c8c5d82f50bb53986a5e02d1b3092b03622c02c2eb78e29bec33fd9593bae1a", size = 132011 },
{ url = "https://files.pythonhosted.org/packages/7f/5e/212f473a93ae78c669ffa0cb051e3fee1139cb2d385d2ae1653d64281507/ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:e7e3736715fbf53e9be2a79eb4db68e4ed857017344d697e8b9749444ae57475", size = 642488 },
{ url = "https://files.pythonhosted.org/packages/1f/8f/ecfbe2123ade605c49ef769788f79c38ddb1c8fa81e01f4dbf5cf1a44b16/ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7e75b4965e1d4690e93021adfcecccbca7d61c7bddd8e22406ef2ff20d74ef", size = 745066 },
{ url = "https://files.pythonhosted.org/packages/e2/a9/28f60726d29dfc01b8decdb385de4ced2ced9faeb37a847bd5cf26836815/ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96777d473c05ee3e5e3c3e999f5d23c6f4ec5b0c38c098b3a5229085f74236c6", size = 701785 },
{ url = "https://files.pythonhosted.org/packages/84/7e/8e7ec45920daa7f76046578e4f677a3215fe8f18ee30a9cb7627a19d9b4c/ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:3bc2a80e6420ca8b7d3590791e2dfc709c88ab9152c00eeb511c9875ce5778bf", size = 693017 },
{ url = "https://files.pythonhosted.org/packages/c5/b3/d650eaade4ca225f02a648321e1ab835b9d361c60d51150bac49063b83fa/ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e188d2699864c11c36cdfdada94d781fd5d6b0071cd9c427bceb08ad3d7c70e1", size = 741270 },
{ url = "https://files.pythonhosted.org/packages/87/b8/01c29b924dcbbed75cc45b30c30d565d763b9c4d540545a0eeecffb8f09c/ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4f6f3eac23941b32afccc23081e1f50612bdbe4e982012ef4f5797986828cd01", size = 709059 },
{ url = "https://files.pythonhosted.org/packages/30/8c/ed73f047a73638257aa9377ad356bea4d96125b305c34a28766f4445cc0f/ruamel.yaml.clib-0.2.12-cp313-cp313-win32.whl", hash = "sha256:6442cb36270b3afb1b4951f060eccca1ce49f3d087ca1ca4563a6eb479cb3de6", size = 98583 },
{ url = "https://files.pythonhosted.org/packages/b0/85/e8e751d8791564dd333d5d9a4eab0a7a115f7e349595417fd50ecae3395c/ruamel.yaml.clib-0.2.12-cp313-cp313-win_amd64.whl", hash = "sha256:e5b8daf27af0b90da7bb903a876477a9e6d7270be6146906b276605997c7e9a3", size = 115190 },
]
[[package]]
name = "ruff"
version = "0.9.9"

View file

@ -96,6 +96,18 @@ export const CLIP_MODEL_INFO: Record<string, ModelInfo> = {
'ViT-SO400M-14-SigLIP-384__webli': { dimSize: 1152 },
'nllb-clip-large-siglip__mrl': { dimSize: 1152 },
'nllb-clip-large-siglip__v1': { dimSize: 1152 },
'ViT-B-16-SigLIP2__webli': { dimSize: 768 },
'ViT-B-32-SigLIP2-256__webli': { dimSize: 768 },
'ViT-L-16-SigLIP2-256__webli': { dimSize: 1024 },
'ViT-L-16-SigLIP2-384__webli': { dimSize: 1024 },
'ViT-L-16-SigLIP2-512__webli': { dimSize: 1024 },
'ViT-SO400M-14-SigLIP2__webli': { dimSize: 1152 },
'ViT-SO400M-14-SigLIP2-378__webli': { dimSize: 1152 },
'ViT-SO400M-16-SigLIP2-256__webli': { dimSize: 1152 },
'ViT-SO400M-16-SigLIP2-384__webli': { dimSize: 1152 },
'ViT-SO400M-16-SigLIP2-512__webli': { dimSize: 1152 },
'ViT-gopt-16-SigLIP2-256__webli': { dimSize: 1536 },
'ViT-gopt-16-SigLIP2-384__webli': { dimSize: 1536 },
};
type SharpRotationData = {
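The new entries register the SigLIP2 checkpoints together with their embedding widths (dimSize). Purely to illustrate what dimSize encodes, a hypothetical check (the helper and dict below are illustrative, not part of the server code) would compare an embedding returned by the ML service against that width:

# Mirrors a few rows of the TypeScript table above; values are expected embedding lengths.
SIGLIP2_DIMS = {
    "ViT-B-16-SigLIP2__webli": 768,
    "ViT-L-16-SigLIP2-384__webli": 1024,
    "ViT-gopt-16-SigLIP2-384__webli": 1536,
}

def assert_dim(model_name: str, embedding: list[float]) -> None:
    expected = SIGLIP2_DIMS[model_name]
    if len(embedding) != expected:
        raise ValueError(f"{model_name}: expected {expected}-dim embedding, got {len(embedding)}")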