mirror of
https://github.com/immich-app/immich.git
synced 2025-01-21 00:52:43 -05:00
chore(ml): updated dockerfile, added typing, packaging (#2642)
* updated dockerfile, added typing, packaging apply env change * added arm64 support * added ml version pump, second try for arm64 * added linting config to pyproject.toml * renamed ml input field * fixed linter config * fixed dev docker compose
This commit is contained in:
parent
c92c442356
commit
1e748864c5
13 changed files with 2647 additions and 67 deletions
|
@ -35,7 +35,7 @@ services:
|
||||||
ports:
|
ports:
|
||||||
- 3003:3003
|
- 3003:3003
|
||||||
volumes:
|
volumes:
|
||||||
- ../machine-learning/src:/usr/src/app
|
- ../machine-learning/app:/usr/src/app
|
||||||
- ${UPLOAD_LOCATION}:/usr/src/app/upload
|
- ${UPLOAD_LOCATION}:/usr/src/app/upload
|
||||||
- model-cache:/cache
|
- model-cache:/cache
|
||||||
env_file:
|
env_file:
|
||||||
|
|
|
@ -1,29 +1,26 @@
|
||||||
FROM python:3.10 as builder
|
FROM python:3.11 as builder
|
||||||
|
|
||||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||||
PYTHONUNBUFFERED=1 \
|
PYTHONUNBUFFERED=1 \
|
||||||
PIP_NO_CACHE_DIR=true
|
PIP_NO_CACHE_DIR=true
|
||||||
|
|
||||||
|
RUN pip install --upgrade pip && pip install poetry
|
||||||
|
RUN poetry config installer.max-workers 10 && \
|
||||||
|
poetry config virtualenvs.create false
|
||||||
RUN python -m venv /opt/venv
|
RUN python -m venv /opt/venv
|
||||||
RUN /opt/venv/bin/pip install torch --index-url https://download.pytorch.org/whl/cpu
|
ENV VIRTUAL_ENV="/opt/venv" PATH="/opt/venv/bin:${PATH}"
|
||||||
RUN /opt/venv/bin/pip install transformers tqdm numpy scikit-learn scipy nltk sentencepiece fastapi Pillow uvicorn[standard]
|
|
||||||
RUN /opt/venv/bin/pip install --no-deps sentence-transformers
|
|
||||||
# Facial Recognition Stuff
|
|
||||||
RUN /opt/venv/bin/pip install insightface onnxruntime
|
|
||||||
|
|
||||||
FROM python:3.10-slim
|
COPY poetry.lock pyproject.toml ./
|
||||||
|
RUN poetry install --sync --no-interaction --no-ansi --no-root --only main
|
||||||
|
|
||||||
ENV NODE_ENV=production
|
FROM python:3.11-slim
|
||||||
|
WORKDIR /usr/src/app
|
||||||
COPY --from=builder /opt/venv /opt/venv
|
ENV NODE_ENV=production \
|
||||||
|
TRANSFORMERS_CACHE=/cache \
|
||||||
ENV TRANSFORMERS_CACHE=/cache \
|
|
||||||
PYTHONDONTWRITEBYTECODE=1 \
|
PYTHONDONTWRITEBYTECODE=1 \
|
||||||
PYTHONUNBUFFERED=1 \
|
PYTHONUNBUFFERED=1 \
|
||||||
PATH="/opt/venv/bin:$PATH"
|
PATH="/opt/venv/bin:$PATH" \
|
||||||
|
PYTHONPATH=`pwd`
|
||||||
|
|
||||||
WORKDIR /usr/src/app
|
COPY --from=builder /opt/venv /opt/venv
|
||||||
|
COPY app .
|
||||||
COPY . .
|
ENTRYPOINT ["python", "main.py"]
|
||||||
ENV PYTHONPATH=`pwd`
|
|
||||||
CMD ["python", "src/main.py"]
|
|
||||||
|
|
|
@ -1,5 +1,13 @@
|
||||||
|
|
||||||
# Immich Machine Learning
|
# Immich Machine Learning
|
||||||
|
|
||||||
- Object Detection
|
- Image classification
|
||||||
- Image Classification
|
- CLIP embeddings
|
||||||
|
- Facial recognition
|
||||||
|
|
||||||
|
# Setup
|
||||||
|
|
||||||
|
This project uses [Poetry](https://python-poetry.org/docs/#installation), so be sure to install it first.
|
||||||
|
Running `poetry install --no-root --with dev` will install everything you need in an isolated virtual environment.
|
||||||
|
|
||||||
|
To add or remove dependencies, you can use the commands `poetry add $PACKAGE_NAME` and `poetry remove $PACKAGE_NAME`, respectively.
|
||||||
|
Be sure to commit the `poetry.lock` and `pyproject.toml` files to reflect any changes in dependencies.
|
||||||
|
|
|
@ -1,22 +1,23 @@
|
||||||
import os
|
import os
|
||||||
import numpy as np
|
from typing import Any
|
||||||
|
from schemas import (
|
||||||
|
EmbeddingResponse,
|
||||||
|
FaceResponse,
|
||||||
|
TagResponse,
|
||||||
|
MessageResponse,
|
||||||
|
TextModelRequest,
|
||||||
|
TextResponse,
|
||||||
|
VisionModelRequest,
|
||||||
|
)
|
||||||
import cv2 as cv
|
import cv2 as cv
|
||||||
import uvicorn
|
import uvicorn
|
||||||
|
|
||||||
from insightface.app import FaceAnalysis
|
from insightface.app import FaceAnalysis
|
||||||
from transformers import pipeline
|
from transformers import pipeline
|
||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
|
from transformers import Pipeline
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
from pydantic import BaseModel
|
|
||||||
|
|
||||||
|
|
||||||
class MlRequestBody(BaseModel):
|
|
||||||
thumbnailPath: str
|
|
||||||
|
|
||||||
|
|
||||||
class ClipRequestBody(BaseModel):
|
|
||||||
text: str
|
|
||||||
|
|
||||||
|
|
||||||
classification_model = os.getenv(
|
classification_model = os.getenv(
|
||||||
|
@ -42,7 +43,7 @@ app = FastAPI()
|
||||||
|
|
||||||
|
|
||||||
@app.on_event("startup")
|
@app.on_event("startup")
|
||||||
async def startup_event():
|
async def startup_event() -> None:
|
||||||
models = [
|
models = [
|
||||||
(classification_model, "image-classification"),
|
(classification_model, "image-classification"),
|
||||||
(clip_image_model, "clip"),
|
(clip_image_model, "clip"),
|
||||||
|
@ -58,42 +59,51 @@ async def startup_event():
|
||||||
_get_model(model_name, model_type)
|
_get_model(model_name, model_type)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/")
|
@app.get("/", response_model=MessageResponse)
|
||||||
async def root():
|
async def root() -> dict[str, str]:
|
||||||
return {"message": "Immich ML"}
|
return {"message": "Immich ML"}
|
||||||
|
|
||||||
|
|
||||||
@app.get("/ping")
|
@app.get("/ping", response_model=TextResponse)
|
||||||
def ping():
|
def ping() -> str:
|
||||||
return "pong"
|
return "pong"
|
||||||
|
|
||||||
|
|
||||||
@app.post("/image-classifier/tag-image", status_code=200)
|
@app.post("/image-classifier/tag-image", response_model=TagResponse, status_code=200)
|
||||||
def image_classification(payload: MlRequestBody):
|
def image_classification(payload: VisionModelRequest) -> list[str]:
|
||||||
model = get_cached_model(classification_model, "image-classification")
|
model = get_cached_model(classification_model, "image-classification")
|
||||||
assetPath = payload.thumbnailPath
|
assetPath = payload.image_path
|
||||||
return run_engine(model, assetPath)
|
labels = run_engine(model, assetPath)
|
||||||
|
return labels
|
||||||
|
|
||||||
|
|
||||||
@app.post("/sentence-transformer/encode-image", status_code=200)
|
@app.post(
|
||||||
def clip_encode_image(payload: MlRequestBody):
|
"/sentence-transformer/encode-image",
|
||||||
|
response_model=EmbeddingResponse,
|
||||||
|
status_code=200,
|
||||||
|
)
|
||||||
|
def clip_encode_image(payload: VisionModelRequest) -> list[float]:
|
||||||
model = get_cached_model(clip_image_model, "clip")
|
model = get_cached_model(clip_image_model, "clip")
|
||||||
assetPath = payload.thumbnailPath
|
image = Image.open(payload.image_path)
|
||||||
return model.encode(Image.open(assetPath)).tolist()
|
return model.encode(image).tolist()
|
||||||
|
|
||||||
|
|
||||||
@app.post("/sentence-transformer/encode-text", status_code=200)
|
@app.post(
|
||||||
def clip_encode_text(payload: ClipRequestBody):
|
"/sentence-transformer/encode-text",
|
||||||
|
response_model=EmbeddingResponse,
|
||||||
|
status_code=200,
|
||||||
|
)
|
||||||
|
def clip_encode_text(payload: TextModelRequest) -> list[float]:
|
||||||
model = get_cached_model(clip_text_model, "clip")
|
model = get_cached_model(clip_text_model, "clip")
|
||||||
text = payload.text
|
return model.encode(payload.text).tolist()
|
||||||
return model.encode(text).tolist()
|
|
||||||
|
|
||||||
|
|
||||||
@app.post("/facial-recognition/detect-faces", status_code=200)
|
@app.post(
|
||||||
def facial_recognition(payload: MlRequestBody):
|
"/facial-recognition/detect-faces", response_model=FaceResponse, status_code=200
|
||||||
|
)
|
||||||
|
def facial_recognition(payload: VisionModelRequest) -> list[dict[str, Any]]:
|
||||||
model = get_cached_model(facial_recognition_model, "facial-recognition")
|
model = get_cached_model(facial_recognition_model, "facial-recognition")
|
||||||
assetPath = payload.thumbnailPath
|
img = cv.imread(payload.image_path)
|
||||||
img = cv.imread(assetPath)
|
|
||||||
height, width, _ = img.shape
|
height, width, _ = img.shape
|
||||||
results = []
|
results = []
|
||||||
faces = model.get(img)
|
faces = model.get(img)
|
||||||
|
@ -120,11 +130,11 @@ def facial_recognition(payload: MlRequestBody):
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def run_engine(engine, path):
|
def run_engine(engine: Pipeline, path: str) -> list[str]:
|
||||||
result = []
|
result: list[str] = []
|
||||||
predictions = engine(path)
|
predictions: list[dict[str, Any]] = engine(path) # type: ignore
|
||||||
|
|
||||||
for index, pred in enumerate(predictions):
|
for pred in predictions:
|
||||||
tags = pred["label"].split(", ")
|
tags = pred["label"].split(", ")
|
||||||
if pred["score"] > min_tag_score:
|
if pred["score"] > min_tag_score:
|
||||||
result = [*result, *tags]
|
result = [*result, *tags]
|
||||||
|
@ -135,7 +145,7 @@ def run_engine(engine, path):
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def get_cached_model(model, task):
|
def get_cached_model(model, task) -> Any:
|
||||||
global _model_cache
|
global _model_cache
|
||||||
key = "|".join([model, str(task)])
|
key = "|".join([model, str(task)])
|
||||||
if key not in _model_cache:
|
if key not in _model_cache:
|
||||||
|
@ -145,7 +155,7 @@ def get_cached_model(model, task):
|
||||||
return _model_cache[key]
|
return _model_cache[key]
|
||||||
|
|
||||||
|
|
||||||
def _get_model(model, task):
|
def _get_model(model, task) -> Any:
|
||||||
match task:
|
match task:
|
||||||
case "facial-recognition":
|
case "facial-recognition":
|
||||||
model = FaceAnalysis(
|
model = FaceAnalysis(
|
64
machine-learning/app/schemas.py
Normal file
64
machine-learning/app/schemas.py
Normal file
|
@ -0,0 +1,64 @@
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
def to_lower_camel(string: str) -> str:
|
||||||
|
tokens = [
|
||||||
|
token.capitalize() if i > 0 else token
|
||||||
|
for i, token in enumerate(string.split("_"))
|
||||||
|
]
|
||||||
|
return "".join(tokens)
|
||||||
|
|
||||||
|
|
||||||
|
class VisionModelRequest(BaseModel):
|
||||||
|
image_path: str
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
alias_generator = to_lower_camel
|
||||||
|
allow_population_by_field_name = True
|
||||||
|
|
||||||
|
|
||||||
|
class TextModelRequest(BaseModel):
|
||||||
|
text: str
|
||||||
|
|
||||||
|
|
||||||
|
class TextResponse(BaseModel):
|
||||||
|
__root__: str
|
||||||
|
|
||||||
|
|
||||||
|
class MessageResponse(BaseModel):
|
||||||
|
message: str
|
||||||
|
|
||||||
|
|
||||||
|
class TagResponse(BaseModel):
|
||||||
|
__root__: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
class Embedding(BaseModel):
|
||||||
|
__root__: list[float]
|
||||||
|
|
||||||
|
|
||||||
|
class EmbeddingResponse(BaseModel):
|
||||||
|
__root__: Embedding
|
||||||
|
|
||||||
|
|
||||||
|
class BoundingBox(BaseModel):
|
||||||
|
x1: int
|
||||||
|
y1: int
|
||||||
|
x2: int
|
||||||
|
y2: int
|
||||||
|
|
||||||
|
|
||||||
|
class Face(BaseModel):
|
||||||
|
image_width: int
|
||||||
|
image_height: int
|
||||||
|
bounding_box: BoundingBox
|
||||||
|
score: float
|
||||||
|
embedding: Embedding
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
alias_generator = to_lower_camel
|
||||||
|
allow_population_by_field_name = True
|
||||||
|
|
||||||
|
|
||||||
|
class FaceResponse(BaseModel):
|
||||||
|
__root__: list[Face]
|
2444
machine-learning/poetry.lock
generated
Normal file
2444
machine-learning/poetry.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
56
machine-learning/pyproject.toml
Normal file
56
machine-learning/pyproject.toml
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
[tool.poetry]
|
||||||
|
name = "machine-learning"
|
||||||
|
version = "1.59.1"
|
||||||
|
description = ""
|
||||||
|
authors = ["Hau Tran <alex.tran1502@gmail.com>"]
|
||||||
|
readme = "README.md"
|
||||||
|
packages = [{include = "app"}]
|
||||||
|
|
||||||
|
[tool.poetry.dependencies]
|
||||||
|
python = "^3.11"
|
||||||
|
torch = [
|
||||||
|
{markers = "platform_machine == 'arm64' or platform_machine == 'aarch64'", version = "=2.0.1", source = "pypi"},
|
||||||
|
{markers = "platform_machine == 'amd64' or platform_machine == 'x86_64'", version = "=2.0.1+cpu", source = "pytorch-cpu"}
|
||||||
|
]
|
||||||
|
transformers = "^4.29.2"
|
||||||
|
sentence-transformers = "^2.2.2"
|
||||||
|
onnxruntime = "^1.15.0"
|
||||||
|
insightface = "^0.7.3"
|
||||||
|
opencv-python-headless = "^4.7.0.72"
|
||||||
|
pillow = "^9.5.0"
|
||||||
|
fastapi = "^0.95.2"
|
||||||
|
uvicorn = {extras = ["standard"], version = "^0.22.0"}
|
||||||
|
pydantic = "^1.10.8"
|
||||||
|
|
||||||
|
[tool.poetry.group.dev.dependencies]
|
||||||
|
mypy = "^1.3.0"
|
||||||
|
black = "^23.3.0"
|
||||||
|
pytest = "^7.3.1"
|
||||||
|
|
||||||
|
[[tool.poetry.source]]
|
||||||
|
name = "pytorch-cpu"
|
||||||
|
url = "https://download.pytorch.org/whl/cpu"
|
||||||
|
priority = "explicit"
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["poetry-core"]
|
||||||
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|
||||||
|
[tool.flake8]
|
||||||
|
max-line-length = 120
|
||||||
|
|
||||||
|
[tool.mypy]
|
||||||
|
python_version = "3.11"
|
||||||
|
plugins = "pydantic.mypy"
|
||||||
|
follow_imports = "silent"
|
||||||
|
warn_redundant_casts = true
|
||||||
|
disallow_any_generics = true
|
||||||
|
check_untyped_defs = true
|
||||||
|
no_implicit_reexport = true
|
||||||
|
disallow_untyped_defs = true
|
||||||
|
|
||||||
|
[tool.pydantic-mypy]
|
||||||
|
init_forbid_extra = true
|
||||||
|
init_typed = true
|
||||||
|
warn_required_dynamic_aliases = true
|
||||||
|
warn_untyped_fields = true
|
|
@ -63,6 +63,7 @@ if [ "$CURRENT_SERVER" != "$NEXT_SERVER" ]; then
|
||||||
echo "Pumping Server: $CURRENT_SERVER => $NEXT_SERVER"
|
echo "Pumping Server: $CURRENT_SERVER => $NEXT_SERVER"
|
||||||
npm --prefix server version $SERVER_PUMP
|
npm --prefix server version $SERVER_PUMP
|
||||||
npm --prefix server run api:generate
|
npm --prefix server run api:generate
|
||||||
|
poetry --directory machine-learning version $SERVER_PUMP
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$CURRENT_MOBILE" != "$NEXT_MOBILE" ]; then
|
if [ "$CURRENT_MOBILE" != "$NEXT_MOBILE" ]; then
|
||||||
|
|
|
@ -175,7 +175,7 @@ describe(FacialRecognitionService.name, () => {
|
||||||
assetMock.getByIds.mockResolvedValue([assetEntityStub.image]);
|
assetMock.getByIds.mockResolvedValue([assetEntityStub.image]);
|
||||||
await sut.handleRecognizeFaces({ id: assetEntityStub.image.id });
|
await sut.handleRecognizeFaces({ id: assetEntityStub.image.id });
|
||||||
expect(machineLearningMock.detectFaces).toHaveBeenCalledWith({
|
expect(machineLearningMock.detectFaces).toHaveBeenCalledWith({
|
||||||
thumbnailPath: assetEntityStub.image.resizePath,
|
imagePath: assetEntityStub.image.resizePath,
|
||||||
});
|
});
|
||||||
expect(faceMock.create).not.toHaveBeenCalled();
|
expect(faceMock.create).not.toHaveBeenCalled();
|
||||||
expect(jobMock.queue).not.toHaveBeenCalled();
|
expect(jobMock.queue).not.toHaveBeenCalled();
|
||||||
|
|
|
@ -54,7 +54,7 @@ export class FacialRecognitionService {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const faces = await this.machineLearning.detectFaces({ thumbnailPath: asset.resizePath });
|
const faces = await this.machineLearning.detectFaces({ imagePath: asset.resizePath });
|
||||||
|
|
||||||
this.logger.debug(`${faces.length} faces detected in ${asset.resizePath}`);
|
this.logger.debug(`${faces.length} faces detected in ${asset.resizePath}`);
|
||||||
this.logger.verbose(faces.map((face) => ({ ...face, embedding: `float[${face.embedding.length}]` })));
|
this.logger.verbose(faces.map((face) => ({ ...face, embedding: `float[${face.embedding.length}]` })));
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
export const IMachineLearningRepository = 'IMachineLearningRepository';
|
export const IMachineLearningRepository = 'IMachineLearningRepository';
|
||||||
|
|
||||||
export interface MachineLearningInput {
|
export interface MachineLearningInput {
|
||||||
thumbnailPath: string;
|
imagePath: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface BoundingBox {
|
export interface BoundingBox {
|
||||||
|
|
|
@ -84,7 +84,7 @@ describe(SmartInfoService.name, () => {
|
||||||
|
|
||||||
await sut.handleClassifyImage({ id: asset.id });
|
await sut.handleClassifyImage({ id: asset.id });
|
||||||
|
|
||||||
expect(machineMock.classifyImage).toHaveBeenCalledWith({ thumbnailPath: 'path/to/resize.ext' });
|
expect(machineMock.classifyImage).toHaveBeenCalledWith({ imagePath: 'path/to/resize.ext' });
|
||||||
expect(smartMock.upsert).toHaveBeenCalledWith({
|
expect(smartMock.upsert).toHaveBeenCalledWith({
|
||||||
assetId: 'asset-1',
|
assetId: 'asset-1',
|
||||||
tags: ['tag1', 'tag2', 'tag3'],
|
tags: ['tag1', 'tag2', 'tag3'],
|
||||||
|
@ -143,7 +143,7 @@ describe(SmartInfoService.name, () => {
|
||||||
|
|
||||||
await sut.handleEncodeClip({ id: asset.id });
|
await sut.handleEncodeClip({ id: asset.id });
|
||||||
|
|
||||||
expect(machineMock.encodeImage).toHaveBeenCalledWith({ thumbnailPath: 'path/to/resize.ext' });
|
expect(machineMock.encodeImage).toHaveBeenCalledWith({ imagePath: 'path/to/resize.ext' });
|
||||||
expect(smartMock.upsert).toHaveBeenCalledWith({
|
expect(smartMock.upsert).toHaveBeenCalledWith({
|
||||||
assetId: 'asset-1',
|
assetId: 'asset-1',
|
||||||
clipEmbedding: [0.01, 0.02, 0.03],
|
clipEmbedding: [0.01, 0.02, 0.03],
|
||||||
|
|
|
@ -40,7 +40,7 @@ export class SmartInfoService {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const tags = await this.machineLearning.classifyImage({ thumbnailPath: asset.resizePath });
|
const tags = await this.machineLearning.classifyImage({ imagePath: asset.resizePath });
|
||||||
if (tags.length === 0) {
|
if (tags.length === 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -73,7 +73,7 @@ export class SmartInfoService {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const clipEmbedding = await this.machineLearning.encodeImage({ thumbnailPath: asset.resizePath });
|
const clipEmbedding = await this.machineLearning.encodeImage({ imagePath: asset.resizePath });
|
||||||
await this.repository.upsert({ assetId: asset.id, clipEmbedding: clipEmbedding });
|
await this.repository.upsert({ assetId: asset.id, clipEmbedding: clipEmbedding });
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|
Loading…
Add table
Reference in a new issue