From a88f551b6a6b8bc6f3ca76f1b4af1d188ffdaa0a Mon Sep 17 00:00:00 2001
From: Manav Rathi
Date: Thu, 11 Apr 2024 13:58:52 +0530
Subject: [PATCH] WIP IPC API

---
 desktop/src/main/services/ml-face.ts          |  24 ++++
 .../machineLearning/machineLearningFactory.ts |   3 -
 .../mobileFaceNetEmbeddingService.ts          |   6 -
 .../yoloFaceDetectionService.ts               | 116 ++++++------------
 .../photos/src/types/machineLearning/index.ts |   3 +-
 web/packages/next/types/ipc.ts                |  21 +++-
 6 files changed, 79 insertions(+), 94 deletions(-)

diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts
index f88f432ee..bf8eea162 100644
--- a/desktop/src/main/services/ml-face.ts
+++ b/desktop/src/main/services/ml-face.ts
@@ -78,6 +78,30 @@ const faceEmbeddingSession = async () => {
     return _faceEmbeddingSession;
 };
 
+let _faceDetectionSession: any;
+
+const createFaceDetectionSession = async () => {
+    const session = await ort.InferenceSession.create(
+        "/models/yoloface/yolov5s_face_640_640_dynamic.onnx",
+    );
+    // Warm up the session with a dummy input of the expected shape.
+    const data = new Float32Array(1 * 3 * 640 * 640);
+    const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]);
+    // TODO(MR): onnx-yolo
+    // const feeds: Record<string, ort.Tensor> = {};
+    const feeds: Record<string, any> = {};
+    feeds[session.inputNames[0]] = inputTensor;
+    await session.run(feeds);
+    return session;
+};
+
+const faceDetectionSession = async () => {
+    if (!_faceDetectionSession) {
+        _faceDetectionSession = await createFaceDetectionSession();
+    }
+    return _faceDetectionSession;
+};
+
 // export const clipImageEmbedding = async (jpegImageData: Uint8Array) => {
 //     const tempFilePath = await generateTempFilePath("");
 //     const imageStream = new Response(jpegImageData.buffer).body;
diff --git a/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts b/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts
index 36e37d9b8..991ae6808 100644
--- a/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts
+++ b/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts
@@ -203,9 +203,6 @@ export class LocalMLSyncContext implements MLSyncContext {
     }
 
     public async dispose() {
-        // await this.faceDetectionService.dispose();
-        // await this.faceEmbeddingService.dispose();
-
         this.localFilesMap = undefined;
         await this.syncQueue.onIdle();
         this.syncQueue.removeAllListeners();
diff --git a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts
index 39953689e..6b2450a24 100644
--- a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts
+++ b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts
@@ -96,12 +96,6 @@ class MobileFaceNetEmbeddingService implements FaceEmbeddingService {
         }
         return embeddings;
     }
-
-    public async dispose() {
-        const inferenceSession = await this.getOnnxInferenceSession();
-        inferenceSession?.release();
-        this.onnxInferenceSession = undefined;
-    }
 }
 
 export default new MobileFaceNetEmbeddingService();
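Note: the session helpers added to ml-face.ts above are not yet reachable from the renderer. A minimal sketch of the main-process side of the wiring, assuming ml-face.ts eventually grows detectFaces and faceEmbedding functions that wrap these cached sessions — the channel strings, the registerMLFaceHandlers name, and those two functions are illustrative, not part of this patch:

    import { ipcMain } from "electron";

    import { detectFaces, faceEmbedding } from "./services/ml-face";

    // Register the renderer-facing handlers matching the Electron interface
    // additions in web/packages/next/types/ipc.ts below. Electron transfers
    // the typed arrays across processes using the structured clone algorithm.
    export const registerMLFaceHandlers = () => {
        ipcMain.handle("detectFaces", (_, input: Uint8Array) =>
            detectFaces(input),
        );
        ipcMain.handle("faceEmbedding", (_, input: Float32Array) =>
            faceEmbedding(input),
        );
    };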
diff --git a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts
index 71b51f674..02e5bb02b 100644
--- a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts
+++ b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts
@@ -1,4 +1,5 @@
 import { MAX_FACE_DISTANCE_PERCENT } from "constants/mlConfig";
+import { euclidean } from "hdbscan";
 import {
     Matrix,
     applyToPoint,
@@ -21,17 +22,7 @@ import {
 import { newBox } from "utils/machineLearning";
 import { Box, Point } from "../../../thirdparty/face-api/classes";
 
-// TODO(MR): onnx-yolo
-// import * as ort from "onnxruntime-web";
-// import { env } from "onnxruntime-web";
-const ort: any = {};
-
-// TODO(MR): onnx-yolo
-// env.wasm.wasmPaths = "/js/onnx/";
 class YoloFaceDetectionService implements FaceDetectionService {
-    // TODO(MR): onnx-yolo
-    // private onnxInferenceSession?: ort.InferenceSession;
-    private onnxInferenceSession?: any;
     public method: Versioned<FaceDetectionMethod>;
 
     public constructor() {
@@ -41,27 +32,44 @@ class YoloFaceDetectionService implements FaceDetectionService {
         };
     }
 
-    private async initOnnx() {
-        console.log("start ort");
-        this.onnxInferenceSession = await ort.InferenceSession.create(
-            "/models/yoloface/yolov5s_face_640_640_dynamic.onnx",
-        );
-        const data = new Float32Array(1 * 3 * 640 * 640);
+    public async detectFaces(
+        imageBitmap: ImageBitmap,
+    ): Promise<Array<FaceDetection>> {
+        const maxFaceDistance = imageBitmap.width * MAX_FACE_DISTANCE_PERCENT;
+        const preprocessResult =
+            this.preprocessImageBitmapToFloat32ChannelsFirst(
+                imageBitmap,
+                640,
+                640,
+            );
+        const data = preprocessResult.data;
+        const resized = preprocessResult.newSize;
         const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]);
         // TODO(MR): onnx-yolo
         // const feeds: Record<string, ort.Tensor> = {};
         const feeds: Record<string, any> = {};
-        const name = this.onnxInferenceSession.inputNames[0];
-        feeds[name] = inputTensor;
-        await this.onnxInferenceSession.run(feeds);
-        console.log("start end");
-    }
-
-    private async getOnnxInferenceSession() {
-        if (!this.onnxInferenceSession) {
-            await this.initOnnx();
-        }
-        return this.onnxInferenceSession;
+        feeds["input"] = inputTensor;
+        const inferenceSession = await this.getOnnxInferenceSession();
+        const runout = await inferenceSession.run(feeds);
+        const outputData = runout.output.data;
+        const faces = this.getFacesFromYoloOutput(
+            outputData as Float32Array,
+            0.7,
+        );
+        const inBox = newBox(0, 0, resized.width, resized.height);
+        const toBox = newBox(0, 0, imageBitmap.width, imageBitmap.height);
+        const transform = computeTransformToBox(inBox, toBox);
+        const faceDetections: Array<FaceDetection> = faces?.map((f) => {
+            const box = transformBox(f.box, transform);
+            const normLandmarks = f.landmarks;
+            const landmarks = transformPoints(normLandmarks, transform);
+            return {
+                box,
+                landmarks,
+                probability: f.probability as number,
+            } as FaceDetection;
+        });
+        return removeDuplicateDetections(faceDetections, maxFaceDistance);
     }
 
     private preprocessImageBitmapToFloat32ChannelsFirst(
@@ -233,64 +241,10 @@ class YoloFaceDetectionService implements FaceDetectionService {
             probability: faceDetection.probability,
         };
     }
-
-    private async estimateOnnx(imageBitmap: ImageBitmap) {
-        const maxFaceDistance = imageBitmap.width * MAX_FACE_DISTANCE_PERCENT;
-        const preprocessResult =
-            this.preprocessImageBitmapToFloat32ChannelsFirst(
-                imageBitmap,
-                640,
-                640,
-            );
-        const data = preprocessResult.data;
-        const resized = preprocessResult.newSize;
-        const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]);
-        // TODO(MR): onnx-yolo
-        // const feeds: Record<string, ort.Tensor> = {};
-        const feeds: Record<string, any> = {};
-        feeds["input"] = inputTensor;
-        const inferenceSession = await this.getOnnxInferenceSession();
-        const runout = await inferenceSession.run(feeds);
-        const outputData = runout.output.data;
-        const faces = this.getFacesFromYoloOutput(
-            outputData as Float32Array,
-            0.7,
-        );
-        const inBox = newBox(0, 0, resized.width, resized.height);
-        const toBox = newBox(0, 0, imageBitmap.width, imageBitmap.height);
-        const transform = computeTransformToBox(inBox, toBox);
-        const faceDetections: Array<FaceDetection> = faces?.map((f) => {
-            const box = transformBox(f.box, transform);
-            const normLandmarks = f.landmarks;
-            const landmarks = transformPoints(normLandmarks, transform);
-            return {
-                box,
-                landmarks,
-                probability: f.probability as number,
-            } as FaceDetection;
-        });
-        return removeDuplicateDetections(faceDetections, maxFaceDistance);
-    }
-
-    public async detectFaces(
-        imageBitmap: ImageBitmap,
-    ): Promise<Array<FaceDetection>> {
-        // measure time taken
-        const facesFromOnnx = await this.estimateOnnx(imageBitmap);
-        return facesFromOnnx;
-    }
-
-    public async dispose() {
-        const inferenceSession = await this.getOnnxInferenceSession();
-        inferenceSession?.release();
-        this.onnxInferenceSession = undefined;
-    }
 }
 
 export default new YoloFaceDetectionService();
 
-import { euclidean } from "hdbscan";
-
 /**
  * Removes duplicate face detections from an array of detections.
  *
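detectFaces above feeds the model a [1, 3, 640, 640] tensor, so the preprocessImageBitmapToFloat32ChannelsFirst helper it calls (unchanged by this patch, hence elided from the hunks) must repack the bitmap's RGBA pixels into channels-first planes. The following is a sketch of that kind of conversion, for orientation only; judging by how detectFaces maps coordinates from resized back to the full bitmap, the real helper additionally preserves the input's aspect ratio and reports the effective size back via newSize:

    // Illustrative: resize an ImageBitmap to width x height and repack its
    // RGBA pixels into a CHW (channels-first) Float32Array in [0, 1].
    const toFloat32ChannelsFirst = (
        imageBitmap: ImageBitmap,
        width: number,
        height: number,
    ) => {
        const canvas = new OffscreenCanvas(width, height);
        const ctx = canvas.getContext("2d")!;
        ctx.drawImage(imageBitmap, 0, 0, width, height);
        const rgba = ctx.getImageData(0, 0, width, height).data;
        const pixelCount = width * height;
        const data = new Float32Array(3 * pixelCount);
        for (let i = 0; i < pixelCount; i++) {
            data[i] = rgba[i * 4] / 255; // R plane
            data[pixelCount + i] = rgba[i * 4 + 1] / 255; // G plane
            data[2 * pixelCount + i] = rgba[i * 4 + 2] / 255; // B plane
        }
        return { data, newSize: { width, height } };
    };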
diff --git a/web/apps/photos/src/types/machineLearning/index.ts b/web/apps/photos/src/types/machineLearning/index.ts
index 3def20a08..399990696 100644
--- a/web/apps/photos/src/types/machineLearning/index.ts
+++ b/web/apps/photos/src/types/machineLearning/index.ts
@@ -261,13 +261,12 @@ export declare type MLIndex = "files" | "people";
 
 export interface FaceDetectionService {
     method: Versioned<FaceDetectionMethod>;
-    // init(): Promise<void>;
+
     detectFaces(image: ImageBitmap): Promise<Array<FaceDetection>>;
     getRelativeDetection(
         faceDetection: FaceDetection,
         imageDimensions: Dimensions,
     ): FaceDetection;
-    dispose(): Promise<void>;
 }
 
 export interface FaceCropService {
diff --git a/web/packages/next/types/ipc.ts b/web/packages/next/types/ipc.ts
index a0bc07d9a..83d9ee6bd 100644
--- a/web/packages/next/types/ipc.ts
+++ b/web/packages/next/types/ipc.ts
@@ -196,7 +196,7 @@ export interface Electron {
     // - ML
 
     /**
-     * Compute and return a CLIP embedding of the given image.
+     * Return a CLIP embedding of the given image.
      *
      * See: [Note: CLIP based magic search]
      *
@@ -207,7 +207,7 @@ export interface Electron {
     clipImageEmbedding: (jpegImageData: Uint8Array) => Promise<Float32Array>;
 
     /**
-     * Compute and return a CLIP embedding of the given image.
+     * Return a CLIP embedding of the given text.
      *
      * See: [Note: CLIP based magic search]
      *
@@ -217,6 +217,23 @@ export interface Electron {
      */
    clipTextEmbedding: (text: string) => Promise<Float32Array>;
 
+    /**
+     * Detect faces in the given image using YOLO.
+     *
+     * Both the input and output are opaque binary data whose internal structure
+     * is specific to the model (YOLO) and to our implementation. In particular,
+     * {@link inputImage} is a particular bitmap encoding of an image.
+     */
+    detectFaces: (inputImage: Uint8Array) => Promise<Float32Array>;
+
+    /**
+     * Return a mobilefacenet embedding for the given face data.
+     *
+     * Both the input and output are opaque binary data whose internal
+     * structure is specific to the model (mobilefacenet) and to our implementation.
+     */
+    faceEmbedding: (input: Float32Array) => Promise<Float32Array>;
+
     // - File selection
 
     // TODO: Deprecated - use dialogs on the renderer process itself
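End to end, a renderer holding this Electron interface would drive the two new methods roughly as follows. This is a sketch rather than code from this patch: the import alias, the globalThis.electron bridge object, and the 112 x 112 x 3 mobilefacenet crop layout are assumptions about the surrounding application.

    import type { Electron } from "@/next/types/ipc";

    const indexFaces = async (jpegImageData: Uint8Array) => {
        // The preload script is assumed to expose the Electron bridge on
        // globalThis; how it gets there is outside this patch.
        const electron = (globalThis as any).electron as Electron;

        // Opaque bytes in, opaque floats out: decoding the returned YOLO
        // output tensor into boxes and landmarks is the caller's job.
        const detections = await electron.detectFaces(jpegImageData);

        // Pack one detected face crop into the layout mobilefacenet expects
        // (assumed here: 112 x 112 RGB floats), then ask for its embedding.
        const faceData = new Float32Array(112 * 112 * 3);
        const embedding = await electron.faceEmbedding(faceData);
        return { detections, embedding };
    };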