From a88f551b6a6b8bc6f3ca76f1b4af1d188ffdaa0a Mon Sep 17 00:00:00 2001
From: Manav Rathi
Date: Thu, 11 Apr 2024 13:58:52 +0530
Subject: [PATCH] WIP IPC API

---
 desktop/src/main/services/ml-face.ts          |  24 ++++
 .../machineLearning/machineLearningFactory.ts |   3 -
 .../mobileFaceNetEmbeddingService.ts          |   6 -
 .../yoloFaceDetectionService.ts               | 116 ++++++------------
 .../photos/src/types/machineLearning/index.ts |   3 +-
 web/packages/next/types/ipc.ts                |  21 +++-
 6 files changed, 79 insertions(+), 94 deletions(-)

diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts
index f88f432ee..bf8eea162 100644
--- a/desktop/src/main/services/ml-face.ts
+++ b/desktop/src/main/services/ml-face.ts
@@ -78,6 +78,30 @@ const faceEmbeddingSession = async () => {
     return _faceEmbeddingSession;
 };
 
+let _faceDetectionSession: any;
+
+const createFaceDetectionSession = async () => {
+    const session = await ort.InferenceSession.create(
+        "/models/yoloface/yolov5s_face_640_640_dynamic.onnx",
+    );
+    // Warm up the session with a dummy input of the expected shape.
+    const data = new Float32Array(1 * 3 * 640 * 640);
+    const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]);
+    // TODO(MR): onnx-yolo
+    // const feeds: Record<string, ort.Tensor> = {};
+    const feeds: Record<string, any> = {};
+    feeds[session.inputNames[0]] = inputTensor;
+    await session.run(feeds);
+    return session;
+};
+
+const faceDetectionSession = async () => {
+    if (!_faceDetectionSession) {
+        _faceDetectionSession = await createFaceDetectionSession();
+    }
+    return _faceDetectionSession;
+};
+
 // export const clipImageEmbedding = async (jpegImageData: Uint8Array) => {
 //     const tempFilePath = await generateTempFilePath("");
 //     const imageStream = new Response(jpegImageData.buffer).body;
diff --git a/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts b/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts
index 36e37d9b8..991ae6808 100644
--- a/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts
+++ b/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts
@@ -203,9 +203,6 @@ export class LocalMLSyncContext implements MLSyncContext {
     }
 
     public async dispose() {
-        // await this.faceDetectionService.dispose();
-        // await this.faceEmbeddingService.dispose();
-
         this.localFilesMap = undefined;
         await this.syncQueue.onIdle();
         this.syncQueue.removeAllListeners();
diff --git a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts
index 39953689e..6b2450a24 100644
--- a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts
+++ b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts
@@ -96,12 +96,6 @@ class MobileFaceNetEmbeddingService implements FaceEmbeddingService {
         }
         return embeddings;
     }
-
-    public async dispose() {
-        const inferenceSession = await this.getOnnxInferenceSession();
-        inferenceSession?.release();
-        this.onnxInferenceSession = undefined;
-    }
 }
 
 export default new MobileFaceNetEmbeddingService();
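Note: the session helpers added to ml-face.ts above are not yet reachable from the renderer. A minimal sketch of the main-process side of the wiring, assuming ml-face.ts eventually grows detectFaces and faceEmbedding functions that wrap these cached sessions — the channel strings, the registerMLFaceHandlers name, and those two functions are illustrative, not part of this patch:

    import { ipcMain } from "electron";

    import { detectFaces, faceEmbedding } from "./services/ml-face";

    // Register the renderer-facing handlers matching the Electron interface
    // additions in web/packages/next/types/ipc.ts below. Electron transfers
    // the typed arrays across processes using the structured clone algorithm.
    export const registerMLFaceHandlers = () => {
        ipcMain.handle("detectFaces", (_, input: Uint8Array) =>
            detectFaces(input),
        );
        ipcMain.handle("faceEmbedding", (_, input: Float32Array) =>
            faceEmbedding(input),
        );
    };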
diff --git a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts
index 71b51f674..02e5bb02b 100644
--- a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts
+++ b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts
@@ -1,4 +1,5 @@
 import { MAX_FACE_DISTANCE_PERCENT } from "constants/mlConfig";
+import { euclidean } from "hdbscan";
 import {
     Matrix,
     applyToPoint,
@@ -21,17 +22,7 @@ import {
 import { newBox } from "utils/machineLearning";
 import { Box, Point } from "../../../thirdparty/face-api/classes";
 
-// TODO(MR): onnx-yolo
-// import * as ort from "onnxruntime-web";
-// import { env } from "onnxruntime-web";
-const ort: any = {};
-
-// TODO(MR): onnx-yolo
-// env.wasm.wasmPaths = "/js/onnx/";
 class YoloFaceDetectionService implements FaceDetectionService {
-    // TODO(MR): onnx-yolo
-    // private onnxInferenceSession?: ort.InferenceSession;
-    private onnxInferenceSession?: any;
     public method: Versioned<FaceDetectionMethod>;
 
     public constructor() {
@@ -41,27 +32,44 @@ class YoloFaceDetectionService implements FaceDetectionService {
         };
     }
 
-    private async initOnnx() {
-        console.log("start ort");
-        this.onnxInferenceSession = await ort.InferenceSession.create(
-            "/models/yoloface/yolov5s_face_640_640_dynamic.onnx",
-        );
-        const data = new Float32Array(1 * 3 * 640 * 640);
+    public async detectFaces(
+        imageBitmap: ImageBitmap,
+    ): Promise<Array<FaceDetection>> {
+        const maxFaceDistance = imageBitmap.width * MAX_FACE_DISTANCE_PERCENT;
+        const preprocessResult =
+            this.preprocessImageBitmapToFloat32ChannelsFirst(
+                imageBitmap,
+                640,
+                640,
+            );
+        const data = preprocessResult.data;
+        const resized = preprocessResult.newSize;
         const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]);
         // TODO(MR): onnx-yolo
         // const feeds: Record<string, ort.Tensor> = {};
         const feeds: Record<string, any> = {};
-        const name = this.onnxInferenceSession.inputNames[0];
-        feeds[name] = inputTensor;
-        await this.onnxInferenceSession.run(feeds);
-        console.log("start end");
-    }
-
-    private async getOnnxInferenceSession() {
-        if (!this.onnxInferenceSession) {
-            await this.initOnnx();
-        }
-        return this.onnxInferenceSession;
+        feeds["input"] = inputTensor;
+        const inferenceSession = await this.getOnnxInferenceSession();
+        const runout = await inferenceSession.run(feeds);
+        const outputData = runout.output.data;
+        const faces = this.getFacesFromYoloOutput(
+            outputData as Float32Array,
+            0.7,
+        );
+        const inBox = newBox(0, 0, resized.width, resized.height);
+        const toBox = newBox(0, 0, imageBitmap.width, imageBitmap.height);
+        const transform = computeTransformToBox(inBox, toBox);
+        const faceDetections: Array<FaceDetection> = faces?.map((f) => {
+            const box = transformBox(f.box, transform);
+            const normLandmarks = f.landmarks;
+            const landmarks = transformPoints(normLandmarks, transform);
+            return {
+                box,
+                landmarks,
+                probability: f.probability as number,
+            } as FaceDetection;
+        });
+        return removeDuplicateDetections(faceDetections, maxFaceDistance);
     }
 
     private preprocessImageBitmapToFloat32ChannelsFirst(
@@ -233,64 +241,10 @@ class YoloFaceDetectionService implements FaceDetectionService {
             probability: faceDetection.probability,
         };
     }
-
-    private async estimateOnnx(imageBitmap: ImageBitmap) {
-        const maxFaceDistance = imageBitmap.width * MAX_FACE_DISTANCE_PERCENT;
-        const preprocessResult =
-            this.preprocessImageBitmapToFloat32ChannelsFirst(
-                imageBitmap,
-                640,
-                640,
-            );
-        const data = preprocessResult.data;
-        const resized = preprocessResult.newSize;
-        const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]);
-        // TODO(MR): onnx-yolo
-        // const feeds: Record<string, ort.Tensor> = {};
-        const feeds: Record<string, any> = {};
-        feeds["input"] = inputTensor;
-        const inferenceSession = await this.getOnnxInferenceSession();
-        const runout = await inferenceSession.run(feeds);
-        const outputData = runout.output.data;
-        const faces = this.getFacesFromYoloOutput(
-            outputData as Float32Array,
-            0.7,
-        );
-        const inBox = newBox(0, 0, resized.width, resized.height);
-        const toBox = newBox(0, 0, imageBitmap.width, imageBitmap.height);
-        const transform = computeTransformToBox(inBox, toBox);
-        const faceDetections: Array<FaceDetection> = faces?.map((f) => {
-            const box = transformBox(f.box, transform);
-            const normLandmarks = f.landmarks;
-            const landmarks = transformPoints(normLandmarks, transform);
-            return {
-                box,
-                landmarks,
-                probability: f.probability as number,
-            } as FaceDetection;
-        });
-        return removeDuplicateDetections(faceDetections, maxFaceDistance);
-    }
-
-    public async detectFaces(
-        imageBitmap: ImageBitmap,
-    ): Promise<Array<FaceDetection>> {
-        // measure time taken
-        const facesFromOnnx = await this.estimateOnnx(imageBitmap);
-        return facesFromOnnx;
-    }
-
-    public async dispose() {
-        const inferenceSession = await this.getOnnxInferenceSession();
-        inferenceSession?.release();
-        this.onnxInferenceSession = undefined;
-    }
 }
 
 export default new YoloFaceDetectionService();
 
-import { euclidean } from "hdbscan";
-
 /**
  * Removes duplicate face detections from an array of detections.
  *
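detectFaces above feeds the model a [1, 3, 640, 640] tensor, so the preprocessImageBitmapToFloat32ChannelsFirst helper it calls (unchanged by this patch, hence elided from the hunks) must repack the bitmap's RGBA pixels into channels-first planes. The following is a sketch of that kind of conversion, for orientation only; judging by how detectFaces maps coordinates from resized back to the full bitmap, the real helper additionally preserves the input's aspect ratio and reports the effective size back via newSize:

    // Illustrative: resize an ImageBitmap to width x height and repack its
    // RGBA pixels into a CHW (channels-first) Float32Array in [0, 1].
    const toFloat32ChannelsFirst = (
        imageBitmap: ImageBitmap,
        width: number,
        height: number,
    ) => {
        const canvas = new OffscreenCanvas(width, height);
        const ctx = canvas.getContext("2d")!;
        ctx.drawImage(imageBitmap, 0, 0, width, height);
        const rgba = ctx.getImageData(0, 0, width, height).data;
        const pixelCount = width * height;
        const data = new Float32Array(3 * pixelCount);
        for (let i = 0; i < pixelCount; i++) {
            data[i] = rgba[i * 4] / 255; // R plane
            data[pixelCount + i] = rgba[i * 4 + 1] / 255; // G plane
            data[2 * pixelCount + i] = rgba[i * 4 + 2] / 255; // B plane
        }
        return { data, newSize: { width, height } };
    };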
diff --git a/web/apps/photos/src/types/machineLearning/index.ts b/web/apps/photos/src/types/machineLearning/index.ts
index 3def20a08..399990696 100644
--- a/web/apps/photos/src/types/machineLearning/index.ts
+++ b/web/apps/photos/src/types/machineLearning/index.ts
@@ -261,13 +261,12 @@ export declare type MLIndex = "files" | "people";
 
 export interface FaceDetectionService {
     method: Versioned<FaceDetectionMethod>;
-    // init(): Promise<void>;
+
     detectFaces(image: ImageBitmap): Promise<Array<FaceDetection>>;
     getRelativeDetection(
         faceDetection: FaceDetection,
         imageDimensions: Dimensions,
     ): FaceDetection;
-    dispose(): Promise<void>;
 }
 
 export interface FaceCropService {
diff --git a/web/packages/next/types/ipc.ts b/web/packages/next/types/ipc.ts
index a0bc07d9a..83d9ee6bd 100644
--- a/web/packages/next/types/ipc.ts
+++ b/web/packages/next/types/ipc.ts
@@ -196,7 +196,7 @@ export interface Electron {
     // - ML
 
     /**
-     * Compute and return a CLIP embedding of the given image.
+     * Return a CLIP embedding of the given image.
      *
      * See: [Note: CLIP based magic search]
      *
@@ -207,7 +207,7 @@ export interface Electron {
     clipImageEmbedding: (jpegImageData: Uint8Array) => Promise<Float32Array>;
 
     /**
-     * Compute and return a CLIP embedding of the given image.
+     * Return a CLIP embedding of the given text.
      *
      * See: [Note: CLIP based magic search]
      *
@@ -217,6 +217,23 @@ export interface Electron {
      */
    clipTextEmbedding: (text: string) => Promise<Float32Array>;
 
+    /**
+     * Detect faces in the given image using YOLO.
+     *
+     * Both the input and output are opaque binary data whose internal structure
+     * is specific to the model (YOLO) and to our implementation. In particular,
+     * {@link inputImage} is a particular bitmap encoding of an image.
+     */
+    detectFaces: (inputImage: Uint8Array) => Promise<Float32Array>;
+
+    /**
+     * Return a mobilefacenet embedding for the given face data.
+     *
+     * Both the input and output are opaque binary data whose internal
+     * structure is specific to the model (mobilefacenet) and to our implementation.
+     */
+    faceEmbedding: (input: Float32Array) => Promise<Float32Array>;
+
     // - File selection
 
     // TODO: Deprecated - use dialogs on the renderer process itself
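End to end, a renderer holding this Electron interface would drive the two new methods roughly as follows. This is a sketch rather than code from this patch: the import alias, the globalThis.electron bridge object, and the 112 x 112 x 3 mobilefacenet crop layout are assumptions about the surrounding application.

    import type { Electron } from "@/next/types/ipc";

    const indexFaces = async (jpegImageData: Uint8Array) => {
        // The preload script is assumed to expose the Electron bridge on
        // globalThis; how it gets there is outside this patch.
        const electron = (globalThis as any).electron as Electron;

        // Opaque bytes in, opaque floats out: decoding the returned YOLO
        // output tensor into boxes and landmarks is the caller's job.
        const detections = await electron.detectFaces(jpegImageData);

        // Pack one detected face crop into the layout mobilefacenet expects
        // (assumed here: 112 x 112 RGB floats), then ask for its embedding.
        const faceData = new Float32Array(112 * 112 * 3);
        const embedding = await electron.faceEmbedding(faceData);
        return { detections, embedding };
    };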