WIP IPC API

Manav Rathi 2024-04-11 13:58:52 +05:30
parent 2bb9e77e34
commit a88f551b6a
6 changed files with 79 additions and 94 deletions


@@ -78,6 +78,30 @@ const faceEmbeddingSession = async () => {
    return _faceEmbeddingSession;
};

private async initOnnx() {
    console.log("start ort");
    this.onnxInferenceSession = await ort.InferenceSession.create(
        "/models/yoloface/yolov5s_face_640_640_dynamic.onnx",
    );
    // Warm up the session with a single inference on an all-zero input of
    // the expected shape.
    const data = new Float32Array(1 * 3 * 640 * 640);
    const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]);
    // TODO(MR): onnx-yolo
    // const feeds: Record<string, ort.Tensor> = {};
    const feeds: Record<string, any> = {};
    const name = this.onnxInferenceSession.inputNames[0];
    feeds[name] = inputTensor;
    await this.onnxInferenceSession.run(feeds);
    console.log("end ort");
}

private async getOnnxInferenceSession() {
    if (!this.onnxInferenceSession) {
        await this.initOnnx();
    }
    return this.onnxInferenceSession;
}
// export const clipImageEmbedding = async (jpegImageData: Uint8Array) => {
//     const tempFilePath = await generateTempFilePath("");
//     const imageStream = new Response(jpegImageData.buffer).body;
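The hunk above creates the session lazily, then immediately runs it once on an all-zero tensor so that the first real request does not pay the one-time graph initialization cost. A minimal self-contained sketch of the same warm-up pattern, assuming onnxruntime-node provides the `ort` namespace (stubbed out elsewhere in this commit):

import * as ort from "onnxruntime-node";

// Module-level cache, mirroring the lazy initialization above.
let _session: ort.InferenceSession | undefined;

const getSession = async () => {
    if (!_session) {
        _session = await ort.InferenceSession.create(
            "/models/yoloface/yolov5s_face_640_640_dynamic.onnx",
        );
        // Warm up: one inference on an all-zero input of the expected shape.
        const data = new Float32Array(1 * 3 * 640 * 640);
        const feeds: Record<string, ort.Tensor> = {};
        feeds[_session.inputNames[0]] = new ort.Tensor(
            "float32",
            data,
            [1, 3, 640, 640],
        );
        await _session.run(feeds);
    }
    return _session;
};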


@@ -203,9 +203,6 @@ export class LocalMLSyncContext implements MLSyncContext {
    }

    public async dispose() {
        // await this.faceDetectionService.dispose();
        // await this.faceEmbeddingService.dispose();
        this.localFilesMap = undefined;
        await this.syncQueue.onIdle();
        this.syncQueue.removeAllListeners();
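The ordering in this dispose matters: awaiting onIdle first lets queued and in-flight sync tasks settle, and only then are listeners detached, so no completion events are lost. A sketch of the same teardown, assuming syncQueue is a p-queue instance (consistent with the onIdle and removeAllListeners calls above):

import PQueue from "p-queue";

const syncQueue = new PQueue({ concurrency: 4 });

const dispose = async () => {
    // Resolves once the queue has no pending or running tasks.
    await syncQueue.onIdle();
    // Safe to detach listeners now; the queue will emit nothing further.
    syncQueue.removeAllListeners();
};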


@@ -96,12 +96,6 @@ class MobileFaceNetEmbeddingService implements FaceEmbeddingService {
        }
        return embeddings;
    }

    public async dispose() {
        const inferenceSession = await this.getOnnxInferenceSession();
        inferenceSession?.release();
        this.onnxInferenceSession = undefined;
    }
}

export default new MobileFaceNetEmbeddingService();
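The dispose being removed here releases the native inference session explicitly instead of waiting for garbage collection, and clears the cached field so a later call can lazily re-create it. A sketch of that pattern, with the class shape assumed for illustration:

import * as ort from "onnxruntime-node";

class EmbeddingService {
    private session?: ort.InferenceSession;

    async dispose() {
        // release() frees the native session's memory immediately.
        await this.session?.release();
        // Clearing the cache allows lazy re-initialization later.
        this.session = undefined;
    }
}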


@@ -1,4 +1,5 @@
import { MAX_FACE_DISTANCE_PERCENT } from "constants/mlConfig";
import { euclidean } from "hdbscan";
import {
    Matrix,
    applyToPoint,
@@ -21,17 +22,7 @@ import {
import { newBox } from "utils/machineLearning";
import { Box, Point } from "../../../thirdparty/face-api/classes";

// TODO(MR): onnx-yolo
// import * as ort from "onnxruntime-web";
// import { env } from "onnxruntime-web";
const ort: any = {};

// TODO(MR): onnx-yolo
// env.wasm.wasmPaths = "/js/onnx/";

class YoloFaceDetectionService implements FaceDetectionService {
    // TODO(MR): onnx-yolo
    // private onnxInferenceSession?: ort.InferenceSession;
    private onnxInferenceSession?: any;

    public method: Versioned<FaceDetectionMethod>;

    public constructor() {
@@ -41,27 +32,44 @@ class YoloFaceDetectionService implements FaceDetectionService {
        };
    }
    private async initOnnx() {
        console.log("start ort");
        this.onnxInferenceSession = await ort.InferenceSession.create(
            "/models/yoloface/yolov5s_face_640_640_dynamic.onnx",
        );
        const data = new Float32Array(1 * 3 * 640 * 640);

    public async detectFaces(
        imageBitmap: ImageBitmap,
    ): Promise<Array<FaceDetection>> {
        const maxFaceDistance = imageBitmap.width * MAX_FACE_DISTANCE_PERCENT;
        const preprocessResult =
            this.preprocessImageBitmapToFloat32ChannelsFirst(
                imageBitmap,
                640,
                640,
            );
        const data = preprocessResult.data;
        const resized = preprocessResult.newSize;
        const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]);
        // TODO(MR): onnx-yolo
        // const feeds: Record<string, ort.Tensor> = {};
        const feeds: Record<string, any> = {};
        const name = this.onnxInferenceSession.inputNames[0];
        feeds[name] = inputTensor;
        await this.onnxInferenceSession.run(feeds);
        console.log("start end");
    }

    private async getOnnxInferenceSession() {
        if (!this.onnxInferenceSession) {
            await this.initOnnx();
        }
        return this.onnxInferenceSession;
        feeds["input"] = inputTensor;
        const inferenceSession = await this.getOnnxInferenceSession();
        const runout = await inferenceSession.run(feeds);
        const outputData = runout.output.data;
        const faces = this.getFacesFromYoloOutput(
            outputData as Float32Array,
            0.7,
        );
        const inBox = newBox(0, 0, resized.width, resized.height);
        const toBox = newBox(0, 0, imageBitmap.width, imageBitmap.height);
        const transform = computeTransformToBox(inBox, toBox);
        const faceDetections: Array<FaceDetection> = faces?.map((f) => {
            const box = transformBox(f.box, transform);
            const normLandmarks = f.landmarks;
            const landmarks = transformPoints(normLandmarks, transform);
            return {
                box,
                landmarks,
                probability: f.probability as number,
            } as FaceDetection;
        });
        return removeDuplicateDetections(faceDetections, maxFaceDistance);
    }

    private preprocessImageBitmapToFloat32ChannelsFirst(
@@ -233,64 +241,10 @@ class YoloFaceDetectionService implements FaceDetectionService {
            probability: faceDetection.probability,
        };
    }

    private async estimateOnnx(imageBitmap: ImageBitmap) {
        const maxFaceDistance = imageBitmap.width * MAX_FACE_DISTANCE_PERCENT;
        const preprocessResult =
            this.preprocessImageBitmapToFloat32ChannelsFirst(
                imageBitmap,
                640,
                640,
            );
        const data = preprocessResult.data;
        const resized = preprocessResult.newSize;
        const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]);
        // TODO(MR): onnx-yolo
        // const feeds: Record<string, ort.Tensor> = {};
        const feeds: Record<string, any> = {};
        feeds["input"] = inputTensor;
        const inferenceSession = await this.getOnnxInferenceSession();
        const runout = await inferenceSession.run(feeds);
        const outputData = runout.output.data;
        const faces = this.getFacesFromYoloOutput(
            outputData as Float32Array,
            0.7,
        );
        const inBox = newBox(0, 0, resized.width, resized.height);
        const toBox = newBox(0, 0, imageBitmap.width, imageBitmap.height);
        const transform = computeTransformToBox(inBox, toBox);
        const faceDetections: Array<FaceDetection> = faces?.map((f) => {
            const box = transformBox(f.box, transform);
            const normLandmarks = f.landmarks;
            const landmarks = transformPoints(normLandmarks, transform);
            return {
                box,
                landmarks,
                probability: f.probability as number,
            } as FaceDetection;
        });
        return removeDuplicateDetections(faceDetections, maxFaceDistance);
    }

    public async detectFaces(
        imageBitmap: ImageBitmap,
    ): Promise<Array<FaceDetection>> {
        // measure time taken
        const facesFromOnnx = await this.estimateOnnx(imageBitmap);
        return facesFromOnnx;
    }

    public async dispose() {
        const inferenceSession = await this.getOnnxInferenceSession();
        inferenceSession?.release();
        this.onnxInferenceSession = undefined;
    }
}

export default new YoloFaceDetectionService();
import { euclidean } from "hdbscan";

/**
 * Removes duplicate face detections from an array of detections.
 *
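The new detectFaces above filters raw model output through getFacesFromYoloOutput with a 0.7 score threshold before transforming boxes and landmarks back to image coordinates. That method is not part of this diff; purely as an illustration, a sketch of such a filter under the common YOLO5Face layout of 16 floats per candidate (center x/y, width, height, objectness score, five landmark x/y pairs, class score) — the actual layout is specific to the exported model:

// Hypothetical shapes, for illustration only.
interface YoloFace {
    box: { x: number; y: number; width: number; height: number };
    landmarks: { x: number; y: number }[];
    probability: number;
}

const facesFromYoloOutput = (rows: Float32Array, threshold: number) => {
    const faces: YoloFace[] = [];
    const stride = 16; // 4 box + 1 score + 10 landmark + 1 class values
    for (let i = 0; i < rows.length; i += stride) {
        const probability = rows[i + 4];
        if (probability < threshold) continue;
        // YOLO reports center x/y plus width/height; convert to top-left.
        const [cx, cy, w, h] = [rows[i], rows[i + 1], rows[i + 2], rows[i + 3]];
        const landmarks: { x: number; y: number }[] = [];
        for (let j = 0; j < 5; j++)
            landmarks.push({ x: rows[i + 5 + 2 * j], y: rows[i + 6 + 2 * j] });
        faces.push({
            box: { x: cx - w / 2, y: cy - h / 2, width: w, height: h },
            landmarks,
            probability,
        });
    }
    return faces;
};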


@@ -261,13 +261,12 @@ export declare type MLIndex = "files" | "people";

export interface FaceDetectionService {
    method: Versioned<FaceDetectionMethod>;
    // init(): Promise<void>;
    detectFaces(image: ImageBitmap): Promise<Array<FaceDetection>>;
    getRelativeDetection(
        faceDetection: FaceDetection,
        imageDimensions: Dimensions,
    ): FaceDetection;
    dispose(): Promise<void>;
}
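Here detectFaces reports detections in the coordinate space of the supplied bitmap, while getRelativeDetection converts one into dimension-independent form. A sketch of that conversion, using hypothetical plain-object stand-ins for the Box and Dimensions types:

interface Dims { width: number; height: number }
interface RectBox { x: number; y: number; width: number; height: number }

// Dividing by the image dimensions yields coordinates in [0, 1] that stay
// valid when the image is later shown or processed at a different size.
const relativeBox = (box: RectBox, image: Dims): RectBox => ({
    x: box.x / image.width,
    y: box.y / image.height,
    width: box.width / image.width,
    height: box.height / image.height,
});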
export interface FaceCropService {


@@ -196,7 +196,7 @@ export interface Electron {
    // - ML

    /**
     * Compute and return a CLIP embedding of the given image.
     * Return a CLIP embedding of the given image.
     *
     * See: [Note: CLIP based magic search]
     *
@@ -207,7 +207,7 @@ export interface Electron {
    clipImageEmbedding: (jpegImageData: Uint8Array) => Promise<Float32Array>;

    /**
     * Compute and return a CLIP embedding of the given image.
     * Return a CLIP embedding of the given text.
     *
     * See: [Note: CLIP based magic search]
     *
@@ -217,6 +217,23 @@ export interface Electron {
     */
    clipTextEmbedding: (text: string) => Promise<Float32Array>;

    /**
     * Detect faces in the given image using YOLO.
     *
     * Both the input and output are opaque binary data whose internal
     * structure is specific to the model (YOLO) and to our implementation.
     * That said, {@link inputImage} is specifically a particular bitmap
     * encoding of an image.
     */
    detectFaces: (inputImage: Uint8Array) => Promise<Float32Array>;

    /**
     * Return a MobileFaceNet embedding for the given face data.
     *
     * Both the input and output are opaque binary data whose internal
     * structure is specific to the model (MobileFaceNet) and to our
     * implementation.
     */
    faceEmbedding: (input: Float32Array) => Promise<Float32Array>;

    // - File selection
    // TODO: Deprecated - use dialogs on the renderer process itself
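Putting the new ML surface together, a sketch of how the renderer might drive it end to end, assuming `electron` is the object that the preload script exposes for this interface, and glossing over the crop-and-align step that would really sit between detection and embedding:

declare const electron: Electron;

const indexFaces = async (jpegImageData: Uint8Array) => {
    // Opaque detection output; its layout is an implementation detail
    // shared between the renderer and the node side.
    const detections = await electron.detectFaces(jpegImageData);

    // Hypothetical shortcut: real code would crop and align each detected
    // face before computing its embedding.
    const faceData = Float32Array.from(detections);
    return await electron.faceEmbedding(faceData);
};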