diff --git a/web/apps/photos/src/services/face/align.ts b/web/apps/photos/src/services/face/align.ts deleted file mode 100644 index e69de29bb..000000000 diff --git a/web/apps/photos/src/services/face/blur.ts b/web/apps/photos/src/services/face/blur.ts deleted file mode 100644 index c79081297..000000000 --- a/web/apps/photos/src/services/face/blur.ts +++ /dev/null @@ -1,187 +0,0 @@ -import { Face } from "services/face/types"; -import { createGrayscaleIntMatrixFromNormalized2List } from "utils/image"; -import { mobileFaceNetFaceSize } from "./embed"; - -/** - * Laplacian blur detection. - */ -export const detectBlur = ( - alignedFaces: Float32Array, - faces: Face[], -): number[] => { - const numFaces = Math.round( - alignedFaces.length / - (mobileFaceNetFaceSize * mobileFaceNetFaceSize * 3), - ); - const blurValues: number[] = []; - for (let i = 0; i < numFaces; i++) { - const face = faces[i]; - const direction = faceDirection(face); - const faceImage = createGrayscaleIntMatrixFromNormalized2List( - alignedFaces, - i, - ); - const laplacian = applyLaplacian(faceImage, direction); - blurValues.push(matrixVariance(laplacian)); - } - return blurValues; -}; - -type FaceDirection = "left" | "right" | "straight"; - -const faceDirection = (face: Face): FaceDirection => { - const landmarks = face.detection.landmarks; - const leftEye = landmarks[0]; - const rightEye = landmarks[1]; - const nose = landmarks[2]; - const leftMouth = landmarks[3]; - const rightMouth = landmarks[4]; - - const eyeDistanceX = Math.abs(rightEye.x - leftEye.x); - const eyeDistanceY = Math.abs(rightEye.y - leftEye.y); - const mouthDistanceY = Math.abs(rightMouth.y - leftMouth.y); - - const faceIsUpright = - Math.max(leftEye.y, rightEye.y) + 0.5 * eyeDistanceY < nose.y && - nose.y + 0.5 * mouthDistanceY < Math.min(leftMouth.y, rightMouth.y); - - const noseStickingOutLeft = - nose.x < Math.min(leftEye.x, rightEye.x) && - nose.x < Math.min(leftMouth.x, rightMouth.x); - - const noseStickingOutRight = - 
nose.x > Math.max(leftEye.x, rightEye.x) && - nose.x > Math.max(leftMouth.x, rightMouth.x); - - const noseCloseToLeftEye = - Math.abs(nose.x - leftEye.x) < 0.2 * eyeDistanceX; - const noseCloseToRightEye = - Math.abs(nose.x - rightEye.x) < 0.2 * eyeDistanceX; - - if (noseStickingOutLeft || (faceIsUpright && noseCloseToLeftEye)) { - return "left"; - } else if (noseStickingOutRight || (faceIsUpright && noseCloseToRightEye)) { - return "right"; - } - - return "straight"; -}; - -/** - * Return a new image by applying a Laplacian blur kernel to each pixel. - */ -const applyLaplacian = ( - image: number[][], - direction: FaceDirection, -): number[][] => { - const paddedImage: number[][] = padImage(image, direction); - const numRows = paddedImage.length - 2; - const numCols = paddedImage[0].length - 2; - - // Create an output image initialized to 0. - const outputImage: number[][] = Array.from({ length: numRows }, () => - new Array(numCols).fill(0), - ); - - // Define the Laplacian kernel. - const kernel: number[][] = [ - [0, 1, 0], - [1, -4, 1], - [0, 1, 0], - ]; - - // Apply the kernel to each pixel - for (let i = 0; i < numRows; i++) { - for (let j = 0; j < numCols; j++) { - let sum = 0; - for (let ki = 0; ki < 3; ki++) { - for (let kj = 0; kj < 3; kj++) { - sum += paddedImage[i + ki][j + kj] * kernel[ki][kj]; - } - } - // Adjust the output value if necessary (e.g., clipping). - outputImage[i][j] = sum; - } - } - - return outputImage; -}; - -const padImage = (image: number[][], direction: FaceDirection): number[][] => { - const removeSideColumns = 56; /* must be even */ - - const numRows = image.length; - const numCols = image[0].length; - const paddedNumCols = numCols + 2 - removeSideColumns; - const paddedNumRows = numRows + 2; - - // Create a new matrix with extra padding. 
- const paddedImage: number[][] = Array.from({ length: paddedNumRows }, () => - new Array(paddedNumCols).fill(0), - ); - - if (direction === "straight") { - // Copy original image into the center of the padded image. - for (let i = 0; i < numRows; i++) { - for (let j = 0; j < paddedNumCols - 2; j++) { - paddedImage[i + 1][j + 1] = - image[i][j + Math.round(removeSideColumns / 2)]; - } - } - } else if (direction === "left") { - // If the face is facing left, we only take the right side of the face image. - for (let i = 0; i < numRows; i++) { - for (let j = 0; j < paddedNumCols - 2; j++) { - paddedImage[i + 1][j + 1] = image[i][j + removeSideColumns]; - } - } - } else if (direction === "right") { - // If the face is facing right, we only take the left side of the face image. - for (let i = 0; i < numRows; i++) { - for (let j = 0; j < paddedNumCols - 2; j++) { - paddedImage[i + 1][j + 1] = image[i][j]; - } - } - } - - // Reflect padding - // Top and bottom rows - for (let j = 1; j <= paddedNumCols - 2; j++) { - paddedImage[0][j] = paddedImage[2][j]; // Top row - paddedImage[numRows + 1][j] = paddedImage[numRows - 1][j]; // Bottom row - } - // Left and right columns - for (let i = 0; i < numRows + 2; i++) { - paddedImage[i][0] = paddedImage[i][2]; // Left column - paddedImage[i][paddedNumCols - 1] = paddedImage[i][paddedNumCols - 3]; // Right column - } - - return paddedImage; -}; - -const matrixVariance = (matrix: number[][]): number => { - const numRows = matrix.length; - const numCols = matrix[0].length; - const totalElements = numRows * numCols; - - // Calculate the mean. - let mean: number = 0; - matrix.forEach((row) => { - row.forEach((value) => { - mean += value; - }); - }); - mean /= totalElements; - - // Calculate the variance. 
- let variance: number = 0; - matrix.forEach((row) => { - row.forEach((value) => { - const diff: number = value - mean; - variance += diff * diff; - }); - }); - variance /= totalElements; - - return variance; -}; diff --git a/web/apps/photos/src/services/face/f-index.ts b/web/apps/photos/src/services/face/f-index.ts index a48704171..9a2be2cee 100644 --- a/web/apps/photos/src/services/face/f-index.ts +++ b/web/apps/photos/src/services/face/f-index.ts @@ -1,27 +1,28 @@ import { openCache } from "@/next/blob-cache"; import log from "@/next/log"; -import { faceAlignment } from "services/face/align"; +import { Matrix } from "ml-matrix"; import mlIDbStorage from "services/face/db"; import { detectFaces, getRelativeDetection } from "services/face/detect"; import { faceEmbeddings, mobileFaceNetFaceSize } from "services/face/embed"; -import { Box, enlargeBox } from "services/face/geom"; +import { Box, Point, enlargeBox } from "services/face/geom"; import { DetectedFace, Face, + FaceAlignment, FaceCrop, FaceDetection, MLSyncFileContext, - type FaceAlignment, type MlFileData, } from "services/face/types"; import { defaultMLVersion } from "services/machineLearning/machineLearningService"; +import { getSimilarityTransformation } from "similarity-transformation"; import type { EnteFile } from "types/file"; import { + createGrayscaleIntMatrixFromNormalized2List, cropWithRotation, imageBitmapToBlob, warpAffineFloat32List, } from "utils/image"; -import { detectBlur } from "./blur"; import { fetchImageBitmap, fetchImageBitmapForContext, @@ -149,6 +150,275 @@ const syncFileFaceAlignments = async ( return faceImages; }; +// TODO-ML(MR): When is this used, or is it a Blazeface leftover? 
/**
 * Four-point landmark template, used by {@link faceAlignment} whenever a
 * detection does not carry exactly five landmarks.
 *
 * Coordinates are on a {@link ARCFACE_LANDMARKS_FACE_SIZE} scale and get
 * divided by that value before being used.
 *
 * NOTE(review): presumably left eye, right eye, nose and mouth centre, in that
 * order - confirm against the detector that produces four landmarks.
 */
const ARCFACE_LANDMARKS: Array<[number, number]> = [
    [38.2946, 51.6963],
    [73.5318, 51.5014],
    [56.0252, 71.7366],
    [56.1396, 92.2848],
];

/** The scale that both landmark templates in this file are expressed in. */
const ARCFACE_LANDMARKS_FACE_SIZE = 112;

/**
 * Five-point landmark template, used by {@link faceAlignment} when a detection
 * carries exactly five landmarks.
 *
 * Entries are matched by index against the detected landmarks. Coordinates are
 * on a {@link ARCFACE_LANDMARKS_FACE_SIZE} scale.
 */
const ARC_FACE_5_LANDMARKS: Array<[number, number]> = [
    [38.2946, 51.6963],
    [73.5318, 51.5014],
    [56.0252, 71.7366],
    [41.5493, 92.3655],
    [70.7299, 92.2041],
];

/**
 * Compute and return an {@link FaceAlignment} for the given face detection.
 *
 * The five-point template is used when the detection has exactly five
 * landmarks; any other count falls back to the four-point template. The chosen
 * template is normalized by {@link ARCFACE_LANDMARKS_FACE_SIZE} first.
 *
 * @param faceDetection A geometry indicating a face detected in an image.
 */
export const faceAlignment = (faceDetection: FaceDetection): FaceAlignment => {
    const template =
        faceDetection.landmarks.length === 5
            ? ARC_FACE_5_LANDMARKS
            : ARCFACE_LANDMARKS;
    return getFaceAlignmentUsingSimilarityTransform(
        faceDetection,
        normalizeLandmarks(template, ARCFACE_LANDMARKS_FACE_SIZE),
    );
};

/**
 * Derive a {@link FaceAlignment} from the similarity transformation computed
 * between the detected landmarks and the given template landmarks.
 *
 * @param faceDetection The detection whose landmarks are to be aligned. Only
 * the first `alignedLandmarks.length` landmarks are used.
 *
 * @param alignedLandmarks The template landmark positions.
 *
 * @returns The 3x3 affine matrix (scaled rotation plus translation), and the
 * center, size and rotation derived from the same transformation.
 */
function getFaceAlignmentUsingSimilarityTransform(
    faceDetection: FaceDetection,
    alignedLandmarks: Array<[number, number]>,
): FaceAlignment {
    // Both landmark sets become 2 x N matrices: row 0 holds the x values and
    // row 1 the y values.
    const landmarksMat = new Matrix(
        faceDetection.landmarks
            .map((p) => [p.x, p.y])
            .slice(0, alignedLandmarks.length),
    ).transpose();
    const alignedLandmarksMat = new Matrix(alignedLandmarks).transpose();

    const simTransform = getSimilarityTransformation(
        landmarksMat,
        alignedLandmarksMat,
    );

    // Scaled rotation and translation, combined below into one homogeneous
    // 3x3 matrix.
    const RS = Matrix.mul(simTransform.rotation, simTransform.scale);
    const TR = simTransform.translation;

    const affineMatrix = [
        [RS.get(0, 0), RS.get(0, 1), TR.get(0, 0)],
        [RS.get(1, 0), RS.get(1, 1), TR.get(1, 0)],
        [0, 0, 1],
    ];

    const size = 1 / simTransform.scale;
    // NOTE(review): the instance `sub` / `mul` methods of ml-matrix appear to
    // modify their receiver in place - confirm. That would be harmless here
    // because `simTransform` is not used again after these lines.
    const meanTranslation = simTransform.toMean.sub(0.5).mul(size);
    const centerMat = simTransform.fromMean.sub(meanTranslation);
    const center = new Point(centerMat.get(0, 0), centerMat.get(1, 0));
    const rotation = -Math.atan2(
        simTransform.rotation.get(0, 1),
        simTransform.rotation.get(0, 0),
    );

    return {
        affineMatrix,
        center,
        size,
        rotation,
    };
}
/**
 * Return a copy of the given landmarks with both coordinates of every landmark
 * divided by `faceSize`.
 *
 * @param landmarks The (x, y) landmark positions to scale.
 * @param faceSize The value to divide each coordinate by.
 */
function normalizeLandmarks(
    landmarks: Array<[number, number]>,
    faceSize: number,
): Array<[number, number]> {
    return landmarks.map(([x, y]): [number, number] => [
        x / faceSize,
        y / faceSize,
    ]);
}

/**
 * Laplacian blur detection.
 *
 * For each aligned face, compute the variance of the Laplacian of its
 * grayscale image.
 *
 * @param alignedFaces A flat buffer of aligned faces, each occupying
 * `mobileFaceNetFaceSize * mobileFaceNetFaceSize * 3` values.
 *
 * @param faces The faces that the aligned data was produced from, in the same
 * order. Their landmarks decide which part of each face image is measured.
 *
 * @returns One blur value per aligned face.
 */
export const detectBlur = (
    alignedFaces: Float32Array,
    faces: Face[],
): number[] => {
    const numFaces = Math.round(
        alignedFaces.length /
            (mobileFaceNetFaceSize * mobileFaceNetFaceSize * 3),
    );
    const blurValues: number[] = [];
    for (let i = 0; i < numFaces; i++) {
        const face = faces[i];
        const direction = faceDirection(face);
        const faceImage = createGrayscaleIntMatrixFromNormalized2List(
            alignedFaces,
            i,
        );
        const laplacian = applyLaplacian(faceImage, direction);
        blurValues.push(matrixVariance(laplacian));
    }
    return blurValues;
};

type FaceDirection = "left" | "right" | "straight";

/**
 * Classify which way a face is turned, using its five landmarks (left eye,
 * right eye, nose, left mouth corner, right mouth corner, in that order).
 *
 * Faces with fewer than five landmarks cannot be classified and are reported
 * as "straight" (alignment in this file accepts such detections, so they must
 * not cause a crash here).
 */
const faceDirection = (face: Face): FaceDirection => {
    const landmarks = face.detection.landmarks;
    if (landmarks.length < 5) return "straight";

    const [leftEye, rightEye, nose, leftMouth, rightMouth] = landmarks;

    const eyeDistanceX = Math.abs(rightEye.x - leftEye.x);
    const eyeDistanceY = Math.abs(rightEye.y - leftEye.y);
    const mouthDistanceY = Math.abs(rightMouth.y - leftMouth.y);

    // The nose lies below both eyes and above both mouth corners, with a
    // margin of half the vertical eye (resp. mouth) distance.
    const faceIsUpright =
        Math.max(leftEye.y, rightEye.y) + 0.5 * eyeDistanceY < nose.y &&
        nose.y + 0.5 * mouthDistanceY < Math.min(leftMouth.y, rightMouth.y);

    // The nose is horizontally outside of both the eyes and the mouth corners.
    const noseStickingOutLeft =
        nose.x < Math.min(leftEye.x, rightEye.x) &&
        nose.x < Math.min(leftMouth.x, rightMouth.x);

    const noseStickingOutRight =
        nose.x > Math.max(leftEye.x, rightEye.x) &&
        nose.x > Math.max(leftMouth.x, rightMouth.x);

    // The nose is horizontally within 20% of the eye distance from an eye.
    const noseCloseToLeftEye =
        Math.abs(nose.x - leftEye.x) < 0.2 * eyeDistanceX;
    const noseCloseToRightEye =
        Math.abs(nose.x - rightEye.x) < 0.2 * eyeDistanceX;

    if (noseStickingOutLeft || (faceIsUpright && noseCloseToLeftEye)) {
        return "left";
    } else if (noseStickingOutRight || (faceIsUpright && noseCloseToRightEye)) {
        return "right";
    }

    return "straight";
};

/**
 * Return a new image by applying a Laplacian kernel to each pixel.
 *
 * The image is first cropped and padded by {@link padImage}, so the result has
 * as many rows as `image` but fewer columns.
 */
const applyLaplacian = (
    image: number[][],
    direction: FaceDirection,
): number[][] => {
    const paddedImage: number[][] = padImage(image, direction);
    const numRows = paddedImage.length - 2;
    const numCols = paddedImage[0].length - 2;

    // Create an output image initialized to 0.
    const outputImage: number[][] = Array.from({ length: numRows }, () =>
        new Array(numCols).fill(0),
    );

    // Define the Laplacian kernel.
    const kernel: number[][] = [
        [0, 1, 0],
        [1, -4, 1],
        [0, 1, 0],
    ];

    // Apply the kernel to each pixel.
    for (let i = 0; i < numRows; i++) {
        for (let j = 0; j < numCols; j++) {
            let sum = 0;
            for (let ki = 0; ki < 3; ki++) {
                for (let kj = 0; kj < 3; kj++) {
                    sum += paddedImage[i + ki][j + kj] * kernel[ki][kj];
                }
            }
            // The raw response is stored as is (no clipping or scaling).
            outputImage[i][j] = sum;
        }
    }

    return outputImage;
};

/**
 * Crop a window that is `removeSideColumns` columns narrower than `image`, and
 * surround it with a one pixel border filled in by reflection.
 *
 * Which columns are kept depends on `direction`: the middle ones for
 * "straight", the rightmost ones for "left", the leftmost ones for "right".
 *
 * @returns A matrix with `image.length + 2` rows and
 * `image[0].length + 2 - removeSideColumns` columns.
 */
const padImage = (image: number[][], direction: FaceDirection): number[][] => {
    const removeSideColumns = 56; /* must be even */

    const numRows = image.length;
    const numCols = image[0].length;
    const paddedNumCols = numCols + 2 - removeSideColumns;
    const paddedNumRows = numRows + 2;

    // Create a new matrix with extra padding.
    const paddedImage: number[][] = Array.from({ length: paddedNumRows }, () =>
        new Array(paddedNumCols).fill(0),
    );

    // Index of the first source column to keep.
    const columnOffsets: Record<FaceDirection, number> = {
        // Keep the center of the image.
        straight: Math.round(removeSideColumns / 2),
        // If the face is facing left, we only take the right side of the face
        // image.
        left: removeSideColumns,
        // If the face is facing right, we only take the left side of the face
        // image.
        right: 0,
    };
    const columnOffset = columnOffsets[direction];

    // Copy the kept window into the interior of the padded image.
    for (let i = 0; i < numRows; i++) {
        for (let j = 0; j < paddedNumCols - 2; j++) {
            paddedImage[i + 1][j + 1] = image[i][j + columnOffset];
        }
    }

    // Reflect padding
    // Top and bottom rows
    for (let j = 1; j <= paddedNumCols - 2; j++) {
        paddedImage[0][j] = paddedImage[2][j]; // Top row
        paddedImage[numRows + 1][j] = paddedImage[numRows - 1][j]; // Bottom row
    }
    // Left and right columns (this also fills in the four corners)
    for (let i = 0; i < numRows + 2; i++) {
        paddedImage[i][0] = paddedImage[i][2]; // Left column
        paddedImage[i][paddedNumCols - 1] = paddedImage[i][paddedNumCols - 3]; // Right column
    }

    return paddedImage;
};

/**
 * Return the population variance of all the values in the given matrix.
 *
 * The number of columns is taken from the first row. A matrix without any
 * elements yields 0.
 */
const matrixVariance = (matrix: number[][]): number => {
    const numRows = matrix.length;
    const numCols = numRows > 0 ? matrix[0].length : 0;
    const totalElements = numRows * numCols;
    // Avoid dividing by zero (and reading a missing first row).
    if (totalElements === 0) return 0;

    // Calculate the mean.
    let mean = 0;
    for (const row of matrix) {
        for (const value of row) {
            mean += value;
        }
    }
    mean /= totalElements;

    // Calculate the variance.
    let variance = 0;
    for (const row of matrix) {
        for (const value of row) {
            const diff = value - mean;
            variance += diff * diff;
        }
    }
    variance /= totalElements;

    return variance;
};

const syncFileFaceEmbeddings = async (
    fileContext: MLSyncFileContext,
    alignedFacesInput: Float32Array,