dedupe by file hash

Rushikesh Tote 2022-05-19 13:01:04 +05:30
parent 12d341b1be
commit dad5e57e11
5 changed files with 113 additions and 14 deletions


@@ -17,6 +17,12 @@ const VerticalLine = styled.div`
background: #303030;
`;
const CheckboxText = styled.div`
margin-left: 0.5em;
font-size: 16px;
margin-right: 0.8em;
`;
interface IProps {
deleteFileHelper: () => void;
setDialogMessage: SetDialogMessage;
@@ -55,7 +61,27 @@ export default function DeduplicateOptions({
{count} {constants.SELECTED}
</div>
</SelectionContainer>
<input
type="checkbox"
style={{
width: '1em',
height: '1em',
}}
value={
deduplicateContext.clubSameFileHashesOnly ? 'true' : 'false'
}
onChange={() => {
deduplicateContext.setClubSameFileHashesOnly(
!deduplicateContext.clubSameFileHashesOnly
);
}}></input>
<CheckboxText>{constants.CLUB_BY_FILE_HASH}</CheckboxText>
<div
style={{
marginRight: '14px',
}}>
<VerticalLine />
</div>
<input
type="checkbox"
style={{
@@ -70,14 +96,7 @@ export default function DeduplicateOptions({
!deduplicateContext.clubSameTimeFilesOnly
);
}}></input>
<div
style={{
marginLeft: '0.5em',
fontSize: '16px',
marginRight: '0.8em',
}}>
{constants.CLUB_BY_CAPTURE_TIME}
</div>
<CheckboxText>{constants.CLUB_BY_CAPTURE_TIME}</CheckboxText>
<div>
<VerticalLine />
</div>
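
A side note on the new checkbox above: React treats 'checked', not 'value', as the controlled state of a checkbox, so the box's visual state is not actually bound to clubSameFileHashesOnly (the native toggle just happens to keep them in sync). A minimal controlled version of the same toggle would look like this sketch:

<input
    type="checkbox"
    style={{ width: '1em', height: '1em' }}
    checked={deduplicateContext.clubSameFileHashesOnly}
    onChange={() =>
        deduplicateContext.setClubSameFileHashesOnly(
            !deduplicateContext.clubSameFileHashesOnly
        )
    }
/>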


@@ -6,6 +6,7 @@ import React, { createContext, useContext, useEffect, useState } from 'react';
import {
getDuplicateFiles,
clubDuplicatesByTime,
clubDuplicatesBySameFileHashes,
} from 'services/deduplicationService';
import { syncFiles, trashFiles } from 'services/fileService';
import { EnteFile } from 'types/file';
@@ -43,6 +44,7 @@ export default function Deduplicate() {
} = useContext(AppContext);
const [duplicateFiles, setDuplicateFiles] = useState<EnteFile[]>(null);
const [clubSameTimeFilesOnly, setClubSameTimeFilesOnly] = useState(false);
const [clubSameFileHashesOnly, setClubSameFileHashesOnly] = useState(false);
const [fileSizeMap, setFileSizeMap] = useState(new Map<number, number>());
const [collectionNameMap, setCollectionNameMap] = useState(
new Map<number, string>()
@@ -67,7 +69,7 @@ export default function Deduplicate() {
useEffect(() => {
syncWithRemote();
}, [clubSameTimeFilesOnly]);
}, [clubSameTimeFilesOnly, clubSameFileHashesOnly]);
const syncWithRemote = async () => {
startLoading();
@@ -79,9 +81,15 @@
setCollectionNameMap(collectionNameMap);
const files = await syncFiles(collections, () => null);
let duplicates = await getDuplicateFiles(files, collectionNameMap);
if (clubSameTimeFilesOnly) {
duplicates = clubDuplicatesByTime(duplicates);
}
if (clubSameFileHashesOnly) {
duplicates = clubDuplicatesBySameFileHashes(duplicates);
}
const currFileSizeMap = new Map<number, number>();
let allDuplicateFiles: EnteFile[] = [];
let toSelectFileIDs: number[] = [];
@@ -149,13 +157,16 @@
collectionNameMap,
clubSameTimeFilesOnly,
setClubSameTimeFilesOnly,
clubSameFileHashesOnly,
setClubSameFileHashesOnly,
fileSizeMap,
isOnDeduplicatePage: true,
}}>
{duplicateFiles.length > 0 && (
<Info>
{constants.DEDUPLICATION_LOGIC_MESSAGE(
clubSameTimeFilesOnly
clubSameTimeFilesOnly,
clubSameFileHashesOnly
)}
</Info>
)}
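
Note that when both checkboxes are ticked the two passes compose: clubDuplicatesByTime first splits each size-based group by capture time, and clubDuplicatesBySameFileHashes then splits the surviving groups again, so only files agreeing on size, capture time and file hash stay grouped. Condensed, with the same names as in syncWithRemote above:

let duplicates = await getDuplicateFiles(files, collectionNameMap);
if (clubSameTimeFilesOnly) {
    // split each size-based group by capture time
    duplicates = clubDuplicatesByTime(duplicates);
}
if (clubSameFileHashesOnly) {
    // then split the surviving groups by file hash
    duplicates = clubDuplicatesBySameFileHashes(duplicates);
}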


@@ -2,6 +2,7 @@ import { EnteFile } from 'types/file';
import { getEndpoint } from 'utils/common/apiUtil';
import { getToken } from 'utils/common/key';
import { logError } from 'utils/sentry';
import { areFilesWithFileHashSame, fileHashExists } from 'utils/upload';
import HTTPService from './HTTPService';
const ENDPOINT = getEndpoint();
@@ -113,6 +114,64 @@ export function clubDuplicatesByTime(dupes: DuplicateFiles[]) {
return result;
}
export function clubDuplicatesBySameFileHashes(dupes: DuplicateFiles[]) {
const result: DuplicateFiles[] = [];
for (const dupe of dupes) {
let files: EnteFile[] = [];
const filteredFiles = dupe.files.filter((file) => {
return fileHashExists(file.metadata);
});
if (filteredFiles.length <= 1) {
continue;
}
const dupesSortedByFileHash = filteredFiles.map((file) => {
return {
file,
hash:
file.metadata.hash ??
`${file.metadata.imageHash}_${file.metadata.videoHash}`,
};
});
dupesSortedByFileHash.sort((firstFile, secondFile) => {
return firstFile.hash.localeCompare(secondFile.hash);
});
files.push(dupesSortedByFileHash[0].file);
for (let i = 1; i < dupesSortedByFileHash.length; i++) {
if (
areFilesWithFileHashSame(
dupesSortedByFileHash[i - 1].file.metadata,
dupesSortedByFileHash[i].file.metadata
)
) {
files.push(dupesSortedByFileHash[i].file);
} else {
if (files.length > 1) {
result.push({
files: [...files],
size: dupe.size,
});
}
files = [dupesSortedByFileHash[i].file];
}
}
if (files.length > 1) {
result.push({
files,
size: dupe.size,
});
}
}
return result;
}
async function fetchDuplicateFileIDs() {
try {
const response = await HTTPService.get(
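
The two predicates imported from utils/upload are not part of this diff. A minimal sketch of what they are assumed to do, consistent with the sort key above falling back from metadata.hash to the imageHash/videoHash pair (presumably live photos); the real implementations may differ:

// Sketch only; the field and function shapes here are assumptions.
interface FileHashMetadata {
    hash?: string;
    imageHash?: string;
    videoHash?: string;
}

// A file can join hash clubbing only if some hash was recorded for it.
export function fileHashExists(metadata: FileHashMetadata) {
    return (
        metadata.hash !== undefined ||
        (metadata.imageHash !== undefined &&
            metadata.videoHash !== undefined)
    );
}

// Two files count as the same when their effective hashes match.
export function areFilesWithFileHashSame(
    first: FileHashMetadata,
    second: FileHashMetadata
) {
    if (first.hash !== undefined || second.hash !== undefined) {
        return first.hash === second.hash;
    }
    return (
        first.imageHash === second.imageHash &&
        first.videoHash === second.videoHash
    );
}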


@@ -1,6 +1,8 @@
export type DeduplicateContextType = {
clubSameTimeFilesOnly: boolean;
setClubSameTimeFilesOnly: (clubSameTimeFilesOnly: boolean) => void;
clubSameFileHashesOnly: boolean;
setClubSameFileHashesOnly: (clubSameFileHashes: boolean) => void;
fileSizeMap: Map<number, number>;
isOnDeduplicatePage: boolean;
collectionNameMap: Map<number, string>;
@@ -9,6 +11,8 @@ export type DeduplicateContextType = {
export const DefaultDeduplicateContext = {
clubSameTimeFilesOnly: false,
setClubSameTimeFilesOnly: () => null,
clubSameFileHashesOnly: false,
setClubSameFileHashesOnly: () => null,
fileSizeMap: new Map<number, number>(),
isOnDeduplicatePage: false,
collectionNameMap: new Map<number, string>(),
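
The context object itself is also outside this diff. Presumably it is created from these defaults roughly as in the sketch below (the DeduplicateContext name and wiring are assumptions), which matches how DeduplicateOptions reads deduplicateContext above:

import { createContext, useContext } from 'react';

// Assumed wiring; the actual context creation lives elsewhere in the app.
export const DeduplicateContext = createContext<DeduplicateContextType>(
    DefaultDeduplicateContext
);

// Any component under the matching provider can then read the new flag:
function HashClubbingStatus() {
    const { clubSameFileHashesOnly } = useContext(DeduplicateContext);
    return (
        <span>
            {clubSameFileHashesOnly
                ? 'clubbing by file hash'
                : 'not clubbing by file hash'}
        </span>
    );
}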


@@ -721,13 +721,19 @@ const englishConstants = {
DEDUPLICATE_FILES: 'deduplicate files',
NO_DUPLICATES_FOUND: "you've no duplicate files that can be cleared",
CLUB_BY_CAPTURE_TIME: 'club by capture time',
CLUB_BY_FILE_HASH: 'club by file hashes',
FILES: 'files',
EACH: 'each',
DEDUPLICATION_LOGIC_MESSAGE: (captureTime: boolean) => (
DEDUPLICATION_LOGIC_MESSAGE: (
captureTime: boolean,
fileHashes: boolean
) => (
<>
the following files were clubbed based on their sizes
{captureTime && ` and capture time`}, please review and delete items
you believe are duplicates{' '}
{captureTime && !fileHashes && ' and capture time'}
{fileHashes && !captureTime && ' and file hashes'}
{fileHashes && captureTime && ', capture time and file hashes'},
please review and delete items you believe are duplicates{' '}
</>
),
STOP_ALL_UPLOADS_MESSAGE: