diff --git a/.eslintignore b/.eslintignore
index 36c33a59..1e9dfc58 100644
--- a/.eslintignore
+++ b/.eslintignore
@@ -1,3 +1,3 @@
src/core/lib/**
!src/core/lib/Magic.js
-src/core/config/MetaConfig.js
\ No newline at end of file
+src/core/config/MetaConfig.js
diff --git a/package-lock.json b/package-lock.json
index 4bcf071c..a008ca2b 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1404,6 +1404,14 @@
"supports-color": "2.0.0"
}
},
+ "chi-squared": {
+ "version": "1.1.0",
+ "resolved": "https://registry.npmjs.org/chi-squared/-/chi-squared-1.1.0.tgz",
+ "integrity": "sha1-iShlz/qOCnIPkhv8nGNcGawqNG0=",
+ "requires": {
+ "gamma": "1.0.0"
+ }
+ },
"chokidar": {
"version": "1.7.0",
"resolved": "https://registry.npmjs.org/chokidar/-/chokidar-1.7.0.tgz",
@@ -4255,6 +4263,11 @@
"integrity": "sha1-GwqzvVU7Kg1jmdKcDj6gslIHgyc=",
"dev": true
},
+ "gamma": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/gamma/-/gamma-1.0.0.tgz",
+ "integrity": "sha1-mDwck5/iPZMnAVhXEeHZpDDLdMs="
+ },
"get-caller-file": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.2.tgz",
diff --git a/package.json b/package.json
index 46450b06..b71ff92d 100644
--- a/package.json
+++ b/package.json
@@ -72,6 +72,7 @@
"bootstrap": "^3.3.7",
"bootstrap-colorpicker": "^2.5.2",
"bootstrap-switch": "^3.3.4",
+ "chi-squared": "^1.1.0",
"crypto-api": "^0.7.5",
"crypto-js": "^3.1.9-1",
"diff": "^3.4.0",
diff --git a/src/core/FlowControl.js b/src/core/FlowControl.js
index b9eff7f0..059cdebc 100755
--- a/src/core/FlowControl.js
+++ b/src/core/FlowControl.js
@@ -278,8 +278,7 @@ const FlowControl = {
Recipe (click to load) |
Data snippet |
- Most likely language\n(lower scores are better) |
- File type |
+ Properties |
`;
options.forEach(option => {
@@ -290,20 +289,25 @@ const FlowControl = {
.concat(currentRecipeConfig.slice(state.progress + 1)),
recipeURL = "recipe=" + Utils.encodeURIFragment(Utils.generatePrettyRecipe(recipeConfig));
- const language = option.languageScores[0];
- let fileType = "Unknown";
+ const bestLanguage = option.languageScores[0];
+ let language = "Unknown",
+ fileType = "Unknown";
+
+ if (bestLanguage.probability > 0.00005) {
+ language = Magic.codeToLanguage(bestLanguage.lang) + " " +
+ (bestLanguage.probability * 100).toFixed(2) + "%";
+ }
if (option.fileType) {
- fileType = `Extension: ${option.fileType.ext}\nMime type: ${option.fileType.mime}`;
- if (option.fileType.desc)
- fileType += `\nDescription: ${option.fileType.desc}`;
+ fileType = `${option.fileType.mime} (${option.fileType.ext})`;
}
output += `
${Utils.generatePrettyRecipe(option.recipe, true)} |
${Utils.escapeHtml(Utils.printable(Utils.truncate(option.data, 99)))} |
- ${Magic.codeToLanguage(language.lang)}\nScore: ${language.chiSqr.toFixed()} |
- ${fileType} |
+ Language: ${language}
+File type: ${fileType}
+Valid UTF8: ${option.isUTF8} |
`;
});
diff --git a/src/core/lib/Magic.js b/src/core/lib/Magic.js
index 2e29cd0b..b93ad2ec 100644
--- a/src/core/lib/Magic.js
+++ b/src/core/lib/Magic.js
@@ -3,6 +3,7 @@ import Utils from "../Utils.js";
import Recipe from "../Recipe.js";
import Dish from "../Dish.js";
import FileType from "../operations/FileType.js";
+import chiSquared from "chi-squared";
/**
@@ -19,11 +20,12 @@ class Magic {
* Magic constructor.
*
* @param {ArrayBuffer} buf
+ * @param {Object[]} [opPatterns] - Operation patterns to reuse; generated via Magic._generateOpPatterns() when omitted
*/
- constructor(buf) {
+ constructor(buf, opPatterns) {
this.inputBuffer = new Uint8Array(buf);
this.inputStr = Utils.arrayBufferToStr(buf);
- this.opPatterns = Magic._generateOpPatterns();
+ this.opPatterns = opPatterns || Magic._generateOpPatterns(); // only generate the patterns when the caller did not supply any
}
/**
@@ -58,15 +60,17 @@ class Magic {
let chiSqrs = [];
for (let lang in LANG_FREQS) {
+ let [score, prob] = Magic._chiSqr(inputFreq, LANG_FREQS[lang]);
chiSqrs.push({
lang: lang,
- chiSqr: Magic._chiSqr(inputFreq, LANG_FREQS[lang])
+ score: score,
+ probability: prob
});
}
// Sort results so that the most likely match is at the top
chiSqrs.sort((a, b) => {
- return a.chiSqr - b.chiSqr;
+ return a.score - b.score;
});
return chiSqrs;
@@ -84,6 +88,81 @@ class Magic {
return FileType.magicType(this.inputBuffer);
}
+ /**
+ * Detects whether the input buffer is valid UTF8 by walking it one encoded character at a time.
+ * Single bytes are limited to tab, LF, CR and 0x20-0x7E; an empty buffer is reported as valid.
+ * @returns {boolean} - false as soon as a byte sequence matches none of the accepted forms
+ */
+ isUTF8() {
+ const bytes = new Uint8Array(this.inputBuffer); // NOTE(review): inputBuffer is already a Uint8Array (set in the constructor), so this makes a copy
+ let i = 0;
+ while (i < bytes.length) {
+ if (( // ASCII: tab, LF, CR and the printable range only
+ bytes[i] === 0x09 ||
+ bytes[i] === 0x0A ||
+ bytes[i] === 0x0D ||
+ (0x20 <= bytes[i] && bytes[i] <= 0x7E)
+ )) {
+ i += 1;
+ continue;
+ }
+
+ if (( // non-overlong 2-byte: lead 0xC2-0xDF plus one continuation byte
+ (0xC2 <= bytes[i] && bytes[i] <= 0xDF) &&
+ (0x80 <= bytes[i+1] && bytes[i+1] <= 0xBF) // an index past the end reads undefined, which fails the range check
+ )) {
+ i += 2;
+ continue;
+ }
+
+ if (( // 3-byte with lead 0xE0: second byte starts at 0xA0, excluding overlongs
+ bytes[i] === 0xE0 &&
+ (0xA0 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
+ (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF)
+ ) ||
+ ( // straight 3-byte: leads 0xE1-0xEC, 0xEE, 0xEF with two continuation bytes
+ ((0xE1 <= bytes[i] && bytes[i] <= 0xEC) ||
+ bytes[i] === 0xEE ||
+ bytes[i] === 0xEF) &&
+ (0x80 <= bytes[i + 1] && bytes[i+1] <= 0xBF) &&
+ (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
+ ) ||
+ ( // 3-byte with lead 0xED: second byte capped at 0x9F, excluding surrogates
+ bytes[i] === 0xED &&
+ (0x80 <= bytes[i+1] && bytes[i+1] <= 0x9F) &&
+ (0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
+ )) {
+ i += 3;
+ continue;
+ }
+
+ if (( // planes 1-3: lead 0xF0, second byte starts at 0x90
+ bytes[i] === 0xF0 &&
+ (0x90 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
+ (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
+ (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
+ ) ||
+ ( // planes 4-15: leads 0xF1-0xF3 with three continuation bytes
+ (0xF1 <= bytes[i] && bytes[i] <= 0xF3) &&
+ (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
+ (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
+ (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
+ ) ||
+ ( // plane 16: lead 0xF4, second byte capped at 0x8F
+ bytes[i] === 0xF4 &&
+ (0x80 <= bytes[i + 1] && bytes[i + 1] <= 0x8F) &&
+ (0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
+ (0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
+ )) {
+ i += 4;
+ continue;
+ }
+
+ return false; // the bytes at offset i match none of the accepted sequences
+ }
+
+ return true;
+ }
/**
* Speculatively executes matching operations, recording metadata of each result.
@@ -103,6 +182,7 @@ class Magic {
data: this.inputStr.slice(0, 100),
languageScores: this.detectLanguage(),
fileType: this.detectFileType(),
+ isUTF8: this.isUTF8()
});
// Find any operations that can be run on this data
@@ -122,7 +202,7 @@ class Magic {
const recipe = new Recipe([opConfig]);
await recipe.execute(dish, 0);
- const magic = new Magic(dish.get(Dish.ARRAY_BUFFER)),
+ const magic = new Magic(dish.get(Dish.ARRAY_BUFFER), this.opPatterns),
speculativeResults = await magic.speculativeExecution(depth-1, [...recipeConfig, opConfig]);
results = results.concat(speculativeResults);
@@ -131,13 +211,17 @@ class Magic {
// Return a sorted list of possible recipes along with their properties
return results.sort((a, b) => {
// Each option is sorted based on its most likely language (lower is better)
- let aScore = a.languageScores[0].chiSqr,
- bScore = b.languageScores[0].chiSqr;
+ let aScore = a.languageScores[0].score,
+ bScore = b.languageScores[0].score;
// If a recipe results in a file being detected, it receives a relatively good score
if (a.fileType) aScore = 500;
if (b.fileType) bScore = 500;
+ // If the result is valid UTF8, its score gets boosted (lower being better)
+ if (a.isUTF8) aScore -= 100;
+ if (b.isUTF8) bScore -= 100;
+
return aScore - bScore;
});
}
@@ -194,19 +278,24 @@ class Magic {
* https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
*
* @private
- * @param {number[]} observed
- * @param {number[]} expected
- * @returns {number}
+ * @param {number[]} observed
+ * @param {number[]} expected
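+ * @param {number} [ddof=0] - Delta degrees of freedom, subtracted from (observed.length - 1)
+ * @returns {number[]} - [score, probability], where probability is 1 - chiSquared.cdf(score, degrees of freedom)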
*/
- static _chiSqr(observed, expected) {
+ static _chiSqr(observed, expected, ddof=0) {
let tmp,
- res = 0;
+ score = 0;
for (let i = 0; i < observed.length; i++) {
tmp = observed[i] - expected[i];
- res += tmp * tmp / expected[i];
+ score += tmp * tmp / expected[i]; // squared difference scaled by the expected value
}
- return res;
+
+ return [
+ score,
+ 1 - chiSquared.cdf(score, observed.length - 1 - ddof) // complement of the CDF at score; degrees of freedom = observed.length - 1 - ddof
+ ];
}
/**