Replaced my homebrew keyword generator

It now uses the one found here: https://github.com/Donatello-za/rake-php-plus
This is much better than the one I had hacked together.

This makes AntCMS a bit bigger, but not by too much. I may end up removing the keyword generator outright, but for now I'm going to keep it.
This commit is contained in:
Belle Aerni 2023-01-14 00:49:21 -08:00
parent 91395db9c4
commit 3353be4920
3 changed files with 66 additions and 38 deletions

View file

@ -14,7 +14,8 @@
"league/commonmark": "^2.3",
"elgigi/commonmark-emoji": "^2.0",
"twig/twig": "^3.5",
"shapecode/twig-string-loader": "^1.1"
"shapecode/twig-string-loader": "^1.1",
"donatello-za/rake-php-plus": "^1.0"
},
"authors": [
{

62
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "cc1e950196a545bb666399f2d1eab986",
"content-hash": "4c1c73dcfd1b9e69aa3b18324c390ac3",
"packages": [
{
"name": "dflydev/dot-access-data",
@ -81,6 +81,66 @@
},
"time": "2022-10-27T11:44:00+00:00"
},
{
"name": "donatello-za/rake-php-plus",
"version": "v1.0.18",
"source": {
"type": "git",
"url": "https://github.com/Donatello-za/rake-php-plus.git",
"reference": "e9e9c0862b3dc953d288e8f42c76e4ceaeca0619"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/Donatello-za/rake-php-plus/zipball/e9e9c0862b3dc953d288e8f42c76e4ceaeca0619",
"reference": "e9e9c0862b3dc953d288e8f42c76e4ceaeca0619",
"shasum": ""
},
"require": {
"ext-json": "*",
"ext-mbstring": "*",
"php": ">=5.4.0"
},
"require-dev": {
"php": ">=5.5.0",
"phpunit/phpunit": "~4.0|~5.0"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "1.0.13-dev"
}
},
"autoload": {
"psr-4": {
"DonatelloZa\\RakePlus\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Don Schoeman",
"email": "ta.maximus@gmail.com"
}
],
"description": "Yet another PHP implementation of the Rapid Automatic Keyword Extraction algorithm (RAKE).",
"homepage": "https://github.com/Donatello-za/rake-php-plus",
"keywords": [
"Algorithm",
"automatic",
"extraction",
"keyword",
"rake",
"rapid"
],
"support": {
"issues": "https://github.com/Donatello-za/rake-php-plus/issues",
"source": "https://github.com/Donatello-za/rake-php-plus"
},
"time": "2022-02-23T18:42:03+00:00"
},
{
"name": "elgigi/commonmark-emoji",
"version": "2.0.0",

View file

@ -4,6 +4,7 @@ namespace AntCMS;
use AntCMS\AntCache;
use AntCMS\AntConfig;
use DonatelloZa\RakePlus\RakePlus;
class AntKeywords
{
@ -29,42 +30,8 @@ class AntKeywords
}
}
// A bunch of characters we don't want to use for keyword generation
$stopWords = array(' a ', ' an ', ' and ', ' are ', ' as ', ' at ', ' be ', ' by ', ' for ', ' from ', ' has ', ' have ', ' in ', ' is ', ' it ', ' its ', ' of ', ' on ', ' that ', ' the ', ' to ', ' was ', ' were ', ' will ', ' with ');
$symbols = array('$', '€', '£', '¥', 'CHF', '₹', '+', '-', '×', '÷', '=', '>', '<', '.', ',', ';', ':', '!', '?', '"', '\'', '(', ')', '[', ']', '{', '}', '©', '™', '°', '§', '¶', '•', '_', '/');
$markdownSymbols = array('#', '##', '###', '####', '#####', '~~', '__', '**', '`', '``', '```', '*', '+', '>', '[', ']', '(', ')', '!', '&', '|');
$numbers = array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9');
$commonPronouns = array('he', 'him', 'his', 'she', 'her', 'hers', 'they', 'them', 'theirs');
//Strip the aforementioned characters away
$content = strtolower($content);
$content = str_replace($stopWords, ' ', $content);
$content = str_replace($symbols, ' ', $content);
$content = str_replace($markdownSymbols, ' ', $content);
$content = str_replace($numbers, ' ', $content);
$content = str_replace($commonPronouns, ' ', $content);
//Convert to an array
$words = explode(' ', $content);
// Remove newlines
$words = array_map(function ($key) {
return preg_replace('~[\r\n]+~', ' ', $key);
}, $words);
// Handle potentially empty keys
$words = array_filter($words);
// Then finally we count and sort the keywords, returning the top ones
$word_counts = array_count_values($words);
arsort($word_counts);
$count = (count($word_counts) < $count) ? count($word_counts) : $count;
$keywords = array_slice(array_keys($word_counts), 0, $count);
$keywords = implode(', ', $keywords);
$keywords = mb_substr($keywords, 3);
$keywords = RakePlus::create($content, 'en_US', $count)->keywords();
$keywords = implode(",", $keywords);
$cache->setCache($cacheKey, $keywords);
return $keywords;
}