Backend: Add list of small words #361

Signed-off-by: Michael Mayer <michael@liquidbytes.net>
This commit is contained in:
Michael Mayer 2020-06-24 07:38:08 +02:00
parent 5b312cc1b5
commit 4976788c5b
8 changed files with 2750 additions and 118 deletions

View file

@ -10,117 +10,6 @@ import (
// FileTitleRegexp matches runs of two or more characters, each of which is a Unicode letter, dash, comma, apostrophe, or colon.
var FileTitleRegexp = regexp.MustCompile("[\\p{L}\\-,':]{2,}")
// SpecialWords maps lower-case words to their preferred spelling, for example
// all-caps abbreviations, brand names with internal capitals, surnames with
// prefixes, and place names with their umlauts restored.
//
// Values must not carry leading or trailing whitespace, since they are meant to
// replace the looked-up word verbatim.
var SpecialWords = map[string]string{
	// Abbreviations written in all caps.
	"nyc":  "NYC",
	"ny":   "NY",
	"uae":  "UAE",
	"usa":  "USA",
	"amd":  "AMD",
	"tiff": "TIFF",
	"ibm":  "IBM",
	"usd":  "USD",
	"gbp":  "GBP",
	"chf":  "CHF",
	"ceo":  "CEO",
	"cto":  "CTO",
	"cfo":  "CFO",
	"cia":  "CIA",
	"fbi":  "FBI",
	"bnd":  "BND",
	"fsb":  "FSB",
	"nsa":  "NSA",
	"lax":  "LAX",
	"sfx":  "SFX",
	"ber":  "BER",
	"sfo":  "SFO",
	"lh":   "LH",
	"lhr":  "LHR",
	"afl":  "AFL",
	"nrl":  "NRL",
	"nsw":  "NSW",
	"qld":  "QLD",
	"vic":  "VIC",

	// Product and brand names with internal capitals.
	"iphone":     "iPhone",
	"imac":       "iMac",
	"ipad":       "iPad",
	"ipod":       "iPod",
	"macbook":    "MacBook",
	"airplay":    "AirPlay",
	"airpods":    "AirPods",
	"youtube":    "YouTube",
	"photoprism": "PhotoPrism",

	// Names with a capitalized prefix or an apostrophe.
	"macgyver":   "MacGyver",
	"o'brien":    "O'Brien",
	"mcgregor":   "McGregor",
	"mcdonald":   "McDonald",
	"mcdonalds":  "McDonald's",
	"mcdonald's": "McDonald's",
	"macalister": "MacAlister",
	"mcalister":  "McAlister",
	"mcallister": "McAllister",
	"macauley":   "MacAuley",
	"mccauley":   "McCauley",
	"mcawley":    "McAwley",
	"macauliffe": "MacAuliffe",
	"macbride":   "MacBride",
	"mcbride":    "McBride",
	"maccabe":    "MacCabe",
	"mccabe":     "McCabe",
	"maccann":    "MacCann",
	"mccann":     "McCann",
	"maccarthy":  "MacCarthy",
	"mccarthy":   "McCarthy",
	"maccormack": "MacCormack",
	"mccormick":  "McCormick",
	"maccullagh": "MacCullagh",
	"macnully":   "MacNully",
	"mackenna":   "MacKenna",
	"macnamara":  "MacNamara",
	"mcnamara":   "McNamara",

	// Transliterated or unaccented spellings mapped back to their umlaut form.
	"gelaende":           "Gelände",
	"schwaebisch":        "Schwäbisch",
	"schwaebische":       "Schwäbische",
	"aegypten":           "Ägypten",
	"muenchen":           "München",
	"wuerttemberg":       "Württemberg",
	"baden-wuerttemberg": "Baden-Württemberg",
	"nuernberg":          "Nürnberg",
	"wuerzburg":          "Würzburg",
	"tubingen":           "Tübingen",
	"tuebingen":          "Tübingen",
	"koeln":              "Köln",
	"oesterreich":        "Österreich",
	"woerthersee":        "Wörthersee",
	"oeland":             "Öland",
	"schoenefeld":        "Schönefeld",
	"duesseldorf":        "Düsseldorf",
	"dusseldorf":         "Düsseldorf",
	"saarbrucken":        "Saarbrücken",
	"saarbruecken":       "Saarbrücken",
	"zuerich":            "Zürich",
}
// SmallWords is a lookup set of short English words such as articles,
// conjunctions, and prepositions. Every key is lower case and maps to true.
var SmallWords = map[string]bool{
	"a":    true,
	"an":   true,
	"and":  true,
	"as":   true,
	"at":   true,
	"but":  true,
	"by":   true,
	"for":  true,
	"from": true,
	"in":   true,
	"nor":  true,
	"of":   true,
	"on":   true,
	"or":   true,
	"the":  true,
	"to":   true,
	"up":   true,
	"with": true,
}
// isSeparator reports whether the rune could mark a word boundary.
func isSeparator(r rune) bool {
// ASCII alphanumerics and underscore are not separators
@ -215,7 +104,11 @@ func TitleFromFileName(s string) string {
continue
}
if _, ok := Stopwords[w]; ok && found == 0 {
if _, ok := StopWords[w]; ok && found == 0 {
continue
}
if UnknownWord(w) {
continue
}

View file

@ -50,8 +50,8 @@ func main() {
var packageTemplate = template.Must(template.New("").Parse(`// Code generated by go generate; DO NOT EDIT.
package txt
// Stopwords contains a list of stopwords for full-text indexing.
var Stopwords = map[string]bool{
// StopWords contains a list of stopwords for full-text indexing.
var StopWords = map[string]bool{
{{- range .Words }}
{{ printf "%q" . }}: true,
{{- end }}

2588
pkg/txt/shortwords.go Normal file

File diff suppressed because it is too large Load diff

22
pkg/txt/smallwords.go Normal file
View file

@ -0,0 +1,22 @@
package txt
// SmallWords is a lookup set of short English words such as articles,
// conjunctions, and prepositions. Every key is lower case and maps to true.
var SmallWords = map[string]bool{
	"a":    true,
	"an":   true,
	"and":  true,
	"as":   true,
	"at":   true,
	"but":  true,
	"by":   true,
	"for":  true,
	"from": true,
	"in":   true,
	"nor":  true,
	"of":   true,
	"on":   true,
	"or":   true,
	"the":  true,
	"to":   true,
	"up":   true,
	"with": true,
}

91
pkg/txt/specialwords.go Normal file
View file

@ -0,0 +1,91 @@
package txt
// SpecialWords maps lower-case words to their preferred spelling, for example
// all-caps abbreviations, brand names with internal capitals, surnames with
// prefixes, and place names with their umlauts restored.
//
// Values must not carry leading or trailing whitespace, since they are meant to
// replace the looked-up word verbatim.
var SpecialWords = map[string]string{
	// Abbreviations written in all caps.
	"nyc":  "NYC",
	"ny":   "NY",
	"uae":  "UAE",
	"usa":  "USA",
	"amd":  "AMD",
	"tiff": "TIFF",
	"ibm":  "IBM",
	"usd":  "USD",
	"gbp":  "GBP",
	"chf":  "CHF",
	"ceo":  "CEO",
	"cto":  "CTO",
	"cfo":  "CFO",
	"cia":  "CIA",
	"fbi":  "FBI",
	"bnd":  "BND",
	"fsb":  "FSB",
	"nsa":  "NSA",
	"lax":  "LAX",
	"sxf":  "SXF",
	"ber":  "BER",
	"sfo":  "SFO",
	"lh":   "LH",
	"lhr":  "LHR",
	"afl":  "AFL",
	"nrl":  "NRL",
	"nsw":  "NSW",
	"qld":  "QLD",
	"vic":  "VIC",

	// Product and brand names with internal capitals.
	"iphone":     "iPhone",
	"imac":       "iMac",
	"ipad":       "iPad",
	"ipod":       "iPod",
	"macbook":    "MacBook",
	"airplay":    "AirPlay",
	"airpods":    "AirPods",
	"youtube":    "YouTube",
	"photoprism": "PhotoPrism",

	// Names with a capitalized prefix or an apostrophe.
	"macgyver":   "MacGyver",
	"o'brien":    "O'Brien",
	"mcgregor":   "McGregor",
	"mcdonald":   "McDonald",
	"mcdonalds":  "McDonald's",
	"mcdonald's": "McDonald's",
	"macalister": "MacAlister",
	"mcalister":  "McAlister",
	"mcallister": "McAllister",
	"macauley":   "MacAuley",
	"mccauley":   "McCauley",
	"mcawley":    "McAwley",
	"macauliffe": "MacAuliffe",
	"macbride":   "MacBride",
	"mcbride":    "McBride",
	"maccabe":    "MacCabe",
	"mccabe":     "McCabe",
	"maccann":    "MacCann",
	"mccann":     "McCann",
	"maccarthy":  "MacCarthy",
	"mccarthy":   "McCarthy",
	"maccormack": "MacCormack",
	"mccormick":  "McCormick",
	"maccullagh": "MacCullagh",
	"macnully":   "MacNully",
	"mackenna":   "MacKenna",
	"macnamara":  "MacNamara",
	"mcnamara":   "McNamara",

	// Transliterated or unaccented spellings mapped back to their umlaut form.
	"gelaende":           "Gelände",
	"schwaebisch":        "Schwäbisch",
	"schwaebische":       "Schwäbische",
	"aegypten":           "Ägypten",
	"muenchen":           "München",
	"wuerttemberg":       "Württemberg",
	"baden-wuerttemberg": "Baden-Württemberg",
	"nuernberg":          "Nürnberg",
	"wuerzburg":          "Würzburg",
	"tubingen":           "Tübingen",
	"tuebingen":          "Tübingen",
	"koeln":              "Köln",
	"oesterreich":        "Österreich",
	"woerthersee":        "Wörthersee",
	"oeland":             "Öland",
	"schoenefeld":        "Schönefeld",
	"duesseldorf":        "Düsseldorf",
	"dusseldorf":         "Düsseldorf",
	"saarbrucken":        "Saarbrücken",
	"saarbruecken":       "Saarbrücken",
	"zuerich":            "Zürich",
}

View file

@ -1,8 +1,8 @@
// Code generated by go generate; DO NOT EDIT.
package txt
// Stopwords contains a list of stopwords for full-text indexing.
var Stopwords = map[string]bool{
// StopWords contains a list of stopwords for full-text indexing.
var StopWords = map[string]bool{
"olymp": true,
"sony": true,
"canon": true,

View file

@ -22,3 +22,14 @@ func Bool(s string) bool {
return true
}
// ASCII reports whether s is made up exclusively of ASCII letters (A-Z, a-z).
// Whitespace, digits, punctuation, and any non-ASCII rune make it return false;
// a string without any runes returns true.
func ASCII(s string) bool {
	for _, c := range s {
		switch {
		case c >= 'A' && c <= 'Z':
			// Upper-case ASCII letter, keep scanning.
		case c >= 'a' && c <= 'z':
			// Lower-case ASCII letter, keep scanning.
		default:
			return false
		}
	}

	return true
}

View file

@ -8,6 +8,25 @@ import (
// KeywordsRegexp matches runs of three or more characters, each of which is a Unicode letter or a dash.
var KeywordsRegexp = regexp.MustCompile("[\\p{L}\\-]{3,}")
// UnknownWord reports whether s looks like a meaningless short token: it is at
// most three bytes long, passes the ASCII check, and its lower-case form is
// listed in neither ShortWords nor SpecialWords. Longer strings and strings
// rejected by ASCII are never reported as unknown.
func UnknownWord(s string) bool {
	// Only very short tokens are candidates.
	if len(s) > 3 {
		return false
	}

	if !ASCII(s) {
		return false
	}

	key := strings.ToLower(s)

	_, isShort := ShortWords[key]
	_, isSpecial := SpecialWords[key]

	return !isShort && !isSpecial
}
// Words returns a slice of words with at least 3 characters from a string, dashes count as character ("ile-de-france").
func Words(s string) (results []string) {
return KeywordsRegexp.FindAllString(s, -1)
@ -30,7 +49,11 @@ func FilenameKeywords(s string) (results []string) {
for _, w := range FilenameWords(s) {
w = strings.ToLower(w)
if _, ok := Stopwords[w]; ok == false {
if UnknownWord(w) {
continue
}
if _, ok := StopWords[w]; ok == false {
results = append(results, w)
}
}
@ -43,7 +66,11 @@ func Keywords(s string) (results []string) {
for _, w := range Words(s) {
w = strings.ToLower(w)
if _, ok := Stopwords[w]; ok == false {
if UnknownWord(w) {
continue
}
if _, ok := StopWords[w]; ok == false {
results = append(results, w)
}
}