Backend: Add list of small words #361
Signed-off-by: Michael Mayer <michael@liquidbytes.net>
This commit is contained in:
parent
5b312cc1b5
commit
4976788c5b
|
@ -10,117 +10,6 @@ import (
|
|||
|
||||
// FileTitleRegexp matches runs of two or more characters, each of which is a
// Unicode letter, a hyphen, a comma, an apostrophe, or a colon.
var FileTitleRegexp = regexp.MustCompile("[\\p{L}\\-,':]{2,}")
|
||||
|
||||
// SpecialWords maps lowercase words to their preferred spelling: upper-case
// abbreviations, mixed-case brand names and surnames, and German names whose
// umlauts are restored from ASCII transliterations.
//
// Values must not carry leading or trailing whitespace, since they are
// substituted verbatim for the looked-up word.
var SpecialWords = map[string]string{
	// Abbreviations, agencies, currencies, and airport / region codes.
	"nyc":  "NYC",
	"ny":   "NY",
	"uae":  "UAE",
	"usa":  "USA",
	"amd":  "AMD",
	"tiff": "TIFF",
	"ibm":  "IBM",
	"usd":  "USD",
	"gbp":  "GBP",
	"chf":  "CHF",
	"ceo":  "CEO",
	"cto":  "CTO",
	"cfo":  "CFO",
	"cia":  "CIA",
	"fbi":  "FBI",
	"bnd":  "BND",
	"fsb":  "FSB",
	"nsa":  "NSA",
	"lax":  "LAX",
	"sxf":  "SXF",
	"ber":  "BER",
	"sfo":  "SFO",
	"lh":   "LH",
	"lhr":  "LHR",
	"afl":  "AFL",
	"nrl":  "NRL",
	"nsw":  "NSW",
	"qld":  "QLD",
	"vic":  "VIC",

	// Brand and product names with mixed capitalization.
	"iphone":     "iPhone",
	"imac":       "iMac",
	"ipad":       "iPad",
	"ipod":       "iPod",
	"macbook":    "MacBook",
	"airplay":    "AirPlay",
	"airpods":    "AirPods",
	"youtube":    "YouTube",
	"photoprism": "PhotoPrism",

	// Surnames with an inner capital letter.
	"macgyver":   "MacGyver",
	"o'brien":    "O'Brien",
	"mcgregor":   "McGregor",
	"mcdonald":   "McDonald",
	"mcdonalds":  "McDonald's",
	"mcdonald's": "McDonald's",
	"macalister": "MacAlister",
	"mcalister":  "McAlister",
	"mcallister": "McAllister",
	"macauley":   "MacAuley",
	"mccauley":   "McCauley",
	"mcawley":    "McAwley",
	"macauliffe": "MacAuliffe",
	"macbride":   "MacBride",
	"mcbride":    "McBride",
	"maccabe":    "MacCabe",
	"mccabe":     "McCabe",
	"maccann":    "MacCann",
	"mccann":     "McCann",
	"maccarthy":  "MacCarthy",
	"mccarthy":   "McCarthy",
	"maccormack": "MacCormack",
	"mccormick":  "McCormick",
	"maccullagh": "MacCullagh",
	"macnully":   "MacNully",
	"mackenna":   "MacKenna",
	"macnamara":  "MacNamara",
	"mcnamara":   "McNamara",

	// German words and place names written without umlauts.
	"gelaende":           "Gelände",
	"schwaebisch":        "Schwäbisch",
	"schwaebische":       "Schwäbische",
	"aegypten":           "Ägypten",
	"muenchen":           "München",
	"wuerttemberg":       "Württemberg",
	"baden-wuerttemberg": "Baden-Württemberg",
	"nuernberg":          "Nürnberg",
	"wuerzburg":          "Würzburg",
	"tubingen":           "Tübingen",
	"tuebingen":          "Tübingen",
	"koeln":              "Köln",
	"oesterreich":        "Österreich",
	"woerthersee":        "Wörthersee",
	"oeland":             "Öland",
	"schoenefeld":        "Schönefeld",
	"duesseldorf":        "Düsseldorf",
	"dusseldorf":         "Düsseldorf",
	"saarbrucken":        "Saarbrücken",
	"saarbruecken":       "Saarbrücken",
	"zuerich":            "Zürich",
}
|
||||
|
||||
// SmallWords is a set of short English articles, conjunctions, and
// prepositions, keyed in lowercase. Every listed word maps to true; a lookup
// of any other word yields false.
var SmallWords = map[string]bool{
	// Articles.
	"a":   true,
	"an":  true,
	"the": true,

	// Conjunctions.
	"and": true,
	"but": true,
	"nor": true,
	"or":  true,

	// Prepositions.
	"as":   true,
	"at":   true,
	"by":   true,
	"for":  true,
	"from": true,
	"in":   true,
	"of":   true,
	"on":   true,
	"to":   true,
	"up":   true,
	"with": true,
}
|
||||
|
||||
// isSeparator reports whether the rune could mark a word boundary.
|
||||
func isSeparator(r rune) bool {
|
||||
// ASCII alphanumerics and underscore are not separators
|
||||
|
@ -215,7 +104,11 @@ func TitleFromFileName(s string) string {
|
|||
continue
|
||||
}
|
||||
|
||||
if _, ok := Stopwords[w]; ok && found == 0 {
|
||||
if _, ok := StopWords[w]; ok && found == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
if UnknownWord(w) {
|
||||
continue
|
||||
}
|
||||
|
||||
|
|
|
@ -50,8 +50,8 @@ func main() {
|
|||
var packageTemplate = template.Must(template.New("").Parse(`// Code generated by go generate; DO NOT EDIT.
|
||||
package txt
|
||||
|
||||
// Stopwords contains a list of stopwords for full-text indexing.
|
||||
var Stopwords = map[string]bool{
|
||||
// StopWords contains a list of stopwords for full-text indexing.
|
||||
var StopWords = map[string]bool{
|
||||
{{- range .Words }}
|
||||
{{ printf "%q" . }}: true,
|
||||
{{- end }}
|
||||
|
|
2588
pkg/txt/shortwords.go
Normal file
2588
pkg/txt/shortwords.go
Normal file
File diff suppressed because it is too large
Load diff
22
pkg/txt/smallwords.go
Normal file
22
pkg/txt/smallwords.go
Normal file
|
@ -0,0 +1,22 @@
|
|||
package txt
|
||||
|
||||
// SmallWords is a set of short English articles, conjunctions, and
// prepositions, keyed in lowercase. Every listed word maps to true, so a plain
// index expression works as a membership test.
var SmallWords = map[string]bool{
	// One letter.
	"a": true,

	// Two letters.
	"an": true,
	"as": true,
	"at": true,
	"by": true,
	"in": true,
	"of": true,
	"on": true,
	"or": true,
	"to": true,
	"up": true,

	// Three letters.
	"and": true,
	"but": true,
	"for": true,
	"nor": true,
	"the": true,

	// Four letters.
	"from": true,
	"with": true,
}
|
91
pkg/txt/specialwords.go
Normal file
91
pkg/txt/specialwords.go
Normal file
|
@ -0,0 +1,91 @@
|
|||
package txt
|
||||
|
||||
// SpecialWords maps lowercase words to their preferred spelling: upper-case
// abbreviations, mixed-case brand names and surnames, and German names whose
// umlauts are restored from ASCII transliterations.
//
// Values must not carry leading or trailing whitespace, since they are
// substituted verbatim for the looked-up word.
var SpecialWords = map[string]string{
	// Abbreviations, agencies, currencies, and airport / region codes.
	"nyc":  "NYC",
	"ny":   "NY",
	"uae":  "UAE",
	"usa":  "USA",
	"amd":  "AMD",
	"tiff": "TIFF",
	"ibm":  "IBM",
	"usd":  "USD",
	"gbp":  "GBP",
	"chf":  "CHF",
	"ceo":  "CEO",
	"cto":  "CTO",
	"cfo":  "CFO",
	"cia":  "CIA",
	"fbi":  "FBI",
	"bnd":  "BND",
	"fsb":  "FSB",
	"nsa":  "NSA",
	"lax":  "LAX",
	"sxf":  "SXF",
	"ber":  "BER",
	"sfo":  "SFO",
	"lh":   "LH",
	"lhr":  "LHR",
	"afl":  "AFL",
	"nrl":  "NRL",
	"nsw":  "NSW",
	"qld":  "QLD",
	"vic":  "VIC",

	// Brand and product names with mixed capitalization.
	"iphone":     "iPhone",
	"imac":       "iMac",
	"ipad":       "iPad",
	"ipod":       "iPod",
	"macbook":    "MacBook",
	"airplay":    "AirPlay",
	"airpods":    "AirPods",
	"youtube":    "YouTube",
	"photoprism": "PhotoPrism",

	// Surnames with an inner capital letter.
	"macgyver":   "MacGyver",
	"o'brien":    "O'Brien",
	"mcgregor":   "McGregor",
	"mcdonald":   "McDonald",
	"mcdonalds":  "McDonald's",
	"mcdonald's": "McDonald's",
	"macalister": "MacAlister",
	"mcalister":  "McAlister",
	"mcallister": "McAllister",
	"macauley":   "MacAuley",
	"mccauley":   "McCauley",
	"mcawley":    "McAwley",
	"macauliffe": "MacAuliffe",
	"macbride":   "MacBride",
	"mcbride":    "McBride",
	"maccabe":    "MacCabe",
	"mccabe":     "McCabe",
	"maccann":    "MacCann",
	"mccann":     "McCann",
	"maccarthy":  "MacCarthy",
	"mccarthy":   "McCarthy",
	"maccormack": "MacCormack",
	"mccormick":  "McCormick",
	"maccullagh": "MacCullagh",
	"macnully":   "MacNully",
	"mackenna":   "MacKenna",
	"macnamara":  "MacNamara",
	"mcnamara":   "McNamara",

	// German words and place names written without umlauts.
	"gelaende":           "Gelände",
	"schwaebisch":        "Schwäbisch",
	"schwaebische":       "Schwäbische",
	"aegypten":           "Ägypten",
	"muenchen":           "München",
	"wuerttemberg":       "Württemberg",
	"baden-wuerttemberg": "Baden-Württemberg",
	"nuernberg":          "Nürnberg",
	"wuerzburg":          "Würzburg",
	"tubingen":           "Tübingen",
	"tuebingen":          "Tübingen",
	"koeln":              "Köln",
	"oesterreich":        "Österreich",
	"woerthersee":        "Wörthersee",
	"oeland":             "Öland",
	"schoenefeld":        "Schönefeld",
	"duesseldorf":        "Düsseldorf",
	"dusseldorf":         "Düsseldorf",
	"saarbrucken":        "Saarbrücken",
	"saarbruecken":       "Saarbrücken",
	"zuerich":            "Zürich",
}
|
|
@ -1,8 +1,8 @@
|
|||
// Code generated by go generate; DO NOT EDIT.
|
||||
package txt
|
||||
|
||||
// Stopwords contains a list of stopwords for full-text indexing.
|
||||
var Stopwords = map[string]bool{
|
||||
// StopWords contains a list of stopwords for full-text indexing.
|
||||
var StopWords = map[string]bool{
|
||||
"olymp": true,
|
||||
"sony": true,
|
||||
"canon": true,
|
||||
|
|
|
@ -22,3 +22,14 @@ func Bool(s string) bool {
|
|||
|
||||
return true
|
||||
}
|
||||
|
||||
// ASCII reports whether s consists solely of ASCII letters (A-Z, a-z).
// Digits, whitespace, punctuation, and non-ASCII runes all yield false;
// the empty string yields true.
func ASCII(s string) bool {
	for _, c := range s {
		switch {
		case c >= 'A' && c <= 'Z':
			// Upper-case letter, keep scanning.
		case c >= 'a' && c <= 'z':
			// Lower-case letter, keep scanning.
		default:
			return false
		}
	}

	return true
}
|
||||
|
|
|
@ -8,6 +8,25 @@ import (
|
|||
|
||||
// KeywordsRegexp matches runs of three or more characters, each of which is a
// Unicode letter or a hyphen.
var KeywordsRegexp = regexp.MustCompile("[\\p{L}\\-]{3,}")
|
||||
|
||||
// UnknownWord returns true if the string does not seem to be a real word.
|
||||
func UnknownWord(s string) bool {
|
||||
if len(s) > 3 || !ASCII(s) {
|
||||
return false
|
||||
}
|
||||
|
||||
s = strings.ToLower(s)
|
||||
|
||||
if _, ok := ShortWords[s]; ok {
|
||||
return false
|
||||
}
|
||||
|
||||
if _, ok := SpecialWords[s]; ok {
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// Words returns a slice of words with at least 3 characters from a string, dashes count as character ("ile-de-france").
|
||||
func Words(s string) (results []string) {
|
||||
return KeywordsRegexp.FindAllString(s, -1)
|
||||
|
@ -30,7 +49,11 @@ func FilenameKeywords(s string) (results []string) {
|
|||
for _, w := range FilenameWords(s) {
|
||||
w = strings.ToLower(w)
|
||||
|
||||
if _, ok := Stopwords[w]; ok == false {
|
||||
if UnknownWord(w) {
|
||||
continue
|
||||
}
|
||||
|
||||
if _, ok := StopWords[w]; ok == false {
|
||||
results = append(results, w)
|
||||
}
|
||||
}
|
||||
|
@ -43,7 +66,11 @@ func Keywords(s string) (results []string) {
|
|||
for _, w := range Words(s) {
|
||||
w = strings.ToLower(w)
|
||||
|
||||
if _, ok := Stopwords[w]; ok == false {
|
||||
if UnknownWord(w) {
|
||||
continue
|
||||
}
|
||||
|
||||
if _, ok := StopWords[w]; ok == false {
|
||||
results = append(results, w)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue