Index shorter keywords in languages like Chinese #746
Signed-off-by: Michael Mayer <michael@liquidbytes.net>
This commit is contained in:
parent
00a768173f
commit
91acaaa573
|
@ -324,6 +324,11 @@ func TestPhoto_GetDetails(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestPhoto_FileTitle(t *testing.T) {
|
||||
t.Run("non-latin", func(t *testing.T) {
|
||||
photo := Photo{PhotoName: "桥", PhotoPath: "", OriginalName: ""}
|
||||
result := photo.FileTitle()
|
||||
assert.Equal(t, "桥", result)
|
||||
})
|
||||
t.Run("changing-of-the-guard--buckingham-palace_7925318070_o.jpg", func(t *testing.T) {
|
||||
photo := Photo{PhotoName: "20200102_194030_9EFA9E5E", PhotoPath: "2000/05", OriginalName: "flickr import/changing-of-the-guard--buckingham-palace_7925318070_o.jpg"}
|
||||
result := photo.FileTitle()
|
||||
|
|
|
@ -109,10 +109,6 @@ func PhotoSearch(f form.PhotoSearch) (results PhotoResults, count int, err error
|
|||
s = s.Where("photos.id IN (SELECT pk.photo_id FROM keywords k JOIN photos_keywords pk ON k.id = pk.keyword_id WHERE (?))", gorm.Expr(likeAny))
|
||||
}
|
||||
} else if f.Query != "" {
|
||||
if len(f.Query) < 2 {
|
||||
return results, 0, fmt.Errorf("query too short")
|
||||
}
|
||||
|
||||
if err := Db().Where(AnySlug("custom_slug", f.Query, " ")).Find(&labels).Error; len(labels) == 0 || err != nil {
|
||||
log.Infof("search: label %s not found, using fuzzy search", txt.Quote(f.Query))
|
||||
|
||||
|
|
|
@ -137,18 +137,6 @@ func TestPhotoSearch(t *testing.T) {
|
|||
|
||||
assert.LessOrEqual(t, 1, len(photos))
|
||||
})
|
||||
t.Run("query too short", func(t *testing.T) {
|
||||
var f form.PhotoSearch
|
||||
f.Query = "a"
|
||||
f.Count = 5000
|
||||
f.Offset = 0
|
||||
f.Geo = false
|
||||
|
||||
photos, _, err := PhotoSearch(f)
|
||||
|
||||
assert.Equal(t, "query too short", err.Error())
|
||||
assert.Empty(t, photos)
|
||||
})
|
||||
t.Run("search for keyword", func(t *testing.T) {
|
||||
var f form.PhotoSearch
|
||||
f.Query = "bridge"
|
||||
|
@ -156,9 +144,11 @@ func TestPhotoSearch(t *testing.T) {
|
|||
f.Offset = 0
|
||||
|
||||
photos, _, err := PhotoSearch(f)
|
||||
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
assert.LessOrEqual(t, 2, len(photos))
|
||||
})
|
||||
t.Run("search for label in query", func(t *testing.T) {
|
||||
|
@ -168,9 +158,11 @@ func TestPhotoSearch(t *testing.T) {
|
|||
f.Offset = 0
|
||||
|
||||
photos, _, err := PhotoSearch(f)
|
||||
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
assert.LessOrEqual(t, 1, len(photos))
|
||||
})
|
||||
t.Run("search for archived", func(t *testing.T) {
|
||||
|
|
|
@ -109,6 +109,10 @@ func LikeAny(col, search string) (where string) {
|
|||
wheres = append(wheres, fmt.Sprintf("%s = '%s'", col, w))
|
||||
}
|
||||
|
||||
if !txt.ContainsASCIILetters(w) {
|
||||
continue
|
||||
}
|
||||
|
||||
singular := inflection.Singular(w)
|
||||
|
||||
if singular != w {
|
||||
|
@ -137,6 +141,10 @@ func AnySlug(col, search, sep string) (where string) {
|
|||
|
||||
words = append(words, slug.Make(w))
|
||||
|
||||
if !txt.ContainsASCIILetters(w) {
|
||||
continue
|
||||
}
|
||||
|
||||
singular := inflection.Singular(w)
|
||||
|
||||
if singular != w {
|
||||
|
|
54
pkg/txt/contains.go
Normal file
54
pkg/txt/contains.go
Normal file
|
@ -0,0 +1,54 @@
|
|||
package txt
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// ContainsNumberRegexp matches a run of one or more decimal digits.
// In Go's regexp syntax \d is ASCII-only, i.e. equivalent to [0-9].
var ContainsNumberRegexp = regexp.MustCompile(`\d+`)

// ContainsNumber reports whether s contains at least one ASCII decimal digit.
// It returns false for the empty string.
func ContainsNumber(s string) bool {
	return ContainsNumberRegexp.MatchString(s)
}
|
||||
|
||||
// ContainsLetters reports whether s is non-empty and every rune in it is a
// Unicode letter as defined by unicode.IsLetter. Digits, whitespace,
// punctuation, and combining marks all make the result false.
func ContainsLetters(s string) bool {
	for _, char := range s {
		if !unicode.IsLetter(char) {
			return false
		}
	}

	// An empty string never enters the loop and must not count as letters.
	return s != ""
}
|
||||
|
||||
// ContainsASCIILetters reports whether s consists solely of the ASCII letters
// A-Z and a-z, i.e. it contains no whitespace, digits, punctuation marks, or
// non-ASCII runes.
//
// Note that the empty string yields true because there is no rune that could
// violate the condition; callers that need to reject empty input must check
// for it separately.
func ContainsASCIILetters(s string) bool {
	for _, r := range s {
		isUpper := r >= 'A' && r <= 'Z'
		isLower := r >= 'a' && r <= 'z'

		if !isUpper && !isLower {
			return false
		}
	}

	return true
}
|
||||
|
||||
// ContainsSymbols reports whether s is non-empty and every rune in it is a
// symbolic character as defined by unicode.IsSymbol.
func ContainsSymbols(s string) bool {
	for _, char := range s {
		if !unicode.IsSymbol(char) {
			return false
		}
	}

	// An empty string never enters the loop and must not count as symbols.
	return s != ""
}
|
88
pkg/txt/contains_test.go
Normal file
88
pkg/txt/contains_test.go
Normal file
|
@ -0,0 +1,88 @@
|
|||
package txt
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestContainsNumber(t *testing.T) {
|
||||
t.Run("True", func(t *testing.T) {
|
||||
assert.Equal(t, true, ContainsNumber("f3abcde"))
|
||||
})
|
||||
t.Run("False", func(t *testing.T) {
|
||||
assert.Equal(t, false, ContainsNumber("abcd"))
|
||||
})
|
||||
}
|
||||
|
||||
func TestContainsSymbols(t *testing.T) {
|
||||
t.Run("123", func(t *testing.T) {
|
||||
assert.False(t, ContainsSymbols("123"))
|
||||
})
|
||||
t.Run("The quick brown fox.", func(t *testing.T) {
|
||||
assert.False(t, ContainsSymbols("The quick brown fox."))
|
||||
})
|
||||
t.Run("bridge", func(t *testing.T) {
|
||||
assert.False(t, ContainsSymbols("bridge"))
|
||||
})
|
||||
t.Run("桥", func(t *testing.T) {
|
||||
assert.False(t, ContainsSymbols("桥"))
|
||||
})
|
||||
t.Run("桥船", func(t *testing.T) {
|
||||
assert.False(t, ContainsSymbols("桥船"))
|
||||
})
|
||||
t.Run("स्थान", func(t *testing.T) {
|
||||
assert.False(t, ContainsSymbols("स्थान"))
|
||||
})
|
||||
t.Run("réseau", func(t *testing.T) {
|
||||
assert.False(t, ContainsSymbols("réseau"))
|
||||
})
|
||||
}
|
||||
|
||||
func TestContainsLetters(t *testing.T) {
|
||||
t.Run("123", func(t *testing.T) {
|
||||
assert.False(t, ContainsLetters("123"))
|
||||
})
|
||||
t.Run("The quick brown fox.", func(t *testing.T) {
|
||||
assert.False(t, ContainsLetters("The quick brown fox."))
|
||||
})
|
||||
t.Run("bridge", func(t *testing.T) {
|
||||
assert.True(t, ContainsLetters("bridge"))
|
||||
})
|
||||
t.Run("桥", func(t *testing.T) {
|
||||
assert.True(t, ContainsLetters("桥"))
|
||||
})
|
||||
t.Run("桥船", func(t *testing.T) {
|
||||
assert.True(t, ContainsLetters("桥船"))
|
||||
})
|
||||
t.Run("स्थान", func(t *testing.T) {
|
||||
assert.False(t, ContainsLetters("स्थान"))
|
||||
})
|
||||
t.Run("réseau", func(t *testing.T) {
|
||||
assert.True(t, ContainsLetters("réseau"))
|
||||
})
|
||||
}
|
||||
|
||||
func TestContainsASCIILetters(t *testing.T) {
|
||||
t.Run("123", func(t *testing.T) {
|
||||
assert.False(t, ContainsASCIILetters("123"))
|
||||
})
|
||||
t.Run("The quick brown fox.", func(t *testing.T) {
|
||||
assert.False(t, ContainsASCIILetters("The quick brown fox."))
|
||||
})
|
||||
t.Run("bridge", func(t *testing.T) {
|
||||
assert.True(t, ContainsASCIILetters("bridge"))
|
||||
})
|
||||
t.Run("桥", func(t *testing.T) {
|
||||
assert.False(t, ContainsASCIILetters("桥"))
|
||||
})
|
||||
t.Run("桥船", func(t *testing.T) {
|
||||
assert.False(t, ContainsASCIILetters("桥船"))
|
||||
})
|
||||
t.Run("स्थान", func(t *testing.T) {
|
||||
assert.False(t, ContainsASCIILetters("स्थान"))
|
||||
})
|
||||
t.Run("réseau", func(t *testing.T) {
|
||||
assert.False(t, ContainsASCIILetters("réseau"))
|
||||
})
|
||||
}
|
|
@ -7,13 +7,13 @@ import (
|
|||
"github.com/photoprism/photoprism/pkg/fs"
|
||||
)
|
||||
|
||||
var FileTitleRegexp = regexp.MustCompile("[\\p{L}\\-,':&+]{2,}|( [&+] )?")
|
||||
var FileTitleRegexp = regexp.MustCompile("[\\p{L}\\-,':&+!?]{1,}|( [&+] )?")
|
||||
|
||||
// FileTitle returns the string with the first characters of each word converted to uppercase.
|
||||
func FileTitle(s string) string {
|
||||
s = fs.BasePrefix(s, true)
|
||||
|
||||
if len(s) < 3 {
|
||||
if len(s) < 3 && IsASCII(s) {
|
||||
return ""
|
||||
}
|
||||
|
||||
|
@ -25,7 +25,7 @@ func FileTitle(s string) string {
|
|||
for _, w := range words {
|
||||
w = strings.ToLower(w)
|
||||
|
||||
if len(w) < 3 && found == 0 {
|
||||
if IsASCII(w) && (len(w) < 3 && found == 0 || len(w) == 1) {
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -56,7 +56,7 @@ func FileTitle(s string) string {
|
|||
title = strings.ReplaceAll(title, "-", " ")
|
||||
title = strings.ReplaceAll(title, " ", " ")
|
||||
|
||||
if len(title) <= 4 {
|
||||
if len(title) <= 4 && IsASCII(title) {
|
||||
return ""
|
||||
}
|
||||
|
||||
|
|
|
@ -7,6 +7,12 @@ import (
|
|||
)
|
||||
|
||||
func TestFileTitle(t *testing.T) {
|
||||
t.Run("桥", func(t *testing.T) {
|
||||
assert.Equal(t, "桥", FileTitle("桥"))
|
||||
})
|
||||
t.Run("i_love_you!", func(t *testing.T) {
|
||||
assert.Equal(t, "Love You!", FileTitle("i_love_you!"))
|
||||
})
|
||||
t.Run("photoprism", func(t *testing.T) {
|
||||
assert.Equal(t, "PhotoPrism: Browse Your Life in Pictures", FileTitle("photoprism: Browse your life in pictures"))
|
||||
})
|
||||
|
|
43
pkg/txt/is.go
Normal file
43
pkg/txt/is.go
Normal file
|
@ -0,0 +1,43 @@
|
|||
package txt
|
||||
|
||||
import "unicode"
|
||||
|
||||
// Is reports whether the all string runes are in the specified range.
|
||||
func Is(rangeTab *unicode.RangeTable, s string) bool {
|
||||
if s == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, r := range s {
|
||||
if !unicode.Is(rangeTab, r) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// IsASCII reports whether every byte of s is within the ASCII range, i.e. not
// greater than unicode.MaxASCII. The empty string is reported as ASCII.
func IsASCII(s string) bool {
	for _, b := range []byte(s) {
		if b > unicode.MaxASCII {
			return false
		}
	}

	return true
}
|
||||
|
||||
// IsLatin reports whether s is non-empty and every rune in it belongs to the
// unicode.Latin range table. Whitespace, digits, and punctuation therefore
// make the result false.
func IsLatin(s string) bool {
	for _, char := range s {
		if !unicode.Is(unicode.Latin, char) {
			return false
		}
	}

	// An empty string never enters the loop and is reported as not Latin.
	return s != ""
}
|
87
pkg/txt/is_test.go
Normal file
87
pkg/txt/is_test.go
Normal file
|
@ -0,0 +1,87 @@
|
|||
package txt
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"unicode"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestIs(t *testing.T) {
|
||||
t.Run("The quick brown fox.", func(t *testing.T) {
|
||||
assert.False(t, Is(unicode.Latin, "The quick brown fox."))
|
||||
assert.False(t, Is(unicode.L, "The quick brown fox."))
|
||||
assert.False(t, Is(unicode.Letter, "The quick brown fox."))
|
||||
})
|
||||
t.Run("bridge", func(t *testing.T) {
|
||||
assert.True(t, Is(unicode.Latin, "bridge"))
|
||||
assert.True(t, Is(unicode.L, "bridge"))
|
||||
assert.True(t, Is(unicode.Letter, "bridge"))
|
||||
})
|
||||
t.Run("桥", func(t *testing.T) {
|
||||
assert.False(t, Is(unicode.Latin, "桥"))
|
||||
assert.True(t, Is(unicode.L, "桥"))
|
||||
assert.True(t, Is(unicode.Letter, "桥"))
|
||||
})
|
||||
t.Run("桥船", func(t *testing.T) {
|
||||
assert.False(t, Is(unicode.Latin, "桥船"))
|
||||
assert.True(t, Is(unicode.L, "桥船"))
|
||||
assert.True(t, Is(unicode.Letter, "桥船"))
|
||||
})
|
||||
t.Run("स्थान", func(t *testing.T) {
|
||||
assert.False(t, Is(unicode.Latin, "स्थान"))
|
||||
assert.False(t, Is(unicode.L, "स्थान"))
|
||||
assert.False(t, Is(unicode.Letter, "स्थान"))
|
||||
assert.False(t, Is(unicode.Tamil, "स्थान"))
|
||||
})
|
||||
t.Run("réseau", func(t *testing.T) {
|
||||
assert.True(t, Is(unicode.Latin, "réseau"))
|
||||
assert.True(t, Is(unicode.L, "réseau"))
|
||||
assert.True(t, Is(unicode.Letter, "réseau"))
|
||||
})
|
||||
}
|
||||
|
||||
func TestIsASCII(t *testing.T) {
|
||||
t.Run("123", func(t *testing.T) {
|
||||
assert.True(t, IsASCII("123"))
|
||||
})
|
||||
t.Run("The quick brown fox.", func(t *testing.T) {
|
||||
assert.True(t, IsASCII("The quick brown fox."))
|
||||
})
|
||||
t.Run("bridge", func(t *testing.T) {
|
||||
assert.True(t, IsASCII("bridge"))
|
||||
})
|
||||
t.Run("桥", func(t *testing.T) {
|
||||
assert.False(t, IsASCII("桥"))
|
||||
})
|
||||
t.Run("桥船", func(t *testing.T) {
|
||||
assert.False(t, IsASCII("桥船"))
|
||||
})
|
||||
t.Run("स्थान", func(t *testing.T) {
|
||||
assert.False(t, IsASCII("स्थान"))
|
||||
})
|
||||
t.Run("réseau", func(t *testing.T) {
|
||||
assert.False(t, IsASCII("réseau"))
|
||||
})
|
||||
}
|
||||
|
||||
func TestIsLatin(t *testing.T) {
|
||||
t.Run("The quick brown fox.", func(t *testing.T) {
|
||||
assert.False(t, IsLatin("The quick brown fox."))
|
||||
})
|
||||
t.Run("bridge", func(t *testing.T) {
|
||||
assert.True(t, IsLatin("bridge"))
|
||||
})
|
||||
t.Run("桥", func(t *testing.T) {
|
||||
assert.False(t, IsLatin("桥"))
|
||||
})
|
||||
t.Run("桥船", func(t *testing.T) {
|
||||
assert.False(t, IsLatin("桥船"))
|
||||
})
|
||||
t.Run("स्थान", func(t *testing.T) {
|
||||
assert.False(t, IsLatin("स्थान"))
|
||||
})
|
||||
t.Run("réseau", func(t *testing.T) {
|
||||
assert.True(t, IsLatin("réseau"))
|
||||
})
|
||||
}
|
|
@ -1,17 +1,9 @@
|
|||
package txt
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var ContainsNumberRegexp = regexp.MustCompile("\\d+")
|
||||
|
||||
// ContainsNumber returns true if string contains a number.
|
||||
func ContainsNumber(s string) bool {
|
||||
return ContainsNumberRegexp.MatchString(s)
|
||||
}
|
||||
|
||||
// Bool casts a string to bool.
|
||||
func Bool(s string) bool {
|
||||
s = strings.TrimSpace(s)
|
||||
|
@ -22,14 +14,3 @@ func Bool(s string) bool {
|
|||
|
||||
return true
|
||||
}
|
||||
|
||||
// ASCII returns true if the string only contains ascii chars without whitespace, numbers, and punctuation marks.
|
||||
func ASCII(s string) bool {
|
||||
for _, r := range s {
|
||||
if (r < 65 || r > 90) && (r < 97 || r > 122) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
|
|
@ -6,15 +6,6 @@ import (
|
|||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestContainsNumber(t *testing.T) {
|
||||
t.Run("True", func(t *testing.T) {
|
||||
assert.Equal(t, true, ContainsNumber("f3abcde"))
|
||||
})
|
||||
t.Run("False", func(t *testing.T) {
|
||||
assert.Equal(t, false, ContainsNumber("abcd"))
|
||||
})
|
||||
}
|
||||
|
||||
func TestBool(t *testing.T) {
|
||||
t.Run("not empty", func(t *testing.T) {
|
||||
assert.Equal(t, true, Bool("Browse your life in pictures"))
|
||||
|
|
|
@ -6,11 +6,11 @@ import (
|
|||
"strings"
|
||||
)
|
||||
|
||||
var KeywordsRegexp = regexp.MustCompile("[\\p{L}\\-]{3,}")
|
||||
var KeywordsRegexp = regexp.MustCompile("[\\p{L}\\-]{1,}")
|
||||
|
||||
// UnknownWord returns true if the string does not seem to be a real word.
|
||||
func UnknownWord(s string) bool {
|
||||
if len(s) > 3 || !ASCII(s) {
|
||||
if len(s) > 3 || !ContainsASCIILetters(s) {
|
||||
return false
|
||||
}
|
||||
|
||||
|
@ -29,7 +29,15 @@ func UnknownWord(s string) bool {
|
|||
|
||||
// Words returns a slice of words from a string, skipping matches shorter than 3 bytes that consist solely of Latin-script runes; dashes count as characters ("ile-de-france").
|
||||
func Words(s string) (results []string) {
|
||||
return KeywordsRegexp.FindAllString(s, -1)
|
||||
for _, s := range KeywordsRegexp.FindAllString(s, -1) {
|
||||
if len(s) < 3 && IsLatin(s) {
|
||||
continue
|
||||
}
|
||||
|
||||
results = append(results, s)
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
// ReplaceSpaces replaces all spaces with another string.
|
||||
|
@ -37,11 +45,19 @@ func ReplaceSpaces(s string, char string) string {
|
|||
return strings.Replace(s, " ", char, -1)
|
||||
}
|
||||
|
||||
var FilenameKeywordsRegexp = regexp.MustCompile("[\\p{L}]{3,}")
|
||||
var FilenameKeywordsRegexp = regexp.MustCompile("[\\p{L}]{1,}")
|
||||
|
||||
// FilenameWords returns a slice of words from a string, skipping matches shorter than 3 bytes that consist solely of Latin-script runes ("ile", "france").
|
||||
func FilenameWords(s string) (results []string) {
|
||||
return FilenameKeywordsRegexp.FindAllString(s, -1)
|
||||
for _, s := range FilenameKeywordsRegexp.FindAllString(s, -1) {
|
||||
if len(s) < 3 && IsLatin(s) {
|
||||
continue
|
||||
}
|
||||
|
||||
results = append(results, s)
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
// FilenameKeywords returns a slice of keywords without stopwords.
|
||||
|
@ -87,7 +103,7 @@ func UniqueWords(words []string) (results []string) {
|
|||
for _, w := range words {
|
||||
w = strings.ToLower(w)
|
||||
|
||||
if len(w) < 3 || w == last {
|
||||
if len(w) < 3 && IsLatin(w) || w == last {
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -109,7 +125,7 @@ func RemoveFromWords(words []string, remove string) (results []string) {
|
|||
for _, w := range words {
|
||||
w = strings.ToLower(w)
|
||||
|
||||
if len(w) < 3 || w == last || strings.Contains(remove, w) {
|
||||
if len(w) < 3 && IsLatin(w) || w == last || strings.Contains(remove, w) {
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -132,7 +148,7 @@ func UniqueKeywords(s string) (results []string) {
|
|||
for _, w := range words {
|
||||
w = strings.ToLower(w)
|
||||
|
||||
if len(w) < 3 || w == last {
|
||||
if len(w) < 3 && IsLatin(w) || w == last {
|
||||
continue
|
||||
}
|
||||
|
||||
|
|
|
@ -7,6 +7,10 @@ import (
|
|||
)
|
||||
|
||||
func TestWords(t *testing.T) {
|
||||
t.Run("桥", func(t *testing.T) {
|
||||
result := Words("桥")
|
||||
assert.Equal(t, []string{"桥"}, result)
|
||||
})
|
||||
t.Run("I'm a lazy-brown fox!", func(t *testing.T) {
|
||||
result := Words("I'm a lazy-BRoWN fox!")
|
||||
assert.Equal(t, []string{"lazy-BRoWN", "fox"}, result)
|
||||
|
@ -60,6 +64,10 @@ func TestFilenameWords(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestFilenameKeywords(t *testing.T) {
|
||||
t.Run("桥.jpg", func(t *testing.T) {
|
||||
result := FilenameKeywords("桥.jpg")
|
||||
assert.Equal(t, []string{"桥"}, result)
|
||||
})
|
||||
t.Run("I'm a lazy-brown var fox.jpg!", func(t *testing.T) {
|
||||
result := FilenameKeywords("I'm a lazy-brown var fox.jpg!")
|
||||
assert.Equal(t, []string{"lazy", "brown", "fox"}, result)
|
||||
|
|
Loading…
Reference in a new issue