How to sort file names naturally
1. Problem
I recently wrote a tool gomdtoc to generate a Table of Contents for my notes directory. In order to sort the file names, I used the sort.Strings()
.
But it not working as I expected, the problem is: it sorts the names alphabetically.
Suppose we have a list like this:
ab abc1 abc01 abc2 abc5 abc10
After sorting by sort.Strings()
ab abc01 abc1 abc10 abc2 abc5
The abc2
should be placed before abc10
, this is not expected, I want it sorted naturally.
2. Natual Sort Order
In computing, natural sort order (or natural sorting) is the ordering of strings in alphabetical order, except that multi-digit numbers are treated atomically, i.e., as if they were a single character. Natural sort order has been promoted as being more human-friendly ("natural") than machine-oriented, pure alphabetical sort order.
3. Implementations
For the full code see https://github.com/dreamjz/golang-notes/blob/main/blog/natural-sort/natural_sort.go
3.1 Using regular expression to separate number/string
The first thing that comes to my mind is using regexp
to separate number and string.
func NaturalLess03(str1, str2 string) bool {
pStr1 := processWithRegexp(str1)
pStr2 := processWithRegexp(str2)
minLen := int(math.Min(float64(len(pStr1)), float64(len(pStr2))))
for i := 0; i < minLen; i++ {
e1 := pStr1[i]
e2 := pStr2[i]
if e1 == e2 {
continue
}
ei1, err1 := strconv.Atoi(e1)
ei2, err2 := strconv.Atoi(e2)
switch {
case err1 == nil && err2 == nil && ei1 != ei2:
return ei1 < ei2
case err1 != nil && err2 != nil:
return e1 < e2
case err1 != nil:
return true
case err2 != nil:
return false
}
}
return len(str1) < len(str2)
}
func processWithRegexp(str string) []string {
reg := regexp.MustCompile(`\d+|\D+`)
return reg.FindAllString(str, -1)
}
3.2 Using pointer/index to separate number/string
The approach is the same as above, but we use the index to separate number and string.
func NaturalLess02(str1, str2 string) bool {
pStr1 := processStr(str1)
pStr2 := processStr(str2)
minLen := int(math.Min(float64(len(pStr1)), float64(len(pStr2))))
for i := 0; i < minLen; i++ {
e1 := pStr1[i]
e2 := pStr2[i]
if e1 == e2 {
continue
}
ei1, err1 := strconv.Atoi(e1)
ei2, err2 := strconv.Atoi(e2)
switch {
case err1 == nil && err2 == nil && ei1 != ei2:
return ei1 < ei2
case err1 != nil && err2 != nil:
return e1 < e2
case err1 != nil:
return true
case err2 != nil:
return false
}
}
return len(str1) < len(str2)
}
func processStr(str string) []string {
var processedStr []string
for i := 0; i < len(str); {
j := i + 1
for ; j < len(str); j++ {
if isDigit(str[i]) != isDigit(str[j]) {
break
}
}
processedStr = append(processedStr, str[i:j])
i = j
}
return processedStr
}
func isDigit(b byte) bool {
return '0' <= b && b <= '9'
}
3.3 Using pointer/index without separating number/string (Recommended)
This approach compares rune-by-rune, without separating numbers or strings.
Code from https://github.com/fvbommel/sortorder/blob/main/natsort.go
func NaturalLess01(str1, str2 string) bool {
idx1, idx2 := 0, 0
for idx1 < len(str1) && idx2 < len(str2) {
c1, c2 := str1[idx1], str2[idx2]
dig1, dig2 := isDigit(c1), isDigit(c2)
switch {
case dig1 != dig2: // Digits before other characters
return dig1
case !dig1: // && !dig2
if c1 != c2 {
return c1 < c2
}
idx1++
idx2++
default: // Digits
// Eat zeros
for ; idx1 < len(str1) && str1[idx1] == '0'; idx1++ {
}
for ; idx2 < len(str2) && str2[idx2] == '0'; idx2++ {
}
// Eat all digits
nonZero1, nonZero2 := idx1, idx2
for ; idx1 < len(str1) && isDigit(str1[idx1]); idx1++ {
}
for ; idx2 < len(str2) && isDigit(str2[idx2]); idx2++ {
}
// If lengths of numbers with non-zero prefix differ, the shorter
// one is less.
if len1, len2 := idx1-nonZero1, idx2-nonZero2; len1 != len2 {
return len1 < len2
}
// If they're equally long, string comparison is correct.
if nr1, nr2 := str1[nonZero1:idx1], str2[nonZero2:idx2]; nr1 != nr2 {
return nr1 < nr2
}
// Otherwise, the one with less zeros is less.
// Because everything up to the number is equal, comparing the index
// after the zeros is sufficient.
if nonZero1 != nonZero2 {
return nonZero1 < nonZero2
}
}
}
// So far they are identical. At least one is ended. If the other continues,
// it sorts last.
return len(str1) < len(str2)
}
3.4 Simple test
const PrintFormat = "%-25s: %v\n"
func main() {
strs := []string{
"ab", "abc1",
"abc01", "abc2",
"abc5", "abc10",
}
fmt.Printf(PrintFormat, "Origin", strs)
var strs00, strs01, strs02, strs03 []string
strs00 = make([]string, len(strs))
strs01 = make([]string, len(strs))
strs02 = make([]string, len(strs))
strs03 = make([]string, len(strs))
copy(strs00, strs)
sort.Strings(strs00)
fmt.Printf(PrintFormat, "Sort by stdsorter", strs00)
copy(strs01, strs)
sort.Slice(strs01, func(i, j int) bool {
return NaturalLess01(strs01[i], strs01[j])
})
fmt.Printf(PrintFormat, "Sort by NaturalLess01", strs01)
copy(strs02, strs)
sort.Slice(strs02, func(i, j int) bool {
return NaturalLess02(strs02[i], strs02[j])
})
fmt.Printf(PrintFormat, "Sort by NaturalLess02", strs02)
copy(strs03, strs)
sort.Slice(strs03, func(i, j int) bool {
return NaturalLess03(strs03[i], strs03[j])
})
fmt.Printf(PrintFormat, "Sort by NaturalLess03", strs03)
}
Origin : [ab abc1 abc01 abc2 abc5 abc10]
Sort by stdsorter : [ab abc01 abc1 abc10 abc2 abc5]
Sort by NaturalLess01 : [ab abc1 abc01 abc2 abc5 abc10]
Sort by NaturalLess02 : [ab abc1 abc01 abc2 abc5 abc10]
Sort by NaturalLess03 : [ab abc1 abc01 abc2 abc5 abc10]
4. Benchmark
4.1 Generate test set
type generator struct {
src *rand.Rand
}
func (g *generator) NexInt(max int) int {
return g.src.Intn(max)
}
func (g *generator) NextString() (str string) {
strlen := g.src.Intn(6) + 3
numlen := g.src.Intn(3) + 1
numpos := g.src.Intn(strlen + 1)
var num string
for i := 0; i < numlen; i++ {
num += strconv.Itoa(g.src.Intn(10))
}
for i := 0; i < strlen+1; i++ {
if i == numpos {
str += num
} else {
str += string('a' + rune(g.src.Intn(16)))
}
}
return str
}
func testSet(seed int) [][]string {
gen := &generator{
src: rand.New(rand.NewSource(int64(seed))),
}
n := 1000
if testing.Short() {
n = 1
}
set := make([][]string, n)
for i := range set {
strings := make([]string, 10000)
for idx := range strings {
strings[idx] = gen.NextString()
}
set[i] = strings
}
return set
}
4.2 Benchmark
func BenchmarkStdStringSort00(b *testing.B) {
set := testSet(300)
arr := make([]string, len(set[0]))
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, list := range set {
b.StopTimer()
copy(arr, list)
b.StartTimer()
sort.Strings(arr)
}
}
}
func BenchmarkNaturalStringSort01(b *testing.B) {
set := testSet(300)
arr := make([]string, len(set[0]))
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, list := range set {
// Resetting the test set to be unsorted does not count.
b.StopTimer()
copy(arr, list)
b.StartTimer()
sort.Slice(arr, func(i, j int) bool {
return NaturalLess01(arr[i], arr[j])
})
}
}
}
func BenchmarkNaturalStringSort02(b *testing.B) {
set := testSet(300)
arr := make([]string, len(set[0]))
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, list := range set {
// Resetting the test set to be unsorted does not count.
b.StopTimer()
copy(arr, list)
b.StartTimer()
sort.Slice(arr, func(i, j int) bool {
return NaturalLess02(arr[i], arr[j])
})
}
}
}
func BenchmarkNaturalStringSort03(b *testing.B) {
set := testSet(300)
arr := make([]string, len(set[0]))
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, list := range set {
// Resetting the test set to be unsorted does not count.
b.StopTimer()
copy(arr, list)
b.StartTimer()
sort.Slice(arr, func(i, j int) bool {
return NaturalLess03(arr[i], arr[j])
})
}
}
}
Start test:
$ go test -bench '.' -benchmem .
As above, the NaturalLess01
is the most efficient approach.
However, the approach that uses regexp
even timed out 😂.