How to sort file names naturally

Kesa...大约 4 分钟golangsort

1. Problem

I recently wrote a tool gomdtocopen in new window to generate a Table of Contents for my notes directory. In order to sort the file names, I used the sort.Strings().

But it not working as I expected, the problem is: it sorts the names alphabetically.

Suppose we have a list like this:

ab abc1 abc01 abc2 abc5 abc10

After sorting by sort.Strings()

ab abc01 abc1 abc10 abc2 abc5

The abc2 should be placed before abc10, this is not expected, I want it sorted naturally.

2. Natual Sort Order

Natual Sort Orderopen in new window

In computing, natural sort order (or natural sorting) is the ordering of stringsopen in new window in alphabetical orderopen in new window, except that multi-digit numbers are treated atomically, i.e., as if they were a single character. Natural sort order has been promoted as being more human-friendly ("natural") than machine-oriented, pure alphabetical sort order.

3. Implementations

For the full code see https://github.com/dreamjz/golang-notes/blob/main/blog/natural-sort/natural_sort.goopen in new window

3.1 Using regular expression to separate number/string

The first thing that comes to my mind is using regexp to separate number and string.

func NaturalLess03(str1, str2 string) bool {
	pStr1 := processWithRegexp(str1)
	pStr2 := processWithRegexp(str2)
	minLen := int(math.Min(float64(len(pStr1)), float64(len(pStr2))))
	for i := 0; i < minLen; i++ {
		e1 := pStr1[i]
		e2 := pStr2[i]
		if e1 == e2 {
			continue
		}
		ei1, err1 := strconv.Atoi(e1)
		ei2, err2 := strconv.Atoi(e2)

		switch {
		case err1 == nil && err2 == nil && ei1 != ei2:
			return ei1 < ei2
		case err1 != nil && err2 != nil:
			return e1 < e2
		case err1 != nil:
			return true
		case err2 != nil:
			return false
		}
	}
	return len(str1) < len(str2)
}

func processWithRegexp(str string) []string {
	reg := regexp.MustCompile(`\d+|\D+`)
	return reg.FindAllString(str, -1)
}

3.2 Using pointer/index to separate number/string

The approach is the same as above, but we use the index to separate number and string.

func NaturalLess02(str1, str2 string) bool {
	pStr1 := processStr(str1)
	pStr2 := processStr(str2)
	minLen := int(math.Min(float64(len(pStr1)), float64(len(pStr2))))
	for i := 0; i < minLen; i++ {
		e1 := pStr1[i]
		e2 := pStr2[i]
		if e1 == e2 {
			continue
		}
		ei1, err1 := strconv.Atoi(e1)
		ei2, err2 := strconv.Atoi(e2)

		switch {
		case err1 == nil && err2 == nil && ei1 != ei2:
			return ei1 < ei2
		case err1 != nil && err2 != nil:
			return e1 < e2
		case err1 != nil:
			return true
		case err2 != nil:
			return false
		}
	}
	return len(str1) < len(str2)
}

func processStr(str string) []string {
	var processedStr []string
	for i := 0; i < len(str); {
		j := i + 1

		for ; j < len(str); j++ {
			if isDigit(str[i]) != isDigit(str[j]) {
				break
			}
		}
		processedStr = append(processedStr, str[i:j])

		i = j
	}
	return processedStr
}

func isDigit(b byte) bool {
	return '0' <= b && b <= '9'
}

This approach compares rune-by-rune, without separating numbers or strings.

Code from https://github.com/fvbommel/sortorder/blob/main/natsort.goopen in new window

func NaturalLess01(str1, str2 string) bool {
	idx1, idx2 := 0, 0
	for idx1 < len(str1) && idx2 < len(str2) {
		c1, c2 := str1[idx1], str2[idx2]
		dig1, dig2 := isDigit(c1), isDigit(c2)

		switch {
		case dig1 != dig2: // Digits before other characters
			return dig1
		case !dig1: // && !dig2
			if c1 != c2 {
				return c1 < c2
			}
			idx1++
			idx2++
		default: // Digits
            // Eat zeros
			for ; idx1 < len(str1) && str1[idx1] == '0'; idx1++ {
			}
			for ; idx2 < len(str2) && str2[idx2] == '0'; idx2++ {
			}
            // Eat all digits
			nonZero1, nonZero2 := idx1, idx2
			for ; idx1 < len(str1) && isDigit(str1[idx1]); idx1++ {
			}
			for ; idx2 < len(str2) && isDigit(str2[idx2]); idx2++ {
			}
            // If lengths of numbers with non-zero prefix differ, the shorter
			// one is less.
			if len1, len2 := idx1-nonZero1, idx2-nonZero2; len1 != len2 {
				return len1 < len2
			}
            // If they're equally long, string comparison is correct.
			if nr1, nr2 := str1[nonZero1:idx1], str2[nonZero2:idx2]; nr1 != nr2 {
				return nr1 < nr2
			}
            // Otherwise, the one with less zeros is less.
			// Because everything up to the number is equal, comparing the index
			// after the zeros is sufficient.
			if nonZero1 != nonZero2 {
				return nonZero1 < nonZero2
			}
		}
	}
    	// So far they are identical. At least one is ended. If the other continues,
	// it sorts last.
	return len(str1) < len(str2)
}

3.4 Simple test

const PrintFormat = "%-25s: %v\n"
func main() {
	strs := []string{
		"ab", "abc1",
		"abc01", "abc2",
		"abc5", "abc10",
	}
	fmt.Printf(PrintFormat, "Origin", strs)
	var strs00, strs01, strs02, strs03 []string
	strs00 = make([]string, len(strs))
	strs01 = make([]string, len(strs))
	strs02 = make([]string, len(strs))
	strs03 = make([]string, len(strs))

	copy(strs00, strs)
	sort.Strings(strs00)
	fmt.Printf(PrintFormat, "Sort by stdsorter", strs00)

	copy(strs01, strs)
	sort.Slice(strs01, func(i, j int) bool {
		return NaturalLess01(strs01[i], strs01[j])
	})
	fmt.Printf(PrintFormat, "Sort by NaturalLess01", strs01)

	copy(strs02, strs)
	sort.Slice(strs02, func(i, j int) bool {
		return NaturalLess02(strs02[i], strs02[j])
	})
	fmt.Printf(PrintFormat, "Sort by NaturalLess02", strs02)

	copy(strs03, strs)
	sort.Slice(strs03, func(i, j int) bool {
		return NaturalLess03(strs03[i], strs03[j])
	})
	fmt.Printf(PrintFormat, "Sort by NaturalLess03", strs03)
}
Origin                   : [ab abc1 abc01 abc2 abc5 abc10]
Sort by stdsorter        : [ab abc01 abc1 abc10 abc2 abc5]
Sort by NaturalLess01    : [ab abc1 abc01 abc2 abc5 abc10]
Sort by NaturalLess02    : [ab abc1 abc01 abc2 abc5 abc10]
Sort by NaturalLess03    : [ab abc1 abc01 abc2 abc5 abc10]

4. Benchmark

4.1 Generate test set

type generator struct {
	src *rand.Rand
}

func (g *generator) NexInt(max int) int {
	return g.src.Intn(max)
}

func (g *generator) NextString() (str string) {
	strlen := g.src.Intn(6) + 3
	numlen := g.src.Intn(3) + 1
	numpos := g.src.Intn(strlen + 1)
	var num string
	for i := 0; i < numlen; i++ {
		num += strconv.Itoa(g.src.Intn(10))
	}
	for i := 0; i < strlen+1; i++ {
		if i == numpos {
			str += num
		} else {
			str += string('a' + rune(g.src.Intn(16)))
		}
	}
	return str
}

func testSet(seed int) [][]string {
	gen := &generator{
		src: rand.New(rand.NewSource(int64(seed))),
	}
	n := 1000
	if testing.Short() {
		n = 1
	}
	set := make([][]string, n)
	for i := range set {
		strings := make([]string, 10000)
		for idx := range strings {
			strings[idx] = gen.NextString()
		}
		set[i] = strings
	}
	return set
}

4.2 Benchmark

func BenchmarkStdStringSort00(b *testing.B) {
	set := testSet(300)
	arr := make([]string, len(set[0]))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		for _, list := range set {
			b.StopTimer()
			copy(arr, list)
			b.StartTimer()
			sort.Strings(arr)
		}
	}
}

func BenchmarkNaturalStringSort01(b *testing.B) {
	set := testSet(300)
	arr := make([]string, len(set[0]))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		for _, list := range set {
			// Resetting the test set to be unsorted does not count.
			b.StopTimer()
			copy(arr, list)
			b.StartTimer()

			sort.Slice(arr, func(i, j int) bool {
				return NaturalLess01(arr[i], arr[j])
			})
		}
	}
}

func BenchmarkNaturalStringSort02(b *testing.B) {
	set := testSet(300)
	arr := make([]string, len(set[0]))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		for _, list := range set {
			// Resetting the test set to be unsorted does not count.
			b.StopTimer()
			copy(arr, list)
			b.StartTimer()

			sort.Slice(arr, func(i, j int) bool {
				return NaturalLess02(arr[i], arr[j])
			})
		}
	}
}
func BenchmarkNaturalStringSort03(b *testing.B) {
	set := testSet(300)
	arr := make([]string, len(set[0]))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		for _, list := range set {
			// Resetting the test set to be unsorted does not count.
			b.StopTimer()
			copy(arr, list)
			b.StartTimer()

			sort.Slice(arr, func(i, j int) bool {
				return NaturalLess03(arr[i], arr[j])
			})
		}
	}
}

Start test:

$ go test -bench '.' -benchmem .
image-20230514102501124
image-20230514102501124

As above, the NaturalLess01 is the most efficient approach.

However, the approach that uses regexp even timed out 😂.

Reference

  1. https://github.com/fvbommel/sortorder/blob/main/natsort.goopen in new window
  2. https://github.com/facette/natsort/blob/master/natsort.goopen in new window
  3. https://blog.miigon.net/posts/how-to-sort-file-names-naturally/open in new window
  4. https://en.wikipedia.org/wiki/Natural_sort_orderopen in new window
  5. https://groups.google.com/g/golang-nuts/c/Puy4TQ20G2wopen in new window
上次编辑于:
评论
  • 按正序
  • 按倒序
  • 按热度
Powered by Waline v2.15.2