tpl/collections: Add collections.D using Vitter's Method D for sequential random sampling

2025-09-01 22:42:45 +02:00 · 2025-08-26 20:37:08 +02:00
parent 84dd495f2b
commit 1ba80874e4
8 changed files with 387 additions and 7 deletions
--- a/tpl/collections/collections.go
+++ b/tpl/collections/collections.go
@@ -19,7 +19,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"math/rand"
+	"math/rand/v2"
 	"reflect"
 	"strings"
 	"time"
@@ -41,9 +41,12 @@ func New(deps *deps.Deps) *Namespace {
 	}
 	loc := langs.GetLocation(language)

+	dCache := maps.NewCacheWithOptions[dKey, []int](maps.CacheOptions{Size: 100})
+
 	return &Namespace{
 		loc:      loc,
 		sortComp: compare.New(loc, true),
+		dCache:   dCache,
 		deps:     deps,
 	}
 }
@@ -52,6 +55,7 @@ func New(deps *deps.Deps) *Namespace {
 type Namespace struct {
 	loc      *time.Location
 	sortComp *compare.Namespace
+	dCache   *maps.Cache[dKey, []int]
 	deps     *deps.Deps
 }

@@ -520,6 +524,29 @@ func (ns *Namespace) Slice(args ...any) any {
 	return collections.Slice(args...)
 }

+type dKey struct {
+	seed uint64
+	n    int
+	hi   int
+}
+
+// D returns a slice of n unique random numbers in ascending order from the range [0, hi), using the
+// provided seed and J. S. Vitter's Method D for sequential random sampling (Vitter, J.S. - An Efficient
+// Algorithm for Sequential Random Sampling - ACM Trans. Math. Software 11 (1985), 37-57). The result is empty if n <= 0 or n > hi.
+// See https://getkerf.wordpress.com/2016/03/30/the-best-algorithm-no-one-knows-about/
+func (ns *Namespace) D(seed, n, hi any) []int {
+	key := dKey{seed: cast.ToUint64(seed), n: cast.ToInt(n), hi: cast.ToInt(hi)}
+	if key.n <= 0 || key.n > key.hi {
+		return []int{} // _d selects nothing here; returning early avoids make panicking on n < 0 or reserving a huge unused buffer.
+	}
+	v, _ := ns.dCache.GetOrCreate(key, func() ([]int, error) { // The create func below never returns an error.
+		result := make([]int, 0, key.n)
+		_d(rand.New(rand.NewPCG(key.seed, 0)), key.n, key.hi, func(i int) { result = append(result, i) })
+		return result, nil
+	})
+	return v
+}
+
 type intersector struct {
 	r    reflect.Value
 	seen map[any]bool
--- a/tpl/collections/collections_test.go
+++ b/tpl/collections/collections_test.go
@@ -788,6 +788,35 @@ func TestUniq(t *testing.T) {
 	}
 }

+func TestD(t *testing.T) {
+	t.Parallel()
+	c := qt.New(t)
+	ns := newNs()
+
+	c.Assert(ns.D(42, 5, 100), qt.DeepEquals, []int{24, 34, 66, 82, 96})
+	c.Assert(ns.D(31, 5, 100), qt.DeepEquals, []int{12, 37, 38, 69, 98})
+}
+
+func BenchmarkD2(b *testing.B) {
+	ns := newNs()
+
+	runBenchmark := func(seed, n, max int) {
+		name := fmt.Sprintf("n=%d,max=%d", n, max)
+		b.Run(name, func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				ns.D(seed, n, max)
+			}
+		})
+	}
+
+	runBenchmark(32, 5, 100)
+	runBenchmark(32, 50, 1000)
+	runBenchmark(32, 10, 10000)
+	runBenchmark(32, 500, 10000)
+	runBenchmark(32, 10, 500000)
+	runBenchmark(32, 5000, 500000)
+}
+
 func (x *TstX) TstRp() string {
 	return "r" + x.A
 }
--- a/tpl/collections/vitter.go
+++ b/tpl/collections/vitter.go
@@ -0,0 +1,149 @@
+// This is just a temporary fork of https://github.com/josharian/vitter (ISC License, https://github.com/josharian/vitter/blob/main/LICENSE)
+//
+// This file will be removed once https://github.com/josharian/vitter/issues/1 is resolved.
+
+package collections
+
+import (
+	"math"
+	"math/rand/v2"
+)
+
+// https://getkerf.wordpress.com/2016/03/30/the-best-algorithm-no-one-knows-about/
+
+// Copyright Kevin Lawler, released under ISC License
+
+// _d generates an in-order uniform random sample of size 'want' from the range [0, max) using the provided PRNG.
+//
+// Parameters:
+//   - prng: random number generator
+//   - want: number of samples to select
+//   - max: upper bound of the range [0, max) from which to sample
+//   - choose: callback function invoked with each selected index in ascending order
+//
+// If the parameters are invalid (want <= 0 or want > max), no samples are selected.
+//
+// Vitter, J.S. - An Efficient Algorithm for Sequential Random Sampling - ACM Trans. Math. Software 11 (1985), 37-57.
+func _d(prng *rand.Rand, want, max int, choose func(n int)) {
+	if want <= 0 || want > max {
+		return
+	}
+	// POTENTIAL_OPTIMIZATION_POINT: Christian Neukirchen points out we can replace exp(log(x)*y) by pow(x,y)
+	// POTENTIAL_OPTIMIZATION_POINT: Vitter paper points out an exponentially distributed random var can provide speed ups
+	// NOTE(review): 'a', 'n' and 'N' appear to be names from the code this was ported from: 'a' is space allocated for the hand
+	// 'n' is the size of the hand (here: want)
+	// 'N' is the upper bound on the random card values (here: max)
+	j := -1
+	qu1 := -want + 1 + max
+	const negalphainv = -13 // threshold parameter from Vitter's paper for algorithm selection
+	threshold := -negalphainv * want
+
+	wantf := float64(want)
+	maxf := float64(max)
+	ninv := 1.0 / wantf
+	var nmin1inv float64
+	Vprime := math.Exp(math.Log(prng.Float64()) * ninv)
+
+	qu1real := -wantf + 1.0 + maxf
+	var U, X, y1, y2, top, bottom, negSreal float64
+
+	for want > 1 && threshold < max {
+		var S int
+
+		nmin1inv = 1.0 / (-1.0 + wantf)
+
+		for {
+			for {
+				X = maxf * (-Vprime + 1.0)
+				S = int(math.Floor(X))
+
+				if S < qu1 {
+					break
+				}
+
+				Vprime = math.Exp(math.Log(prng.Float64()) * ninv)
+			}
+
+			U = prng.Float64()
+			negSreal = float64(-S)
+			y1 = math.Exp(math.Log(U*maxf/qu1real) * nmin1inv)
+			Vprime = y1 * (-X/maxf + 1.0) * (qu1real / (negSreal + qu1real))
+
+			if Vprime <= 1.0 {
+				break
+			}
+
+			y2 = 1.0
+			top = -1.0 + maxf
+			var limit int
+
+			if -1+want > S {
+				bottom = -wantf + maxf
+				limit = -S + max
+			} else {
+				bottom = -1.0 + negSreal + maxf
+				limit = qu1
+			}
+
+			for t := max - 1; t >= limit; t-- {
+				y2 = (y2 * top) / bottom
+				top--
+				bottom--
+			}
+
+			if maxf/(-X+maxf) >= y1*math.Exp(math.Log(y2)*nmin1inv) {
+				Vprime = math.Exp(math.Log(prng.Float64()) * nmin1inv)
+				break
+			}
+
+			Vprime = math.Exp(math.Log(prng.Float64()) * ninv)
+		}
+
+		j += S + 1
+
+		choose(j)
+
+		max = -S + (-1 + max)
+		maxf = negSreal + (-1.0 + maxf)
+		want--
+		wantf--
+		ninv = nmin1inv
+
+		qu1 = -S + qu1
+		qu1real = negSreal + qu1real
+
+		threshold += negalphainv
+	}
+
+	if want > 1 {
+		methodA(prng, want, max, j, choose) // want and max have already been reduced by any selections made in the loop above
+	} else {
+		S := int(math.Floor(float64(max) * Vprime))
+
+		j += S + 1
+
+		choose(j)
+	}
+}
+
+// methodA picks the remaining want samples from the max candidates after index j by stepping through them one at a time; _d falls back to it when want > 1 and 13*want >= max.
+func methodA(prng *rand.Rand, want, max int, j int, choose func(n int)) {
+	for want >= 2 {
+		j++
+		V := prng.Float64()
+		quot := float64(max-want) / float64(max)
+		for quot > V {
+			j++
+			max--
+			quot *= float64(max - want)
+			quot /= float64(max)
+		}
+		choose(j)
+		max--
+		want--
+	}
+
+	S := int(math.Floor(float64(max) * prng.Float64()))
+	j += S + 1
+	choose(j)
+}
--- a/tpl/collections/vitter_test.go
+++ b/tpl/collections/vitter_test.go
@@ -0,0 +1,95 @@
+// This is just a temporary fork of https://github.com/josharian/vitter (ISC License, https://github.com/josharian/vitter/blob/main/LICENSE)
+//
+// This file will be removed once https://github.com/josharian/vitter/issues/1 is resolved.
+
+package collections
+
+import (
+	"fmt"
+	"math/rand/v2"
+	"reflect"
+	"testing"
+	"time"
+)
+
+var goldenTests = []struct {
+	seed   int64
+	k, max int
+	want   []int
+}{
+	{2, 10, 100, []int{6, 20, 34, 45, 58, 59, 64, 69, 70, 72}},
+	{3, 10, 100, []int{8, 11, 22, 26, 30, 40, 74, 76, 93, 95}},
+	{4, 5, 1000, []int{183, 283, 443, 501, 531}},
+	{5, 15, 100000, []int{12984, 17778, 20370, 23830, 27120, 33258, 45718, 50064, 57096, 58580, 80960, 84396, 84594, 95561, 97687}},
+}
+
+func TestGolden(t *testing.T) {
+	for _, test := range goldenTests {
+		prng := rand.New(rand.NewPCG(uint64(test.seed), 0))
+		var got []int
+		testD(prng, t, test.k, test.max, func(n int) {
+			got = append(got, n)
+		})
+		if !reflect.DeepEqual(got, test.want) {
+			t.Errorf("golden(%d, %d, %d) = %#v want %#v", test.seed, test.k, test.max, got, test.want)
+		}
+	}
+}
+
+func TestInspectCounts(t *testing.T) {
+	prng := rand.New(rand.NewPCG(uint64(time.Now().UnixNano()), uint64(time.Now().UnixNano())))
+	const max = 100
+	const k = 10
+	const iters = 10000
+	counts := make([]int, max)
+	for i := 0; i < iters; i++ {
+		testD(prng, t, k, max, func(n int) {
+			counts[n]++
+		})
+	}
+	for i := range counts {
+		counts[i] -= (iters * k / max)
+	}
+	t.Log(counts)
+}
+
+func testD(prng *rand.Rand, tb testing.TB, want, max int, choose func(n int)) {
+	prev := -1
+	got := want
+	_d(prng, want, max, func(x int) {
+		if x <= prev {
+			tb.Fatalf("backwards: %d then %d", prev, x)
+		}
+		if x < 0 || x >= max {
+			tb.Fatalf("bad selection: %d", x)
+		}
+		prev = x
+		got--
+		if got < 0 {
+			tb.Fatal("choose called too many times")
+		}
+		choose(x)
+	})
+	if got != 0 {
+		tb.Fatal("choose not called enough times")
+	}
+}
+
+func TestWantIsMax(t *testing.T) {
+	// Ensure that when want == max, we get all indices.
+	prng := rand.New(rand.NewPCG(uint64(time.Now().UnixNano()), uint64(time.Now().UnixNano())))
+	const n = 10000
+	testD(prng, t, n, n, func(n int) {})
+}
+
+func BenchmarkD(b *testing.B) {
+	prng := rand.New(rand.NewPCG(uint64(time.Now().UnixNano()), uint64(time.Now().UnixNano())))
+	// TODO: count rng calls?
+	for _, want := range []int{1, 100, 10000} {
+		b.Run(fmt.Sprintf("want=%d", want), func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				_d(prng, want, 1000000, func(int) {})
+			}
+		})
+	}
+}