From c8f3c558d5e3c807b1055d13d13b588e0f04a5b8 Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Mon, 22 Jul 2024 15:01:50 +0530 Subject: [PATCH 01/16] chore: refactor dedup package --- mocks/services/dedup/mock_dedup.go | 4 +- processor/processor.go | 3 +- services/dedup/{ => badger}/badger.go | 56 ++++++++++++++++++++---- services/dedup/dedup.go | 62 ++++----------------------- services/dedup/dedup_test.go | 23 +++++----- services/dedup/types/types.go | 6 +++ 6 files changed, 79 insertions(+), 75 deletions(-) rename services/dedup/{ => badger}/badger.go (61%) create mode 100644 services/dedup/types/types.go diff --git a/mocks/services/dedup/mock_dedup.go b/mocks/services/dedup/mock_dedup.go index 44f61fd905..0dc3e27d35 100644 --- a/mocks/services/dedup/mock_dedup.go +++ b/mocks/services/dedup/mock_dedup.go @@ -8,7 +8,7 @@ import ( reflect "reflect" gomock "github.com/golang/mock/gomock" - dedup "github.com/rudderlabs/rudder-server/services/dedup" + types "github.com/rudderlabs/rudder-server/services/dedup/types" ) // MockDedup is a mock of Dedup interface. @@ -61,7 +61,7 @@ func (mr *MockDedupMockRecorder) Commit(arg0 interface{}) *gomock.Call { } // Set mocks base method. -func (m *MockDedup) Set(arg0 dedup.KeyValue) (bool, int64) { +func (m *MockDedup) Set(arg0 types.KeyValue) (bool, int64) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "Set", arg0) ret0, _ := ret[0].(bool) diff --git a/processor/processor.go b/processor/processor.go index f25f66bf8e..cd04f9da6c 100644 --- a/processor/processor.go +++ b/processor/processor.go @@ -44,6 +44,7 @@ import ( destinationdebugger "github.com/rudderlabs/rudder-server/services/debugger/destination" transformationdebugger "github.com/rudderlabs/rudder-server/services/debugger/transformation" "github.com/rudderlabs/rudder-server/services/dedup" + dedupTypes "github.com/rudderlabs/rudder-server/services/dedup/types" "github.com/rudderlabs/rudder-server/services/fileuploader" "github.com/rudderlabs/rudder-server/services/rmetrics" "github.com/rudderlabs/rudder-server/services/rsources" @@ -1708,7 +1709,7 @@ func (proc *Handle) processJobsForDest(partition string, subJobs subJob) *transf p := payloadFunc() messageSize := int64(len(p)) dedupKey := fmt.Sprintf("%v%v", messageId, eventParams.SourceJobRunId) - if ok, previousSize := proc.dedup.Set(dedup.KeyValue{Key: dedupKey, Value: messageSize}); !ok { + if ok, previousSize := proc.dedup.Set(dedupTypes.KeyValue{Key: dedupKey, Value: messageSize}); !ok { proc.logger.Debugf("Dropping event with duplicate dedupKey: %s", dedupKey) sourceDupStats[dupStatKey{sourceID: source.ID, equalSize: messageSize == previousSize}] += 1 continue diff --git a/services/dedup/badger.go b/services/dedup/badger/badger.go similarity index 61% rename from services/dedup/badger.go rename to services/dedup/badger/badger.go index 37de57a97b..aa2bfd3764 100644 --- a/services/dedup/badger.go +++ b/services/dedup/badger/badger.go @@ -1,20 +1,22 @@ -package dedup +package badger import ( "strconv" "time" "github.com/dgraph-io/badger/v4" + "github.com/dgraph-io/badger/v4/options" "github.com/rudderlabs/rudder-go-kit/config" - + "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats" "github.com/rudderlabs/rudder-server/rruntime" + "github.com/rudderlabs/rudder-server/services/dedup/types" "github.com/rudderlabs/rudder-server/utils/misc" ) -type badgerDB struct { +type BadgerDB struct { stats stats.Stats logger loggerForBadger badgerDB *badger.DB @@ -25,7 +27,37 @@ type badgerDB struct { opts badger.Options } -func (d *badgerDB) Get(key string) (int64, bool) { +func NewBadgerDB(path string) *BadgerDB { + dedupWindow := config.GetReloadableDurationVar(3600, time.Second, "Dedup.dedupWindow", "Dedup.dedupWindowInS") + + log := logger.NewLogger().Child("dedup") + badgerOpts := badger. + DefaultOptions(path). + WithCompression(options.None). + WithIndexCacheSize(16 << 20). // 16mb + WithNumGoroutines(1). + WithNumMemtables(config.GetInt("BadgerDB.numMemtable", 5)). + WithValueThreshold(config.GetInt64("BadgerDB.valueThreshold", 1048576)). + WithBlockCacheSize(0). + WithNumVersionsToKeep(1). + WithNumLevelZeroTables(config.GetInt("BadgerDB.numLevelZeroTables", 5)). + WithNumLevelZeroTablesStall(config.GetInt("BadgerDB.numLevelZeroTablesStall", 15)). + WithSyncWrites(config.GetBool("BadgerDB.syncWrites", false)). + WithDetectConflicts(config.GetBool("BadgerDB.detectConflicts", false)) + + db := &BadgerDB{ + stats: stats.Default, + logger: loggerForBadger{log}, + path: path, + gcDone: make(chan struct{}), + close: make(chan struct{}), + window: dedupWindow, + opts: badgerOpts, + } + return db +} + +func (d *BadgerDB) Get(key string) (int64, bool) { var payloadSize int64 var found bool err := d.badgerDB.View(func(txn *badger.Txn) error { @@ -45,7 +77,7 @@ func (d *badgerDB) Get(key string) (int64, bool) { return payloadSize, found } -func (d *badgerDB) Set(kvs []KeyValue) error { +func (d *BadgerDB) Set(kvs []types.KeyValue) error { txn := d.badgerDB.NewTransaction(true) for _, message := range kvs { value := strconv.FormatInt(message.Value, 10) @@ -66,13 +98,13 @@ func (d *badgerDB) Set(kvs []KeyValue) error { return txn.Commit() } -func (d *badgerDB) Close() { +func (d *BadgerDB) Close() { close(d.close) <-d.gcDone _ = d.badgerDB.Close() } -func (d *badgerDB) start() { +func (d *BadgerDB) Start() { var err error d.badgerDB, err = badger.Open(d.opts) @@ -85,7 +117,7 @@ func (d *badgerDB) start() { }) } -func (d *badgerDB) gcLoop() { +func (d *BadgerDB) gcLoop() { for { select { case <-d.close: @@ -113,3 +145,11 @@ func (d *badgerDB) gcLoop() { } } + +type loggerForBadger struct { + logger.Logger +} + +func (l loggerForBadger) Warningf(fmt string, args ...interface{}) { + l.Warnf(fmt, args...) +} diff --git a/services/dedup/dedup.go b/services/dedup/dedup.go index 3fbc8f2d2f..3ddf6ddd77 100644 --- a/services/dedup/dedup.go +++ b/services/dedup/dedup.go @@ -5,19 +5,12 @@ package dedup import ( "fmt" "sync" - "time" - "github.com/dgraph-io/badger/v4" - "github.com/dgraph-io/badger/v4/options" - - "github.com/rudderlabs/rudder-go-kit/config" - "github.com/rudderlabs/rudder-go-kit/logger" - "github.com/rudderlabs/rudder-go-kit/stats" + "github.com/rudderlabs/rudder-server/services/dedup/badger" + "github.com/rudderlabs/rudder-server/services/dedup/types" "github.com/rudderlabs/rudder-server/utils/misc" ) -type OptFn func(*badgerDB) - // DefaultPath returns the default path for the deduplication service's badger DB func DefaultPath() string { badgerPathName := "/badgerdbv4" @@ -30,33 +23,8 @@ func DefaultPath() string { // New creates a new deduplication service. The service needs to be closed after use. func New(path string) Dedup { - dedupWindow := config.GetReloadableDurationVar(3600, time.Second, "Dedup.dedupWindow", "Dedup.dedupWindowInS") - - log := logger.NewLogger().Child("dedup") - badgerOpts := badger. - DefaultOptions(path). - WithCompression(options.None). - WithIndexCacheSize(16 << 20). // 16mb - WithNumGoroutines(1). - WithNumMemtables(config.GetInt("BadgerDB.numMemtable", 5)). - WithValueThreshold(config.GetInt64("BadgerDB.valueThreshold", 1048576)). - WithBlockCacheSize(0). - WithNumVersionsToKeep(1). - WithNumLevelZeroTables(config.GetInt("BadgerDB.numLevelZeroTables", 5)). - WithNumLevelZeroTablesStall(config.GetInt("BadgerDB.numLevelZeroTablesStall", 15)). - WithSyncWrites(config.GetBool("BadgerDB.syncWrites", false)). - WithDetectConflicts(config.GetBool("BadgerDB.detectConflicts", false)) - - db := &badgerDB{ - stats: stats.Default, - logger: loggerForBadger{log}, - path: path, - gcDone: make(chan struct{}), - close: make(chan struct{}), - window: dedupWindow, - opts: badgerOpts, - } - db.start() + db := badger.NewBadgerDB(path) + db.Start() return &dedup{ badgerDB: db, cache: make(map[string]int64), @@ -66,7 +34,7 @@ func New(path string) Dedup { // Dedup is the interface for deduplication service type Dedup interface { // Set returns [true] if it was the first time the key was encountered, otherwise it returns [false] along with the previous value - Set(kv KeyValue) (bool, int64) + Set(kv types.KeyValue) (bool, int64) // Commit commits a list of previously set keys to the DB Commit(keys []string) error @@ -74,18 +42,14 @@ type Dedup interface { // Close closes the deduplication service Close() } -type KeyValue struct { - Key string - Value int64 -} type dedup struct { - badgerDB *badgerDB + badgerDB *badger.BadgerDB cacheMu sync.Mutex cache map[string]int64 } -func (d *dedup) Set(kv KeyValue) (bool, int64) { +func (d *dedup) Set(kv types.KeyValue) (bool, int64) { d.cacheMu.Lock() defer d.cacheMu.Unlock() if previous, found := d.cache[kv.Key]; found { @@ -102,13 +66,13 @@ func (d *dedup) Commit(keys []string) error { d.cacheMu.Lock() defer d.cacheMu.Unlock() - kvs := make([]KeyValue, len(keys)) + kvs := make([]types.KeyValue, len(keys)) for i, key := range keys { value, ok := d.cache[key] if !ok { return fmt.Errorf("key %v has not been previously set", key) } - kvs[i] = KeyValue{Key: key, Value: value} + kvs[i] = types.KeyValue{Key: key, Value: value} } err := d.badgerDB.Set(kvs) @@ -123,11 +87,3 @@ func (d *dedup) Commit(keys []string) error { func (d *dedup) Close() { d.badgerDB.Close() } - -type loggerForBadger struct { - logger.Logger -} - -func (l loggerForBadger) Warningf(fmt string, args ...interface{}) { - l.Warnf(fmt, args...) -} diff --git a/services/dedup/dedup_test.go b/services/dedup/dedup_test.go index 51f87e2a56..b791fba581 100644 --- a/services/dedup/dedup_test.go +++ b/services/dedup/dedup_test.go @@ -16,6 +16,7 @@ import ( "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-server/services/dedup" + "github.com/rudderlabs/rudder-server/services/dedup/types" ) func Test_Dedup(t *testing.T) { @@ -30,29 +31,29 @@ func Test_Dedup(t *testing.T) { defer d.Close() t.Run("if message id is not present in cache and badger db", func(t *testing.T) { - found, _ := d.Set(dedup.KeyValue{Key: "a", Value: 1}) + found, _ := d.Set(types.KeyValue{Key: "a", Value: 1}) require.Equal(t, true, found) // Checking it again should give us the previous value from the cache - found, value := d.Set(dedup.KeyValue{Key: "a", Value: 2}) + found, value := d.Set(types.KeyValue{Key: "a", Value: 2}) require.Equal(t, false, found) require.Equal(t, int64(1), value) }) t.Run("if message is committed, previous value should always return", func(t *testing.T) { - found, _ := d.Set(dedup.KeyValue{Key: "b", Value: 1}) + found, _ := d.Set(types.KeyValue{Key: "b", Value: 1}) require.Equal(t, true, found) err := d.Commit([]string{"a"}) require.NoError(t, err) - found, value := d.Set(dedup.KeyValue{Key: "b", Value: 2}) + found, value := d.Set(types.KeyValue{Key: "b", Value: 2}) require.Equal(t, false, found) require.Equal(t, int64(1), value) }) t.Run("committing a messageid not present in cache", func(t *testing.T) { - found, _ := d.Set(dedup.KeyValue{Key: "c", Value: 1}) + found, _ := d.Set(types.KeyValue{Key: "c", Value: 1}) require.Equal(t, true, found) err := d.Commit([]string{"d"}) @@ -72,17 +73,17 @@ func Test_Dedup_Window(t *testing.T) { d := dedup.New(dbPath) defer d.Close() - found, _ := d.Set(dedup.KeyValue{Key: "to be deleted", Value: 1}) + found, _ := d.Set(types.KeyValue{Key: "to be deleted", Value: 1}) require.Equal(t, true, found) err := d.Commit([]string{"to be deleted"}) require.NoError(t, err) - found, _ = d.Set(dedup.KeyValue{Key: "to be deleted", Value: 2}) + found, _ = d.Set(types.KeyValue{Key: "to be deleted", Value: 2}) require.Equal(t, false, found) require.Eventually(t, func() bool { - found, _ = d.Set(dedup.KeyValue{Key: "to be deleted", Value: 3}) + found, _ = d.Set(types.KeyValue{Key: "to be deleted", Value: 3}) return found }, 2*time.Second, 100*time.Millisecond) } @@ -102,7 +103,7 @@ func Test_Dedup_ErrTxnTooBig(t *testing.T) { messages := make([]string, size) for i := 0; i < size; i++ { messages[i] = uuid.New().String() - d.Set(dedup.KeyValue{Key: messages[i], Value: int64(i + 1)}) + d.Set(types.KeyValue{Key: messages[i], Value: int64(i + 1)}) } err := d.Commit(messages) require.NoError(t, err) @@ -122,11 +123,11 @@ func Benchmark_Dedup(b *testing.B) { b.Run("no duplicates 1000 batch unique", func(b *testing.B) { batchSize := 1000 - msgIDs := make([]dedup.KeyValue, batchSize) + msgIDs := make([]types.KeyValue, batchSize) keys := make([]string, 0) for i := 0; i < b.N; i++ { - msgIDs[i%batchSize] = dedup.KeyValue{ + msgIDs[i%batchSize] = types.KeyValue{ Key: uuid.New().String(), Value: int64(i + 1), } diff --git a/services/dedup/types/types.go b/services/dedup/types/types.go new file mode 100644 index 0000000000..ab8b351afc --- /dev/null +++ b/services/dedup/types/types.go @@ -0,0 +1,6 @@ +package types + +type KeyValue struct { + Key string + Value int64 +} From 20d740d8f2f53c98aab363b4fe1758595082188b Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Tue, 23 Jul 2024 17:22:03 +0530 Subject: [PATCH 02/16] feat: introduce scylla dedup --- go.mod | 3 + go.sum | 8 + services/dedup/scylla/scylla.go | 386 ++++++++++++++++++++++++++++++++ 3 files changed, 397 insertions(+) create mode 100644 services/dedup/scylla/scylla.go diff --git a/go.mod b/go.mod index ef5574c2b5..93e635c79c 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ replace ( github.com/cyphar/filepath-securejoin => github.com/cyphar/filepath-securejoin v0.2.5 github.com/gin-gonic/gin => github.com/gin-gonic/gin v1.10.0 github.com/go-jose/go-jose/v3 => github.com/go-jose/go-jose/v3 v3.0.3 + github.com/gocql/gocql => github.com/scylladb/gocql v1.14.2 github.com/opencontainers/runc => github.com/opencontainers/runc v1.1.12 github.com/satori/go.uuid => github.com/satori/go.uuid v1.2.0 github.com/xitongsys/parquet-go => github.com/rudderlabs/parquet-go v0.0.2 @@ -45,6 +46,7 @@ require ( github.com/go-chi/chi/v5 v5.1.0 github.com/go-redis/redis v6.15.9+incompatible github.com/go-redis/redis/v8 v8.11.5 + github.com/gocql/gocql v1.14.2 github.com/golang-migrate/migrate/v4 v4.17.1 github.com/golang/mock v1.6.0 github.com/gomodule/redigo v1.9.2 @@ -115,6 +117,7 @@ require ( require ( github.com/go-ini/ini v1.67.0 // indirect + github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed // indirect github.com/hamba/avro/v2 v2.22.2-0.20240625062549-66aad10411d9 // indirect ) diff --git a/go.sum b/go.sum index a57350762b..d0f6c9464a 100644 --- a/go.sum +++ b/go.sum @@ -317,12 +317,16 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bitfield/gotestdox v0.2.2 h1:x6RcPAbBbErKLnapz1QeAlf3ospg8efBsedU93CDsnE= github.com/bitfield/gotestdox v0.2.2/go.mod h1:D+gwtS0urjBrzguAkTM2wodsTQYFHdpx8eqRJ3N+9pY= +github.com/bitly/go-hostpool v0.0.0-20171023180738-a3a6125de932 h1:mXoPYz/Ul5HYEDvkta6I8/rnYM5gSdSV2tJ6XbZuEtY= +github.com/bitly/go-hostpool v0.0.0-20171023180738-a3a6125de932/go.mod h1:NOuUCSz6Q9T7+igc/hlvDOUdtWKryOrtFyIVABv/p7k= github.com/bitly/go-simplejson v0.5.1 h1:xgwPbetQScXt1gh9BmoJ6j9JMr3TElvuIyjR8pgdoow= github.com/bitly/go-simplejson v0.5.1/go.mod h1:YOPVLzCfwK14b4Sff3oP1AmGhI9T9Vsg84etUnlyp+Q= github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE= github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bkaradzic/go-lz4 v1.0.0 h1:RXc4wYsyz985CkXXeX04y4VnZFGG8Rd43pRaHsOXAKk= github.com/bkaradzic/go-lz4 v1.0.0/go.mod h1:0YdlkowM3VswSROI7qDxhRvJ3sLhlFrRRwjwegp5jy4= +github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869 h1:DDGfHa7BWjL4YnC6+E63dPcxHo2sUxDIu8g3QgEJdRY= +github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869/go.mod h1:Ekp36dRnpXw/yCqJaO+ZrUyxD+3VXMFFr56k5XYrpB4= github.com/bobg/gcsobj v0.1.2/go.mod h1:vS49EQ1A1Ib8FgrL58C8xXYZyOCR2TgzAdopy6/ipa8= github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/bsm/ginkgo/v2 v2.7.0/go.mod h1:AiKlXPm7ItEHNc/2+OkrNG4E0ITzojb9/xWzvQ9XZ9w= @@ -749,6 +753,8 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1 github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= github.com/gsterjov/go-libsecret v0.0.0-20161001094733-a6f4afe4910c h1:6rhixN/i8ZofjG1Y75iExal34USq5p+wiN1tpie8IrU= github.com/gsterjov/go-libsecret v0.0.0-20161001094733-a6f4afe4910c/go.mod h1:NMPJylDgVpX0MLRlPy15sqSwOFv/U1GZ2m21JhFfek0= +github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed h1:5upAirOpQc1Q53c0bnx2ufif5kANL7bfZWcc6VJWJd8= +github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed/go.mod h1:tMWxXQ9wFIaZeTI9F+hmhFiGpFmhOHzyShyFUhRm0H4= github.com/hamba/avro/v2 v2.22.2-0.20240625062549-66aad10411d9 h1:NEoabXt33PDWK4fXryK4e+XX+fSKDmmu9vg3yb9YI2M= github.com/hamba/avro/v2 v2.22.2-0.20240625062549-66aad10411d9/go.mod h1:fQVdB2mFZBhPW1D5Abej41LMvrErARGrrdjOnKbm5yw= github.com/hanwen/go-fuse v1.0.0/go.mod h1:unqXarDXqzAk0rt98O2tVndEPIpUgLD9+rwFisZH3Ok= @@ -1139,6 +1145,8 @@ github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWR github.com/samber/lo v1.46.0 h1:w8G+oaCPgz1PoCJztqymCFaKwXt+5cCXn51uPxExFfQ= github.com/samber/lo v1.46.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU= github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= +github.com/scylladb/gocql v1.14.2 h1:IBPtfJFcRDzifCjXYMtrZ14oQ7OqpqQjwITQCwtGZsc= +github.com/scylladb/gocql v1.14.2/go.mod h1:ZLEJ0EVE5JhmtxIW2stgHq/v1P4fWap0qyyXSKyV8K0= github.com/scylladb/termtables v0.0.0-20191203121021-c4c0b6d42ff4/go.mod h1:C1a7PQSMz9NShzorzCiG2fk9+xuCgLkPeCvMHYR2OWg= github.com/secure-systems-lab/go-securesystemslib v0.4.0 h1:b23VGrQhTA8cN2CbBw7/FulN9fTtqYUdS5+Oxzt+DUE= github.com/secure-systems-lab/go-securesystemslib v0.4.0/go.mod h1:FGBZgq2tXWICsxWQW1msNf49F0Pf2Op5Htayx335Qbs= diff --git a/services/dedup/scylla/scylla.go b/services/dedup/scylla/scylla.go new file mode 100644 index 0000000000..37dbeef916 --- /dev/null +++ b/services/dedup/scylla/scylla.go @@ -0,0 +1,386 @@ +package scylla + +import ( + "context" + "database/sql" + "fmt" + "math/rand/v2" + "sync" + "sync/atomic" + "time" + + "github.com/gocql/gocql" + "github.com/lib/pq" + _ "github.com/lib/pq" + _ "go.uber.org/automaxprocs" + + "github.com/google/uuid" + "github.com/rudderlabs/rudder-go-kit/config" + "github.com/rudderlabs/rudder-go-kit/logger" + "github.com/rudderlabs/rudder-go-kit/profiler" + "github.com/rudderlabs/rudder-go-kit/stats" + "github.com/rudderlabs/rudder-go-kit/stats/metric" +) + +var ( + keySpace = "scylla" + loadTable = "dedup" + defaultHistogramBuckets = []float64{ + 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 1, 2.5, 5, 10, 60, + } +) + +type DB struct { + pg *sql.DB + scylla *gocql.Session + conf *config.Config + stat stats.Stats + timeout time.Duration + ttl int + sampling float64 +} + +func (d *DB) PopulateBatch() (time.Duration, error) { + startTime := time.Now() + err := d.SetupPGTable() + if err != nil { + return 0, err + } + if err := d.scylla.Query(fmt.Sprintf("CREATE KEYSPACE IF NOT EXISTS %s with replication = {'class': 'SimpleStrategy', 'replication_factor': 3}", keySpace)).Exec(); err != nil { + return 0, err + } + fmt.Println("Created Keyspace") + tableName := fmt.Sprintf("%s.%s", keySpace, loadTable) + if err := d.scylla.Query(fmt.Sprintf( + `CREATE TABLE IF NOT EXISTS %s ( + id uuid PRIMARY KEY) WITH bloom_filter_fp_chance = 0.005 and default_time_to_live = %d;`, tableName, d.ttl, + )).Exec(); err != nil { + return 0, err + } + fmt.Println("Created Table") + totalWorkspaces := d.conf.GetInt("ScyllaDB.totalWorkspaces", 10) + batchSize := d.conf.GetInt("ScyllaDB.batchSize", 20) + pgBatchSize := d.conf.GetInt("DB.pgBatchSize", 10) + sample := int(1/d.sampling) + 1 + pgBatch := make([]string, 0, pgBatchSize) + var totalCount atomic.Int64 + var wg sync.WaitGroup + for i := 0; i < totalWorkspaces; i++ { + fmt.Println("Starting workspace", i) + errorStat := d.stat.NewTaggedStat("scylla_batch_write_error", stats.CountType, stats.Tags{}) + pgErrorStat := d.stat.NewTaggedStat("pg_batch_write_error", stats.CountType, stats.Tags{}) + queryStat := d.stat.NewTaggedStat("scylla_batch_write_stat", stats.TimerType, stats.Tags{}) + countStat := d.stat.NewTaggedStat("scylla_batch_write_count", stats.CountType, stats.Tags{}) + wg.Add(1) + go func() { + defer wg.Done() + totalParallelGoRoutines := d.conf.GetInt("ScyllaDB.totalParallelGoRoutines", 1) + for j := 0; j < totalParallelGoRoutines; j++ { + batch := d.scylla.NewBatch(gocql.LoggedBatch).WithContext(context.Background()) + fmt.Println("Starting parallel go routine", j) + go func() { + for { + time.Sleep(4 * time.Millisecond) + uid := uuid.New().String() + batch.Entries = append(batch.Entries, gocql.BatchEntry{ + Stmt: fmt.Sprintf("INSERT INTO %s.%s (id) VALUES (?)", keySpace, loadTable), + Args: []interface{}{uid}, + }) + totalCount.Add(1) + time := time.Now() + if len(batch.Entries) == batchSize { + if err := d.scylla.ExecuteBatch(batch); err != nil { + fmt.Printf("failed executing batch into scylla with %v and batch size %d", err, batch.Size()) + errorStat.Increment() + continue + } + countStat.Count(len(batch.Entries)) + batch = d.scylla.NewBatch(gocql.LoggedBatch).WithContext(context.Background()) + } + queryStat.Since(time) + if rand.IntN(sample)%sample == 0 { + pgBatch = append(pgBatch, uid) + } + if len(pgBatch) == pgBatchSize { + if err := d.InsertPG(pgBatch); err != nil { + fmt.Printf("Failed inserting into PG with %v", err) + pgErrorStat.Increment() + continue + } + pgBatch = nil + } + } + }() + } + }() + if len(pgBatch) > 0 { + if err := d.InsertPG(pgBatch); err != nil { + return 0, err + } + } + } + wg.Wait() + return time.Since(startTime), nil +} + +func (d *DB) SetupPGTable() error { + ctx, cancel := context.WithTimeout(context.Background(), d.timeout) + defer cancel() + tx, err := d.pg.Begin() + if err != nil { + return err + } + createTableStmt := fmt.Sprintf( + `CREATE TABLE IF NOT EXISTS %s ( + id uuid PRIMARY KEY);`, loadTable, + ) + if _, err := tx.ExecContext(ctx, createTableStmt); err != nil { + _ = tx.Rollback() + return err + } + if err := tx.Commit(); err != nil { + return err + } + return nil +} + +func (d *DB) InsertPG(keys []string) error { + tx, err := d.pg.Begin() + if err != nil { + return err + } + ctx, cancel := context.WithTimeout(context.Background(), d.timeout) + defer cancel() + stmt, err := tx.PrepareContext(ctx, pq.CopyIn(loadTable, "id")) + if err != nil { + return err + } + defer func() { _ = stmt.Close() }() + for _, key := range keys { + if _, err = stmt.ExecContext(ctx, key); err != nil { + return err + } + } + if _, err = stmt.ExecContext(ctx); err != nil { + return err + } + return tx.Commit() +} + +func (d *DB) BenchmarkNewKeys() { + time.Sleep(1 * time.Minute) + totalWorkspaces := d.conf.GetInt("ScyllaDB.totalWorkspaces", 10) + totalParallelGoRoutines := d.conf.GetInt("ScyllaDB.totalParallelGoRoutines", 10) + newKeyStat := d.stat.NewTaggedStat("new_keys_read_stat_count", stats.CountType, stats.Tags{}) + var totalCount atomic.Int64 + wg := sync.WaitGroup{} + newReadErrorStat := d.stat.NewTaggedStat("new_keys_read_error", stats.CountType, stats.Tags{}) + for { + for i := 0; i < totalWorkspaces; i++ { + wg.Add(1) + go func() { + defer wg.Done() + wg1 := sync.WaitGroup{} + for j := 0; j < totalParallelGoRoutines; j++ { + wg1.Add(1) + go func() { + defer wg1.Done() + time.Sleep(400 * time.Millisecond) + queryStart := time.Now() + queryStat := d.stat.NewTaggedStat("new_keys_stat_timer", stats.TimerType, stats.Tags{}) + uid := uuid.New().String() + var key string + err := d.scylla.Query("SELECT id FROM scylla.dedup WHERE id = ?", uid).Scan(&key) + if err != nil && err != gocql.ErrNotFound { + fmt.Println("Error closing iterator with", err) + } + if key != "" { + newReadErrorStat.Increment() + fmt.Println("key exists") + } + queryStat.Since(queryStart) + totalCount.Add(1) + newKeyStat.Increment() + }() + wg1.Wait() + time.Sleep(25 * time.Millisecond) + } + }() + time.Sleep(10 * time.Millisecond) + } + fmt.Println("Total New Keys Read", totalCount.Load()) + } +} + +func (d *DB) BenchmarkExistingKeys() { + time.Sleep(1 * time.Minute) + totalKeysToRead := d.conf.GetInt("ScyllaDB.totalKeysToMigrate", 12100) + sqlQuery := fmt.Sprintf("SELECT id FROM %s limit %d", loadTable, totalKeysToRead) + readKeyStat := d.stat.NewTaggedStat("read_keys_stat", stats.CountType, stats.Tags{}) + var totalCount atomic.Int64 + existingReadErrorStat := d.stat.NewTaggedStat("existing_keys_read_error", stats.CountType, stats.Tags{}) + for { + rows, err := d.pg.Query(sqlQuery) + if err != nil { + existingReadErrorStat.Increment() + continue + } + for rows.Next() { + var id string + if err := rows.Scan(&id); err != nil { + fmt.Println("Error scanning row", err) + existingReadErrorStat.Increment() + continue + } + var key string + queryStart := time.Now() + queryStat := d.stat.NewTaggedStat("existing_keys_stat", stats.TimerType, stats.Tags{}) + + err := d.scylla.Query("SELECT id FROM scylla.dedup WHERE id = ?", id).Scan(&key) + if err == gocql.ErrNotFound { + fmt.Println("key does not exist", id) + existingReadErrorStat.Increment() + continue + } + if err != nil { + fmt.Println("Error getting Existing release ", err) + existingReadErrorStat.Increment() + continue + } + if key == "" { + existingReadErrorStat.Increment() + fmt.Println("key does not exist", id) + } + queryStat.Since(queryStart) + totalCount.Add(1) + readKeyStat.Increment() + } + _ = rows.Close() + if totalCount.Load() == 0 { + time.Sleep(2 * time.Minute) + } + fmt.Println("Total Existing Keys Read", totalCount.Load()) + } +} + +func (d *DB) Destroy() (time.Duration, error) { + startTime := time.Now() + err := d.scylla.Query(fmt.Sprintf("DROP TABLE %s.%s", keySpace, loadTable)).Exec() + if err != nil { + return 0, err + } + err = d.scylla.Query(fmt.Sprintf("DROP KEYSPACE %s", keySpace)).Exec() + if err != nil { + return 0, err + } + return time.Since(startTime), nil +} + +func Start(runningMode string) error { + s, err := New() + if err != nil { + return err + } + switch runningMode { + case "populate-batch": + _, err := s.PopulateBatch() + fmt.Println("Error while populating batch: ", err) + case "benchmark-existing-keys": + s.BenchmarkExistingKeys() + case "benchmark-new-keys": + s.BenchmarkNewKeys() + case "populate-and-benchmark": + wg := sync.WaitGroup{} + wg.Add(3) + go func() { + _, err := s.PopulateBatch() + fmt.Println("Error while populating batch: ", err) + wg.Done() + }() + go func() { + s.BenchmarkExistingKeys() + wg.Done() + }() + go func() { + s.BenchmarkNewKeys() + wg.Done() + }() + wg.Wait() + default: + _, err := s.Destroy() + fmt.Printf("Error while destroying: %v", err) + } + return nil +} + +func New() (*DB, error) { + conf := config.New(config.WithEnvPrefix("BENCHMARK")) + statsOptions := stats.WithDefaultHistogramBuckets(defaultHistogramBuckets) + stats.Default = stats.NewStats(conf, logger.NewFactory(conf), metric.NewManager(), statsOptions) + err := stats.Default.Start(context.TODO(), GoRoutineFactory) + if err != nil { + return nil, err + } + cluster := gocql.NewCluster(conf.GetString("Scylla.Hosts", "localhost:9042")) + cluster.Consistency = gocql.Quorum + + session, err := cluster.CreateSession() + if err != nil { + return nil, err + } + db, err := sql.Open("postgres", getConnectionString(conf)) + if err != nil { + return nil, err + } + + go func() { + _ = profiler.StartServer(context.TODO(), conf.GetInt("Profiler.Port", 7777)) + }() + + return &DB{ + pg: db, + scylla: session, + conf: conf, + stat: stats.Default, + timeout: conf.GetDuration("Scylla.Timeout", 10, time.Second), + ttl: conf.GetInt("Scylla.TTL", 1209600), + sampling: conf.GetFloat64("DB.sampling", 0.00001), + }, nil +} + +func getConnectionString(config *config.Config) string { + host := config.GetString("DB.host", "localhost") + user := config.GetString("DB.user", "rudder") + dbname := config.GetString("DB.name", "jobsdb") + port := config.GetInt("DB.port", 6432) + password := config.GetString("DB.password", "password") + sslmode := config.GetString("DB.sslMode", "disable") + idleTxTimeout := config.GetDuration("DB.IdleTxTimeout", 5, time.Minute) + + return fmt.Sprintf("host=%s port=%d user=%s "+ + "password=%s dbname=%s sslmode=%s "+ + " options='-c idle_in_transaction_session_timeout=%d'", + host, port, user, password, dbname, sslmode, + idleTxTimeout.Milliseconds(), + ) +} + +func Go(function func()) { + go func() { + function() + }() +} + +func GoForWarehouse(function func()) { + go func() { + function() + }() +} + +var GoRoutineFactory goRoutineFactory + +type goRoutineFactory struct{} + +func (goRoutineFactory) Go(function func()) { + Go(function) +} From efb7c8c409ca9ba4b4b713b7e075ea110685c6aa Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Tue, 23 Jul 2024 17:38:24 +0530 Subject: [PATCH 03/16] chore: review comments --- mocks/services/dedup/mock_dedup.go | 5 ++- processor/processor.go | 2 +- processor/processor_test.go | 2 +- .../yandexmetrica/yandexmetrica_test.go | 6 +-- services/dedup/badger/badger.go | 38 +++++++++++++------ services/dedup/dedup.go | 14 ++++--- services/dedup/dedup_test.go | 34 ++++++++++------- 7 files changed, 63 insertions(+), 38 deletions(-) diff --git a/mocks/services/dedup/mock_dedup.go b/mocks/services/dedup/mock_dedup.go index b252034541..db321ec21e 100644 --- a/mocks/services/dedup/mock_dedup.go +++ b/mocks/services/dedup/mock_dedup.go @@ -66,12 +66,13 @@ func (mr *MockDedupMockRecorder) Commit(arg0 any) *gomock.Call { } // Set mocks base method. -func (m *MockDedup) Set(arg0 types.KeyValue) (bool, int64) { +func (m *MockDedup) Set(arg0 types.KeyValue) (bool, int64, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "Set", arg0) ret0, _ := ret[0].(bool) ret1, _ := ret[1].(int64) - return ret0, ret1 + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 } // Set indicates an expected call of Set. diff --git a/processor/processor.go b/processor/processor.go index 1bea4225c9..c1f738844c 100644 --- a/processor/processor.go +++ b/processor/processor.go @@ -1709,7 +1709,7 @@ func (proc *Handle) processJobsForDest(partition string, subJobs subJob) *transf p := payloadFunc() messageSize := int64(len(p)) dedupKey := fmt.Sprintf("%v%v", messageId, eventParams.SourceJobRunId) - if ok, previousSize := proc.dedup.Set(dedupTypes.KeyValue{Key: dedupKey, Value: messageSize}); !ok { + if ok, previousSize, err := proc.dedup.Set(dedupTypes.KeyValue{Key: dedupKey, Value: messageSize}); !ok && err != nil { proc.logger.Debugf("Dropping event with duplicate dedupKey: %s", dedupKey) sourceDupStats[dupStatKey{sourceID: source.ID, equalSize: messageSize == previousSize}] += 1 continue diff --git a/processor/processor_test.go b/processor/processor_test.go index 03baee4782..7aac77a32a 100644 --- a/processor/processor_test.go +++ b/processor/processor_test.go @@ -2471,7 +2471,7 @@ var _ = Describe("Processor", Ordered, func() { mockTransformer := mocksTransformer.NewMockTransformer(c.mockCtrl) callUnprocessed := c.mockGatewayJobsDB.EXPECT().GetUnprocessed(gomock.Any(), gomock.Any()).Return(jobsdb.JobsResult{Jobs: unprocessedJobsList}, nil).Times(1) - c.MockDedup.EXPECT().Set(gomock.Any()).Return(true, int64(0)).After(callUnprocessed).Times(3) + c.MockDedup.EXPECT().Set(gomock.Any()).Return(true, int64(0), gomock.Any()).After(callUnprocessed).Times(3) c.MockDedup.EXPECT().Commit(gomock.Any()).Times(1) // We expect one transform call to destination A, after callUnprocessed. diff --git a/router/batchrouter/asyncdestinationmanager/yandexmetrica/yandexmetrica_test.go b/router/batchrouter/asyncdestinationmanager/yandexmetrica/yandexmetrica_test.go index 70abe0215e..07ae04f714 100644 --- a/router/batchrouter/asyncdestinationmanager/yandexmetrica/yandexmetrica_test.go +++ b/router/batchrouter/asyncdestinationmanager/yandexmetrica/yandexmetrica_test.go @@ -46,15 +46,15 @@ var ( } ) -var _ = Describe("Yandexmetrica", func() { - Describe(("NewManager function test"), func() { +var _ = Describe("Antisymmetric", func() { + Describe("NewManager function test", func() { It("should return yandexmetrica manager", func() { yandexmetrica, err := yandexmetrica.NewManager(destination, backendconfig.DefaultBackendConfig) Expect(err).To(BeNil()) Expect(yandexmetrica).NotTo(BeNil()) }) }) - Describe(("Upload function test"), func() { + Describe("Upload function test", func() { It("Testing a successful scenario", func() { cache := oauthv2.NewCache() ctrl := gomock.NewController(GinkgoT()) diff --git a/services/dedup/badger/badger.go b/services/dedup/badger/badger.go index 181a7f6bf3..30f4a2af8d 100644 --- a/services/dedup/badger/badger.go +++ b/services/dedup/badger/badger.go @@ -3,6 +3,7 @@ package badger import ( "fmt" "strconv" + "sync" "time" "github.com/dgraph-io/badger/v4" @@ -26,6 +27,7 @@ type BadgerDB struct { gcDone chan struct{} path string opts badger.Options + once sync.Once } // DefaultPath returns the default path for the deduplication service's badger DB @@ -68,10 +70,15 @@ func NewBadgerDB(path string) *BadgerDB { return db } -func (d *BadgerDB) Get(key string) (int64, bool) { +func (d *BadgerDB) Get(key string) (int64, bool, error) { var payloadSize int64 var found bool - err := d.badgerDB.View(func(txn *badger.Txn) error { + var err error + err = d.init() + if err != nil { + return 0, false, err + } + err = d.badgerDB.View(func(txn *badger.Txn) error { item, err := txn.Get([]byte(key)) if err != nil { return err @@ -83,12 +90,16 @@ func (d *BadgerDB) Get(key string) (int64, bool) { return nil }) if err != nil && err != badger.ErrKeyNotFound { - panic(err) + return 0, false, err } - return payloadSize, found + return payloadSize, found, nil } func (d *BadgerDB) Set(kvs []types.KeyValue) error { + err := d.init() + if err != nil { + return err + } txn := d.badgerDB.NewTransaction(true) for _, message := range kvs { value := strconv.FormatInt(message.Value, 10) @@ -115,17 +126,20 @@ func (d *BadgerDB) Close() { _ = d.badgerDB.Close() } -func (d *BadgerDB) Start() { +func (d *BadgerDB) init() error { var err error - d.badgerDB, err = badger.Open(d.opts) - if err != nil { - panic(err) - } - rruntime.Go(func() { - d.gcLoop() - close(d.gcDone) + d.once.Do(func() { + d.badgerDB, err = badger.Open(d.opts) + if err != nil { + return + } + rruntime.Go(func() { + d.gcLoop() + close(d.gcDone) + }) }) + return err } func (d *BadgerDB) gcLoop() { diff --git a/services/dedup/dedup.go b/services/dedup/dedup.go index 89d3b41f7b..6efc08c99e 100644 --- a/services/dedup/dedup.go +++ b/services/dedup/dedup.go @@ -13,7 +13,6 @@ import ( // New creates a new deduplication service. The service needs to be closed after use. func New() Dedup { db := badger.NewBadgerDB(badger.DefaultPath()) - db.Start() return &dedup{ badgerDB: db, cache: make(map[string]int64), @@ -23,7 +22,7 @@ func New() Dedup { // Dedup is the interface for deduplication service type Dedup interface { // Set returns [true] if it was the first time the key was encountered, otherwise it returns [false] along with the previous value - Set(kv types.KeyValue) (bool, int64) + Set(kv types.KeyValue) (bool, int64, error) // Commit commits a list of previously set keys to the DB Commit(keys []string) error @@ -38,17 +37,20 @@ type dedup struct { cache map[string]int64 } -func (d *dedup) Set(kv types.KeyValue) (bool, int64) { +func (d *dedup) Set(kv types.KeyValue) (bool, int64, error) { d.cacheMu.Lock() defer d.cacheMu.Unlock() if previous, found := d.cache[kv.Key]; found { - return false, previous + return false, previous, nil + } + previous, found, err := d.badgerDB.Get(kv.Key) + if err != nil { + return false, 0, err } - previous, found := d.badgerDB.Get(kv.Key) if !found { d.cache[kv.Key] = kv.Value } - return !found, previous + return !found, previous, nil } func (d *dedup) Commit(keys []string) error { diff --git a/services/dedup/dedup_test.go b/services/dedup/dedup_test.go index dd7f641fb6..7d944c6d15 100644 --- a/services/dedup/dedup_test.go +++ b/services/dedup/dedup_test.go @@ -31,32 +31,37 @@ func Test_Dedup(t *testing.T) { defer d.Close() t.Run("if message id is not present in cache and badger db", func(t *testing.T) { - found, _ := d.Set(types.KeyValue{Key: "a", Value: 1}) + found, _, err := d.Set(types.KeyValue{Key: "a", Value: 1}) + require.Nil(t, err) require.Equal(t, true, found) // Checking it again should give us the previous value from the cache - found, value := d.Set(types.KeyValue{Key: "a", Value: 2}) + found, value, err := d.Set(types.KeyValue{Key: "a", Value: 2}) + require.Nil(t, err) require.Equal(t, false, found) require.Equal(t, int64(1), value) }) t.Run("if message is committed, previous value should always return", func(t *testing.T) { - found, _ := d.Set(types.KeyValue{Key: "b", Value: 1}) + found, _, err := d.Set(types.KeyValue{Key: "b", Value: 1}) + require.Nil(t, err) require.Equal(t, true, found) - err := d.Commit([]string{"a"}) + err = d.Commit([]string{"a"}) require.NoError(t, err) - found, value := d.Set(types.KeyValue{Key: "b", Value: 2}) + found, value, err := d.Set(types.KeyValue{Key: "b", Value: 2}) + require.Nil(t, err) require.Equal(t, false, found) require.Equal(t, int64(1), value) }) t.Run("committing a messageid not present in cache", func(t *testing.T) { - found, _ := d.Set(types.KeyValue{Key: "c", Value: 1}) + found, _, err := d.Set(types.KeyValue{Key: "c", Value: 1}) + require.Nil(t, err) require.Equal(t, true, found) - err := d.Commit([]string{"d"}) + err = d.Commit([]string{"d"}) require.NotNil(t, err) }) } @@ -73,17 +78,20 @@ func Test_Dedup_Window(t *testing.T) { d := dedup.New() defer d.Close() - found, _ := d.Set(types.KeyValue{Key: "to be deleted", Value: 1}) + found, _, err := d.Set(types.KeyValue{Key: "to be deleted", Value: 1}) + require.Nil(t, err) require.Equal(t, true, found) - err := d.Commit([]string{"to be deleted"}) + err = d.Commit([]string{"to be deleted"}) require.NoError(t, err) - found, _ = d.Set(types.KeyValue{Key: "to be deleted", Value: 2}) + found, _, err = d.Set(types.KeyValue{Key: "to be deleted", Value: 2}) + require.Nil(t, err) require.Equal(t, false, found) require.Eventually(t, func() bool { - found, _ = d.Set(types.KeyValue{Key: "to be deleted", Value: 3}) + found, _, err = d.Set(types.KeyValue{Key: "to be deleted", Value: 3}) + require.Nil(t, err) return found }, 2*time.Second, 100*time.Millisecond) } @@ -103,7 +111,7 @@ func Test_Dedup_ErrTxnTooBig(t *testing.T) { messages := make([]string, size) for i := 0; i < size; i++ { messages[i] = uuid.New().String() - d.Set(types.KeyValue{Key: messages[i], Value: int64(i + 1)}) + _, _, _ = d.Set(types.KeyValue{Key: messages[i], Value: int64(i + 1)}) } err := d.Commit(messages) require.NoError(t, err) @@ -134,7 +142,7 @@ func Benchmark_Dedup(b *testing.B) { if i%batchSize == batchSize-1 || i == b.N-1 { for _, msgID := range msgIDs[:i%batchSize] { - d.Set(msgID) + _, _, _ = d.Set(msgID) keys = append(keys, msgID.Key) } err := d.Commit(keys) From 35a0edfed7e2cd6bd303a73c4ca97a7e99d89061 Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Tue, 23 Jul 2024 17:38:24 +0530 Subject: [PATCH 04/16] chore: review comments --- mocks/services/dedup/mock_dedup.go | 5 ++- processor/processor.go | 2 +- processor/processor_test.go | 2 +- .../yandexmetrica/yandexmetrica_test.go | 6 +-- services/dedup/badger/badger.go | 38 +++++++++++++------ services/dedup/dedup.go | 14 ++++--- services/dedup/dedup_test.go | 34 ++++++++++------- 7 files changed, 63 insertions(+), 38 deletions(-) diff --git a/mocks/services/dedup/mock_dedup.go b/mocks/services/dedup/mock_dedup.go index b252034541..db321ec21e 100644 --- a/mocks/services/dedup/mock_dedup.go +++ b/mocks/services/dedup/mock_dedup.go @@ -66,12 +66,13 @@ func (mr *MockDedupMockRecorder) Commit(arg0 any) *gomock.Call { } // Set mocks base method. -func (m *MockDedup) Set(arg0 types.KeyValue) (bool, int64) { +func (m *MockDedup) Set(arg0 types.KeyValue) (bool, int64, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "Set", arg0) ret0, _ := ret[0].(bool) ret1, _ := ret[1].(int64) - return ret0, ret1 + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 } // Set indicates an expected call of Set. diff --git a/processor/processor.go b/processor/processor.go index 1bea4225c9..c1f738844c 100644 --- a/processor/processor.go +++ b/processor/processor.go @@ -1709,7 +1709,7 @@ func (proc *Handle) processJobsForDest(partition string, subJobs subJob) *transf p := payloadFunc() messageSize := int64(len(p)) dedupKey := fmt.Sprintf("%v%v", messageId, eventParams.SourceJobRunId) - if ok, previousSize := proc.dedup.Set(dedupTypes.KeyValue{Key: dedupKey, Value: messageSize}); !ok { + if ok, previousSize, err := proc.dedup.Set(dedupTypes.KeyValue{Key: dedupKey, Value: messageSize}); !ok && err != nil { proc.logger.Debugf("Dropping event with duplicate dedupKey: %s", dedupKey) sourceDupStats[dupStatKey{sourceID: source.ID, equalSize: messageSize == previousSize}] += 1 continue diff --git a/processor/processor_test.go b/processor/processor_test.go index 03baee4782..222b07f6c7 100644 --- a/processor/processor_test.go +++ b/processor/processor_test.go @@ -2471,7 +2471,7 @@ var _ = Describe("Processor", Ordered, func() { mockTransformer := mocksTransformer.NewMockTransformer(c.mockCtrl) callUnprocessed := c.mockGatewayJobsDB.EXPECT().GetUnprocessed(gomock.Any(), gomock.Any()).Return(jobsdb.JobsResult{Jobs: unprocessedJobsList}, nil).Times(1) - c.MockDedup.EXPECT().Set(gomock.Any()).Return(true, int64(0)).After(callUnprocessed).Times(3) + c.MockDedup.EXPECT().Set(gomock.Any()).Return(true, int64(0), nil).After(callUnprocessed).Times(3) c.MockDedup.EXPECT().Commit(gomock.Any()).Times(1) // We expect one transform call to destination A, after callUnprocessed. diff --git a/router/batchrouter/asyncdestinationmanager/yandexmetrica/yandexmetrica_test.go b/router/batchrouter/asyncdestinationmanager/yandexmetrica/yandexmetrica_test.go index 70abe0215e..07ae04f714 100644 --- a/router/batchrouter/asyncdestinationmanager/yandexmetrica/yandexmetrica_test.go +++ b/router/batchrouter/asyncdestinationmanager/yandexmetrica/yandexmetrica_test.go @@ -46,15 +46,15 @@ var ( } ) -var _ = Describe("Yandexmetrica", func() { - Describe(("NewManager function test"), func() { +var _ = Describe("Antisymmetric", func() { + Describe("NewManager function test", func() { It("should return yandexmetrica manager", func() { yandexmetrica, err := yandexmetrica.NewManager(destination, backendconfig.DefaultBackendConfig) Expect(err).To(BeNil()) Expect(yandexmetrica).NotTo(BeNil()) }) }) - Describe(("Upload function test"), func() { + Describe("Upload function test", func() { It("Testing a successful scenario", func() { cache := oauthv2.NewCache() ctrl := gomock.NewController(GinkgoT()) diff --git a/services/dedup/badger/badger.go b/services/dedup/badger/badger.go index 181a7f6bf3..30f4a2af8d 100644 --- a/services/dedup/badger/badger.go +++ b/services/dedup/badger/badger.go @@ -3,6 +3,7 @@ package badger import ( "fmt" "strconv" + "sync" "time" "github.com/dgraph-io/badger/v4" @@ -26,6 +27,7 @@ type BadgerDB struct { gcDone chan struct{} path string opts badger.Options + once sync.Once } // DefaultPath returns the default path for the deduplication service's badger DB @@ -68,10 +70,15 @@ func NewBadgerDB(path string) *BadgerDB { return db } -func (d *BadgerDB) Get(key string) (int64, bool) { +func (d *BadgerDB) Get(key string) (int64, bool, error) { var payloadSize int64 var found bool - err := d.badgerDB.View(func(txn *badger.Txn) error { + var err error + err = d.init() + if err != nil { + return 0, false, err + } + err = d.badgerDB.View(func(txn *badger.Txn) error { item, err := txn.Get([]byte(key)) if err != nil { return err @@ -83,12 +90,16 @@ func (d *BadgerDB) Get(key string) (int64, bool) { return nil }) if err != nil && err != badger.ErrKeyNotFound { - panic(err) + return 0, false, err } - return payloadSize, found + return payloadSize, found, nil } func (d *BadgerDB) Set(kvs []types.KeyValue) error { + err := d.init() + if err != nil { + return err + } txn := d.badgerDB.NewTransaction(true) for _, message := range kvs { value := strconv.FormatInt(message.Value, 10) @@ -115,17 +126,20 @@ func (d *BadgerDB) Close() { _ = d.badgerDB.Close() } -func (d *BadgerDB) Start() { +func (d *BadgerDB) init() error { var err error - d.badgerDB, err = badger.Open(d.opts) - if err != nil { - panic(err) - } - rruntime.Go(func() { - d.gcLoop() - close(d.gcDone) + d.once.Do(func() { + d.badgerDB, err = badger.Open(d.opts) + if err != nil { + return + } + rruntime.Go(func() { + d.gcLoop() + close(d.gcDone) + }) }) + return err } func (d *BadgerDB) gcLoop() { diff --git a/services/dedup/dedup.go b/services/dedup/dedup.go index 89d3b41f7b..6efc08c99e 100644 --- a/services/dedup/dedup.go +++ b/services/dedup/dedup.go @@ -13,7 +13,6 @@ import ( // New creates a new deduplication service. The service needs to be closed after use. func New() Dedup { db := badger.NewBadgerDB(badger.DefaultPath()) - db.Start() return &dedup{ badgerDB: db, cache: make(map[string]int64), @@ -23,7 +22,7 @@ func New() Dedup { // Dedup is the interface for deduplication service type Dedup interface { // Set returns [true] if it was the first time the key was encountered, otherwise it returns [false] along with the previous value - Set(kv types.KeyValue) (bool, int64) + Set(kv types.KeyValue) (bool, int64, error) // Commit commits a list of previously set keys to the DB Commit(keys []string) error @@ -38,17 +37,20 @@ type dedup struct { cache map[string]int64 } -func (d *dedup) Set(kv types.KeyValue) (bool, int64) { +func (d *dedup) Set(kv types.KeyValue) (bool, int64, error) { d.cacheMu.Lock() defer d.cacheMu.Unlock() if previous, found := d.cache[kv.Key]; found { - return false, previous + return false, previous, nil + } + previous, found, err := d.badgerDB.Get(kv.Key) + if err != nil { + return false, 0, err } - previous, found := d.badgerDB.Get(kv.Key) if !found { d.cache[kv.Key] = kv.Value } - return !found, previous + return !found, previous, nil } func (d *dedup) Commit(keys []string) error { diff --git a/services/dedup/dedup_test.go b/services/dedup/dedup_test.go index dd7f641fb6..7d944c6d15 100644 --- a/services/dedup/dedup_test.go +++ b/services/dedup/dedup_test.go @@ -31,32 +31,37 @@ func Test_Dedup(t *testing.T) { defer d.Close() t.Run("if message id is not present in cache and badger db", func(t *testing.T) { - found, _ := d.Set(types.KeyValue{Key: "a", Value: 1}) + found, _, err := d.Set(types.KeyValue{Key: "a", Value: 1}) + require.Nil(t, err) require.Equal(t, true, found) // Checking it again should give us the previous value from the cache - found, value := d.Set(types.KeyValue{Key: "a", Value: 2}) + found, value, err := d.Set(types.KeyValue{Key: "a", Value: 2}) + require.Nil(t, err) require.Equal(t, false, found) require.Equal(t, int64(1), value) }) t.Run("if message is committed, previous value should always return", func(t *testing.T) { - found, _ := d.Set(types.KeyValue{Key: "b", Value: 1}) + found, _, err := d.Set(types.KeyValue{Key: "b", Value: 1}) + require.Nil(t, err) require.Equal(t, true, found) - err := d.Commit([]string{"a"}) + err = d.Commit([]string{"a"}) require.NoError(t, err) - found, value := d.Set(types.KeyValue{Key: "b", Value: 2}) + found, value, err := d.Set(types.KeyValue{Key: "b", Value: 2}) + require.Nil(t, err) require.Equal(t, false, found) require.Equal(t, int64(1), value) }) t.Run("committing a messageid not present in cache", func(t *testing.T) { - found, _ := d.Set(types.KeyValue{Key: "c", Value: 1}) + found, _, err := d.Set(types.KeyValue{Key: "c", Value: 1}) + require.Nil(t, err) require.Equal(t, true, found) - err := d.Commit([]string{"d"}) + err = d.Commit([]string{"d"}) require.NotNil(t, err) }) } @@ -73,17 +78,20 @@ func Test_Dedup_Window(t *testing.T) { d := dedup.New() defer d.Close() - found, _ := d.Set(types.KeyValue{Key: "to be deleted", Value: 1}) + found, _, err := d.Set(types.KeyValue{Key: "to be deleted", Value: 1}) + require.Nil(t, err) require.Equal(t, true, found) - err := d.Commit([]string{"to be deleted"}) + err = d.Commit([]string{"to be deleted"}) require.NoError(t, err) - found, _ = d.Set(types.KeyValue{Key: "to be deleted", Value: 2}) + found, _, err = d.Set(types.KeyValue{Key: "to be deleted", Value: 2}) + require.Nil(t, err) require.Equal(t, false, found) require.Eventually(t, func() bool { - found, _ = d.Set(types.KeyValue{Key: "to be deleted", Value: 3}) + found, _, err = d.Set(types.KeyValue{Key: "to be deleted", Value: 3}) + require.Nil(t, err) return found }, 2*time.Second, 100*time.Millisecond) } @@ -103,7 +111,7 @@ func Test_Dedup_ErrTxnTooBig(t *testing.T) { messages := make([]string, size) for i := 0; i < size; i++ { messages[i] = uuid.New().String() - d.Set(types.KeyValue{Key: messages[i], Value: int64(i + 1)}) + _, _, _ = d.Set(types.KeyValue{Key: messages[i], Value: int64(i + 1)}) } err := d.Commit(messages) require.NoError(t, err) @@ -134,7 +142,7 @@ func Benchmark_Dedup(b *testing.B) { if i%batchSize == batchSize-1 || i == b.N-1 { for _, msgID := range msgIDs[:i%batchSize] { - d.Set(msgID) + _, _, _ = d.Set(msgID) keys = append(keys, msgID.Key) } err := d.Commit(keys) From 8e393ae5859a5a0870cef5564ce44d8279b3bdc3 Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Wed, 24 Jul 2024 14:24:23 +0530 Subject: [PATCH 05/16] chore: refactor dedup package --- processor/manager.go | 6 ++- processor/processor.go | 9 +++- processor/processor_test.go | 19 +++++--- services/dedup/badger/badger.go | 55 ++++++++++++++++++++++- services/dedup/dedup.go | 77 +++++---------------------------- services/dedup/dedup_test.go | 23 +++++----- services/dedup/scylla/scylla.go | 15 ++++++- 7 files changed, 115 insertions(+), 89 deletions(-) diff --git a/processor/manager.go b/processor/manager.go index 07b03b87bf..a8351005fb 100644 --- a/processor/manager.go +++ b/processor/manager.go @@ -56,7 +56,7 @@ func (proc *LifecycleManager) Start() error { proc.Handle.transformer = proc.Transformer } - proc.Handle.Setup( + if err := proc.Handle.Setup( proc.BackendConfig, proc.gatewayDB, proc.routerDB, @@ -74,7 +74,9 @@ func (proc *LifecycleManager) Start() error { proc.transDebugger, proc.enrichers, proc.trackedUsersReporter, - ) + ); err != nil { + return err + } currentCtx, cancel := context.WithCancel(context.Background()) proc.currentCancel = cancel diff --git a/processor/processor.go b/processor/processor.go index 0680312150..1170b693a5 100644 --- a/processor/processor.go +++ b/processor/processor.go @@ -380,7 +380,7 @@ func (proc *Handle) Setup( transDebugger transformationdebugger.TransformationDebugger, enrichers []enricher.PipelineEnricher, trackedUsersReporter trackedusers.UsersReporter, -) { +) error { proc.reporting = reporting proc.destDebugger = destDebugger proc.transDebugger = transDebugger @@ -614,7 +614,11 @@ func (proc *Handle) Setup( }) } if proc.config.enableDedup { - proc.dedup = dedup.New(proc.conf, proc.statsFactory) + var err error + proc.dedup, err = dedup.New(proc.conf, proc.statsFactory) + if err != nil { + return err + } } proc.sourceObservers = []sourceObserver{delayed.NewEventStats(proc.statsFactory, proc.conf)} ctx, cancel := context.WithCancel(context.Background()) @@ -643,6 +647,7 @@ func (proc *Handle) Setup( })) proc.crashRecover() + return nil } func (proc *Handle) setupReloadableVars() { diff --git a/processor/processor_test.go b/processor/processor_test.go index 222b07f6c7..8a0d2b5895 100644 --- a/processor/processor_test.go +++ b/processor/processor_test.go @@ -1812,7 +1812,7 @@ var _ = Describe("Processor", Ordered, func() { // crash recover returns empty list c.mockGatewayJobsDB.EXPECT().DeleteExecuting().Times(1) - processor.Setup( + err := processor.Setup( c.mockBackendConfig, c.mockGatewayJobsDB, c.mockRouterJobsDB, @@ -1831,6 +1831,7 @@ var _ = Describe("Processor", Ordered, func() { []enricher.PipelineEnricher{}, trackedusers.NewNoopDataCollector(), ) + Expect(err).To(BeNil()) ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() Expect(processor.config.asyncInit.WaitContext(ctx)).To(BeNil()) @@ -1843,7 +1844,7 @@ var _ = Describe("Processor", Ordered, func() { c.mockGatewayJobsDB.EXPECT().DeleteExecuting().Times(1) - processor.Setup( + err := processor.Setup( c.mockBackendConfig, c.mockGatewayJobsDB, c.mockRouterJobsDB, @@ -1862,6 +1863,7 @@ var _ = Describe("Processor", Ordered, func() { []enricher.PipelineEnricher{}, trackedusers.NewNoopDataCollector(), ) + Expect(err).To(BeNil()) ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() Expect(processor.config.asyncInit.WaitContext(ctx)).To(BeNil()) @@ -1879,7 +1881,7 @@ var _ = Describe("Processor", Ordered, func() { processor := prepareHandle(NewHandle(config.Default, mockTransformer)) - processor.Setup( + err := processor.Setup( c.mockBackendConfig, c.mockGatewayJobsDB, c.mockRouterJobsDB, @@ -1898,6 +1900,7 @@ var _ = Describe("Processor", Ordered, func() { []enricher.PipelineEnricher{}, trackedusers.NewNoopDataCollector(), ) + Expect(err).To(BeNil()) ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() Expect(processor.config.asyncInit.WaitContext(ctx)).To(BeNil()) @@ -2982,7 +2985,7 @@ var _ = Describe("Processor", Ordered, func() { // crash recover returns empty list c.mockGatewayJobsDB.EXPECT().DeleteExecuting().Times(1) - processor.Setup( + err := processor.Setup( c.mockBackendConfig, c.mockGatewayJobsDB, c.mockRouterJobsDB, @@ -3001,7 +3004,7 @@ var _ = Describe("Processor", Ordered, func() { []enricher.PipelineEnricher{}, trackedusers.NewNoopDataCollector(), ) - + Expect(err).To(BeNil()) setMainLoopTimeout(processor, 1*time.Second) ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) @@ -3041,7 +3044,7 @@ var _ = Describe("Processor", Ordered, func() { // crash recover returns empty list c.mockGatewayJobsDB.EXPECT().DeleteExecuting().Times(1) - processor.Setup( + err := processor.Setup( c.mockBackendConfig, c.mockGatewayJobsDB, c.mockRouterJobsDB, @@ -3060,6 +3063,7 @@ var _ = Describe("Processor", Ordered, func() { []enricher.PipelineEnricher{}, trackedusers.NewNoopDataCollector(), ) + Expect(err).To(BeNil()) defer processor.Shutdown() processor.config.readLoopSleep = config.SingleValueLoader(time.Millisecond) @@ -5049,7 +5053,7 @@ func processorSetupAndAssertJobHandling(processor *Handle, c *testContext) { func Setup(processor *Handle, c *testContext, enableDedup, enableReporting bool) { setDisableDedupFeature(processor, enableDedup) - processor.Setup( + err := processor.Setup( c.mockBackendConfig, c.mockGatewayJobsDB, c.mockRouterJobsDB, @@ -5068,6 +5072,7 @@ func Setup(processor *Handle, c *testContext, enableDedup, enableReporting bool) []enricher.PipelineEnricher{}, trackedusers.NewNoopDataCollector(), ) + Expect(err).To(BeNil()) processor.reportingEnabled = enableReporting processor.sourceObservers = []sourceObserver{c.MockObserver} } diff --git a/services/dedup/badger/badger.go b/services/dedup/badger/badger.go index b8943d090a..0828430b49 100644 --- a/services/dedup/badger/badger.go +++ b/services/dedup/badger/badger.go @@ -40,7 +40,7 @@ func DefaultPath() string { return fmt.Sprintf(`%v%v`, tmpDirPath, badgerPathName) } -func NewBadgerDB(conf *config.Config, stats stats.Stats, path string) *BadgerDB { +func NewBadgerDB(conf *config.Config, stats stats.Stats, path string) *dedup { dedupWindow := conf.GetReloadableDurationVar(3600, time.Second, "Dedup.dedupWindow", "Dedup.dedupWindowInS") log := logger.NewLogger().Child("dedup") @@ -67,7 +67,10 @@ func NewBadgerDB(conf *config.Config, stats stats.Stats, path string) *BadgerDB window: dedupWindow, opts: badgerOpts, } - return db + return &dedup{ + badgerDB: db, + cache: make(map[string]int64), + } } func (d *BadgerDB) Get(key string) (int64, bool, error) { @@ -171,6 +174,54 @@ func (d *BadgerDB) gcLoop() { } } +type dedup struct { + badgerDB *BadgerDB + cacheMu sync.Mutex + cache map[string]int64 +} + +func (d *dedup) Set(kv types.KeyValue) (bool, int64, error) { + d.cacheMu.Lock() + defer d.cacheMu.Unlock() + if previous, found := d.cache[kv.Key]; found { + return false, previous, nil + } + previous, found, err := d.badgerDB.Get(kv.Key) + if err != nil { + return false, 0, err + } + if !found { + d.cache[kv.Key] = kv.Value + } + return !found, previous, nil +} + +func (d *dedup) Commit(keys []string) error { + d.cacheMu.Lock() + defer d.cacheMu.Unlock() + + kvs := make([]types.KeyValue, len(keys)) + for i, key := range keys { + value, ok := d.cache[key] + if !ok { + return fmt.Errorf("key %v has not been previously set", key) + } + kvs[i] = types.KeyValue{Key: key, Value: value} + } + + err := d.badgerDB.Set(kvs) + if err == nil { + for _, kv := range kvs { + delete(d.cache, kv.Key) + } + } + return err +} + +func (d *dedup) Close() { + d.badgerDB.Close() +} + type loggerForBadger struct { logger.Logger } diff --git a/services/dedup/dedup.go b/services/dedup/dedup.go index a5b9763abd..de81ff5f13 100644 --- a/services/dedup/dedup.go +++ b/services/dedup/dedup.go @@ -3,17 +3,14 @@ package dedup import ( - "fmt" - "sync" - - "github.com/rudderlabs/rudder-go-kit/stats" - "github.com/rudderlabs/rudder-go-kit/config" + "github.com/rudderlabs/rudder-go-kit/stats" "github.com/rudderlabs/rudder-server/services/dedup/badger" + "github.com/rudderlabs/rudder-server/services/dedup/scylla" "github.com/rudderlabs/rudder-server/services/dedup/types" ) -type Mode string // skipcq: RVV-B0009 +type Mode string const ( Badger Mode = "Badger" @@ -22,23 +19,21 @@ const ( ) // New creates a new deduplication service. The service needs to be closed after use. -func New(conf *config.Config, stats stats.Stats) Dedup { +func New(conf *config.Config, stats stats.Stats) (Dedup, error) { mode := Mode(config.GetString("Dedup.Mode", string(Badger))) switch mode { case Badger: - return &dedup{ - badgerDB: badger.NewBadgerDB(conf, stats, badger.DefaultPath()), - cache: make(map[string]int64), - } + return badger.NewBadgerDB(conf, stats, badger.DefaultPath()), nil case Scylla: - return nil + scylla, err := scylla.New(conf, stats) + if err != nil { + return nil, err + } + return scylla, nil case Dual: - return nil + return nil, nil default: - return &dedup{ - badgerDB: badger.NewBadgerDB(conf, stats, badger.DefaultPath()), - cache: make(map[string]int64), - } + return badger.NewBadgerDB(conf, stats, badger.DefaultPath()), nil } } @@ -53,51 +48,3 @@ type Dedup interface { // Close closes the deduplication service Close() } - -type dedup struct { - badgerDB *badger.BadgerDB - cacheMu sync.Mutex - cache map[string]int64 -} - -func (d *dedup) Set(kv types.KeyValue) (bool, int64, error) { - d.cacheMu.Lock() - defer d.cacheMu.Unlock() - if previous, found := d.cache[kv.Key]; found { - return false, previous, nil - } - previous, found, err := d.badgerDB.Get(kv.Key) - if err != nil { - return false, 0, err - } - if !found { - d.cache[kv.Key] = kv.Value - } - return !found, previous, nil -} - -func (d *dedup) Commit(keys []string) error { - d.cacheMu.Lock() - defer d.cacheMu.Unlock() - - kvs := make([]types.KeyValue, len(keys)) - for i, key := range keys { - value, ok := d.cache[key] - if !ok { - return fmt.Errorf("key %v has not been previously set", key) - } - kvs[i] = types.KeyValue{Key: key, Value: value} - } - - err := d.badgerDB.Set(kvs) - if err == nil { - for _, kv := range kvs { - delete(d.cache, kv.Key) - } - } - return err -} - -func (d *dedup) Close() { - d.badgerDB.Close() -} diff --git a/services/dedup/dedup_test.go b/services/dedup/dedup_test.go index 0dd9fbb6cb..154c861eef 100644 --- a/services/dedup/dedup_test.go +++ b/services/dedup/dedup_test.go @@ -7,18 +7,17 @@ import ( "testing" "time" - "github.com/rudderlabs/rudder-go-kit/stats" - "github.com/google/uuid" "github.com/stretchr/testify/require" - "github.com/rudderlabs/rudder-go-kit/testhelper/rand" - "github.com/rudderlabs/rudder-server/utils/misc" - "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" + "github.com/rudderlabs/rudder-go-kit/stats" + "github.com/rudderlabs/rudder-go-kit/testhelper/rand" + "github.com/rudderlabs/rudder-server/services/dedup" "github.com/rudderlabs/rudder-server/services/dedup/types" + "github.com/rudderlabs/rudder-server/utils/misc" ) func Test_Dedup(t *testing.T) { @@ -29,7 +28,8 @@ func Test_Dedup(t *testing.T) { dbPath := os.TempDir() + "/dedup_test" defer func() { _ = os.RemoveAll(dbPath) }() _ = os.RemoveAll(dbPath) - d := dedup.New(config.New(), stats.Default) + d, err := dedup.New(config.New(), stats.Default) + require.Nil(t, err) defer d.Close() t.Run("if message id is not present in cache and badger db", func(t *testing.T) { @@ -77,7 +77,8 @@ func Test_Dedup_Window(t *testing.T) { defer func() { _ = os.RemoveAll(dbPath) }() _ = os.RemoveAll(dbPath) config.Set("Dedup.dedupWindow", "1s") - d := dedup.New(config.New(), stats.Default) + d, err := dedup.New(config.New(), stats.Default) + require.Nil(t, err) defer d.Close() found, _, err := d.Set(types.KeyValue{Key: "to be deleted", Value: 1}) @@ -106,7 +107,8 @@ func Test_Dedup_ErrTxnTooBig(t *testing.T) { dbPath := os.TempDir() + "/dedup_test_errtxntoobig" defer os.RemoveAll(dbPath) os.RemoveAll(dbPath) - d := dedup.New(config.New(), stats.Default) + d, err := dedup.New(config.New(), stats.Default) + require.Nil(t, err) defer d.Close() size := 105_000 @@ -115,7 +117,7 @@ func Test_Dedup_ErrTxnTooBig(t *testing.T) { messages[i] = uuid.New().String() _, _, _ = d.Set(types.KeyValue{Key: messages[i], Value: int64(i + 1)}) } - err := d.Commit(messages) + err = d.Commit(messages) require.NoError(t, err) } @@ -128,7 +130,8 @@ func Benchmark_Dedup(b *testing.B) { b.Logf("using path %s, since tmpDir has issues in macOS\n", dbPath) defer func() { _ = os.RemoveAll(dbPath) }() _ = os.MkdirAll(dbPath, 0o750) - d := dedup.New(config.New(), stats.Default) + d, err := dedup.New(config.New(), stats.Default) + require.NoError(b, err) b.Run("no duplicates 1000 batch unique", func(b *testing.B) { batchSize := 1000 diff --git a/services/dedup/scylla/scylla.go b/services/dedup/scylla/scylla.go index 1840f41eed..dfc99efcb1 100644 --- a/services/dedup/scylla/scylla.go +++ b/services/dedup/scylla/scylla.go @@ -8,10 +8,11 @@ import ( "time" "github.com/gocql/gocql" - "github.com/google/uuid" + "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/stats" + "github.com/rudderlabs/rudder-server/services/dedup/types" ) var ( @@ -94,6 +95,18 @@ func (d *scyllaDB) BenchmarkNewKeys() { } } +func (d *scyllaDB) Close() { + d.scylla.Close() +} + +func (d *scyllaDB) Set(kv types.KeyValue) (bool, int64, error) { + return false, 0, nil +} + +func (d *scyllaDB) Commit(keys []string) error { + return nil +} + func New(conf *config.Config, stats stats.Stats) (*scyllaDB, error) { cluster := gocql.NewCluster(conf.GetString("Scylla.Hosts", "localhost:9042")) cluster.Consistency = gocql.Quorum From 64fdfd54550b877a9e32c1c01ca7fe89da293dce Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Fri, 26 Jul 2024 12:37:12 +0530 Subject: [PATCH 06/16] chore: refactor dedup --- mocks/services/dedup/mock_dedup.go | 2 +- processor/processor.go | 11 +- processor/processor_test.go | 12 +- processor/worker.go | 4 +- services/dedup/badger/badger.go | 17 +-- services/dedup/dedup.go | 21 ++- services/dedup/dedup_test.go | 45 +++--- services/dedup/mirrorBadger/mirrorBadger.go | 40 ++++++ services/dedup/mirrorScylla/mirrorScylla.go | 40 ++++++ services/dedup/scylla/scylla.go | 146 ++++++++------------ 10 files changed, 203 insertions(+), 135 deletions(-) create mode 100644 services/dedup/mirrorBadger/mirrorBadger.go create mode 100644 services/dedup/mirrorScylla/mirrorScylla.go diff --git a/mocks/services/dedup/mock_dedup.go b/mocks/services/dedup/mock_dedup.go index db321ec21e..17050566b7 100644 --- a/mocks/services/dedup/mock_dedup.go +++ b/mocks/services/dedup/mock_dedup.go @@ -52,7 +52,7 @@ func (mr *MockDedupMockRecorder) Close() *gomock.Call { } // Commit mocks base method. -func (m *MockDedup) Commit(arg0 []string) error { +func (m *MockDedup) Commit(arg0 map[string]types.KeyValue) error { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "Commit", arg0) ret0, _ := ret[0].(error) diff --git a/processor/processor.go b/processor/processor.go index 6f3aab4198..55818feb91 100644 --- a/processor/processor.go +++ b/processor/processor.go @@ -1603,7 +1603,7 @@ func (proc *Handle) processJobsForDest(partition string, subJobs subJob) *transf proc.logger.Debug("[Processor] Total jobs picked up : ", len(jobList)) marshalStart := time.Now() - dedupKeys := make(map[string]struct{}) + dedupKeys := make(map[string]dedupTypes.KeyValue) uniqueMessageIdsBySrcDestKey := make(map[string]map[string]struct{}) sourceDupStats := make(map[dupStatKey]int) @@ -1713,6 +1713,7 @@ func (proc *Handle) processJobsForDest(partition string, subJobs subJob) *transf p := payloadFunc() messageSize := int64(len(p)) dedupKey := fmt.Sprintf("%v%v", messageId, eventParams.SourceJobRunId) + dedupVal := dedupTypes.KeyValue{Key: dedupKey, Value: messageSize, WorkspaceId: batchEvent.WorkspaceId} ok, previousSize, err := proc.dedup.Set(dedupTypes.KeyValue{Key: dedupKey, Value: messageSize, WorkspaceId: batchEvent.WorkspaceId}) if err != nil { panic(err) @@ -1722,7 +1723,7 @@ func (proc *Handle) processJobsForDest(partition string, subJobs subJob) *transf sourceDupStats[dupStatKey{sourceID: source.ID, equalSize: messageSize == previousSize}] += 1 continue } - dedupKeys[dedupKey] = struct{}{} + dedupKeys[dedupKey] = dedupVal } proc.updateSourceEventStatsDetailed(singularEvent, sourceID) @@ -2086,7 +2087,7 @@ type transformationMessage struct { statusList []*jobsdb.JobStatusT procErrorJobs []*jobsdb.JobT sourceDupStats map[dupStatKey]int - dedupKeys map[string]struct{} + dedupKeys map[string]dedupTypes.KeyValue totalEvents int start time.Time @@ -2223,7 +2224,7 @@ type storeMessage struct { reportMetrics []*types.PUReportedMetric sourceDupStats map[dupStatKey]int - dedupKeys map[string]struct{} + dedupKeys map[string]dedupTypes.KeyValue totalEvents int start time.Time @@ -2439,7 +2440,7 @@ func (proc *Handle) Store(partition string, in *storeMessage) { if proc.config.enableDedup { proc.updateSourceStats(in.sourceDupStats, "processor_write_key_duplicate_events") if len(in.dedupKeys) > 0 { - if err := proc.dedup.Commit(lo.Keys(in.dedupKeys)); err != nil { + if err := proc.dedup.Commit(in.dedupKeys); err != nil { panic(err) } } diff --git a/processor/processor_test.go b/processor/processor_test.go index 583435c930..7d2e950bb7 100644 --- a/processor/processor_test.go +++ b/processor/processor_test.go @@ -12,6 +12,8 @@ import ( "testing" "time" + dedupTypes "github.com/rudderlabs/rudder-server/services/dedup/types" + "github.com/samber/lo" "github.com/rudderlabs/rudder-server/enterprise/trackedusers" @@ -5541,7 +5543,7 @@ func TestStoreMessageMerge(t *testing.T) { routerDestIDs: []string{"1"}, reportMetrics: []*types.PUReportedMetric{{}}, sourceDupStats: map[dupStatKey]int{{sourceID: "1"}: 1}, - dedupKeys: map[string]struct{}{"1": {}}, + dedupKeys: map[string]dedupTypes.KeyValue{"1": {}}, totalEvents: 1, trackedUsersReports: []*trackedusers.UsersReport{{WorkspaceID: sampleWorkspaceID}}, } @@ -5557,7 +5559,7 @@ func TestStoreMessageMerge(t *testing.T) { routerDestIDs: []string{"2"}, reportMetrics: []*types.PUReportedMetric{{}}, sourceDupStats: map[dupStatKey]int{{sourceID: "1"}: 2}, - dedupKeys: map[string]struct{}{"2": {}}, + dedupKeys: map[string]dedupTypes.KeyValue{"2": {}}, totalEvents: 1, trackedUsersReports: []*trackedusers.UsersReport{{WorkspaceID: sampleWorkspaceID}, {WorkspaceID: sampleWorkspaceID}}, } @@ -5575,7 +5577,7 @@ func TestStoreMessageMerge(t *testing.T) { []string{"3"}, []*types.PUReportedMetric{{}}, map[dupStatKey]int{{sourceID: "1"}: 3}, - map[string]struct{}{"3": {}}, + map[string]dedupTypes.KeyValue{"3": {}}, 1, time.Time{}, false, @@ -5586,7 +5588,7 @@ func TestStoreMessageMerge(t *testing.T) { merged := storeMessage{ procErrorJobsByDestID: map[string][]*jobsdb.JobT{}, sourceDupStats: map[dupStatKey]int{}, - dedupKeys: map[string]struct{}{}, + dedupKeys: map[string]dedupTypes.KeyValue{}, start: time.UnixMicro(99999999), } @@ -5639,7 +5641,7 @@ func TestStoreMessageMerge(t *testing.T) { routerDestIDs: []string{"1", "2", "3"}, reportMetrics: []*types.PUReportedMetric{{}, {}, {}}, sourceDupStats: map[dupStatKey]int{{sourceID: "1"}: 6}, - dedupKeys: map[string]struct{}{"1": {}, "2": {}, "3": {}}, + dedupKeys: map[string]dedupTypes.KeyValue{"1": {}, "2": {}, "3": {}}, totalEvents: 3, start: time.UnixMicro(99999999), }) diff --git a/processor/worker.go b/processor/worker.go index 6192eb663f..30c0f79816 100644 --- a/processor/worker.go +++ b/processor/worker.go @@ -5,6 +5,8 @@ import ( "sync" "time" + dedupTypes "github.com/rudderlabs/rudder-server/services/dedup/types" + "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-server/jobsdb" "github.com/rudderlabs/rudder-server/rruntime" @@ -104,7 +106,7 @@ func (w *worker) start() { if firstSubJob { mergedJob = &storeMessage{ rsourcesStats: subJob.rsourcesStats, - dedupKeys: make(map[string]struct{}), + dedupKeys: make(map[string]dedupTypes.KeyValue), procErrorJobsByDestID: make(map[string][]*jobsdb.JobT), sourceDupStats: make(map[dupStatKey]int), start: subJob.start, diff --git a/services/dedup/badger/badger.go b/services/dedup/badger/badger.go index 17f1a66e2d..430768724f 100644 --- a/services/dedup/badger/badger.go +++ b/services/dedup/badger/badger.go @@ -8,6 +8,7 @@ import ( "github.com/dgraph-io/badger/v4" "github.com/dgraph-io/badger/v4/options" + "github.com/samber/lo" "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" @@ -40,9 +41,9 @@ func DefaultPath() string { return fmt.Sprintf(`%v%v`, tmpDirPath, badgerPathName) } -func NewBadgerDB(conf *config.Config, stats stats.Stats, path string) *dedup { +func NewBadgerDB(conf *config.Config, stats stats.Stats, path string) *Dedup { dedupWindow := conf.GetReloadableDurationVar(3600, time.Second, "Dedup.dedupWindow", "Dedup.dedupWindowInS") - log := logger.NewLogger().Child("dedup") + log := logger.NewLogger().Child("Dedup") badgerOpts := badger. DefaultOptions(path). WithCompression(options.None). @@ -66,7 +67,7 @@ func NewBadgerDB(conf *config.Config, stats stats.Stats, path string) *dedup { window: dedupWindow, opts: badgerOpts, } - return &dedup{ + return &Dedup{ badgerDB: db, cache: make(map[string]int64), } @@ -173,13 +174,13 @@ func (d *BadgerDB) gcLoop() { } } -type dedup struct { +type Dedup struct { badgerDB *BadgerDB cacheMu sync.Mutex cache map[string]int64 } -func (d *dedup) Set(kv types.KeyValue) (bool, int64, error) { +func (d *Dedup) Set(kv types.KeyValue) (bool, int64, error) { d.cacheMu.Lock() defer d.cacheMu.Unlock() if previous, found := d.cache[kv.Key]; found { @@ -195,12 +196,12 @@ func (d *dedup) Set(kv types.KeyValue) (bool, int64, error) { return !found, previous, nil } -func (d *dedup) Commit(keys []string) error { +func (d *Dedup) Commit(keys map[string]types.KeyValue) error { d.cacheMu.Lock() defer d.cacheMu.Unlock() kvs := make([]types.KeyValue, len(keys)) - for i, key := range keys { + for i, key := range lo.Keys(keys) { value, ok := d.cache[key] if !ok { return fmt.Errorf("key %v has not been previously set", key) @@ -217,7 +218,7 @@ func (d *dedup) Commit(keys []string) error { return err } -func (d *dedup) Close() { +func (d *Dedup) Close() { d.badgerDB.Close() } diff --git a/services/dedup/dedup.go b/services/dedup/dedup.go index de81ff5f13..93e8f73bb5 100644 --- a/services/dedup/dedup.go +++ b/services/dedup/dedup.go @@ -6,6 +6,8 @@ import ( "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/stats" "github.com/rudderlabs/rudder-server/services/dedup/badger" + "github.com/rudderlabs/rudder-server/services/dedup/mirrorBadger" + "github.com/rudderlabs/rudder-server/services/dedup/mirrorScylla" "github.com/rudderlabs/rudder-server/services/dedup/scylla" "github.com/rudderlabs/rudder-server/services/dedup/types" ) @@ -13,9 +15,10 @@ import ( type Mode string const ( - Badger Mode = "Badger" - Scylla Mode = "Scylla" - Dual Mode = "Dual" + Badger Mode = "Badger" + Scylla Mode = "Scylla" + MirrorScylla Mode = "MirrorScylla" + MirrorBadger Mode = "MirrorBadger" ) // New creates a new deduplication service. The service needs to be closed after use. @@ -30,8 +33,14 @@ func New(conf *config.Config, stats stats.Stats) (Dedup, error) { return nil, err } return scylla, nil - case Dual: - return nil, nil + case MirrorScylla: + // Writes happen to both + // Read only from Scylla + return mirrorScylla.NewMirrorScylla(conf, stats) + case MirrorBadger: + // Writes happen to both + // Read only from Badger + return mirrorBadger.NewMirrorBadger(conf, stats) default: return badger.NewBadgerDB(conf, stats, badger.DefaultPath()), nil } @@ -43,7 +52,7 @@ type Dedup interface { Set(kv types.KeyValue) (bool, int64, error) // Commit commits a list of previously set keys to the DB - Commit(keys []string) error + Commit(keys map[string]types.KeyValue) error // Close closes the deduplication service Close() diff --git a/services/dedup/dedup_test.go b/services/dedup/dedup_test.go index 154c861eef..b113a2fbdb 100644 --- a/services/dedup/dedup_test.go +++ b/services/dedup/dedup_test.go @@ -28,7 +28,9 @@ func Test_Dedup(t *testing.T) { dbPath := os.TempDir() + "/dedup_test" defer func() { _ = os.RemoveAll(dbPath) }() _ = os.RemoveAll(dbPath) - d, err := dedup.New(config.New(), stats.Default) + conf := config.New() + t.Setenv("RUDDER_TMPDIR", dbPath) + d, err := dedup.New(conf, stats.Default) require.Nil(t, err) defer d.Close() @@ -49,7 +51,7 @@ func Test_Dedup(t *testing.T) { require.Nil(t, err) require.Equal(t, true, found) - err = d.Commit([]string{"a"}) + err = d.Commit(map[string]types.KeyValue{"a": {Key: "a", Value: 1}}) require.NoError(t, err) found, value, err := d.Set(types.KeyValue{Key: "b", Value: 2}) @@ -63,7 +65,7 @@ func Test_Dedup(t *testing.T) { require.Nil(t, err) require.Equal(t, true, found) - err = d.Commit([]string{"d"}) + err = d.Commit(map[string]types.KeyValue{"d": {Key: "d", Value: 2}}) require.NotNil(t, err) }) } @@ -74,10 +76,12 @@ func Test_Dedup_Window(t *testing.T) { misc.Init() dbPath := os.TempDir() + "/dedup_test" + conf := config.New() defer func() { _ = os.RemoveAll(dbPath) }() _ = os.RemoveAll(dbPath) - config.Set("Dedup.dedupWindow", "1s") - d, err := dedup.New(config.New(), stats.Default) + conf.Set("Dedup.dedupWindow", "1s") + t.Setenv("RUDDER_TMPDIR", dbPath) + d, err := dedup.New(conf, stats.Default) require.Nil(t, err) defer d.Close() @@ -85,7 +89,9 @@ func Test_Dedup_Window(t *testing.T) { require.Nil(t, err) require.Equal(t, true, found) - err = d.Commit([]string{"to be deleted"}) + key := "to be deleted" + value := types.KeyValue{Key: key, Value: 2} + err = d.Commit(map[string]types.KeyValue{key: value}) require.NoError(t, err) found, _, err = d.Set(types.KeyValue{Key: "to be deleted", Value: 2}) @@ -107,15 +113,18 @@ func Test_Dedup_ErrTxnTooBig(t *testing.T) { dbPath := os.TempDir() + "/dedup_test_errtxntoobig" defer os.RemoveAll(dbPath) os.RemoveAll(dbPath) - d, err := dedup.New(config.New(), stats.Default) + conf := config.New() + t.Setenv("RUDDER_TMPDIR", dbPath) + d, err := dedup.New(conf, stats.Default) require.Nil(t, err) defer d.Close() size := 105_000 - messages := make([]string, size) + messages := map[string]types.KeyValue{} for i := 0; i < size; i++ { - messages[i] = uuid.New().String() - _, _, _ = d.Set(types.KeyValue{Key: messages[i], Value: int64(i + 1)}) + key := uuid.New().String() + messages[key] = types.KeyValue{Key: key, Value: int64(i + 1)} + _, _, _ = d.Set(messages[key]) } err = d.Commit(messages) require.NoError(t, err) @@ -130,29 +139,31 @@ func Benchmark_Dedup(b *testing.B) { b.Logf("using path %s, since tmpDir has issues in macOS\n", dbPath) defer func() { _ = os.RemoveAll(dbPath) }() _ = os.MkdirAll(dbPath, 0o750) - d, err := dedup.New(config.New(), stats.Default) + conf := config.New() + b.Setenv("RUDDER_TMPDIR", dbPath) + d, err := dedup.New(conf, stats.Default) require.NoError(b, err) b.Run("no duplicates 1000 batch unique", func(b *testing.B) { batchSize := 1000 msgIDs := make([]types.KeyValue, batchSize) - keys := make([]string, 0) + messages := map[string]types.KeyValue{} for i := 0; i < b.N; i++ { + key := uuid.New().String() msgIDs[i%batchSize] = types.KeyValue{ - Key: uuid.New().String(), + Key: key, Value: int64(i + 1), } - + messages[key] = msgIDs[i%batchSize] if i%batchSize == batchSize-1 || i == b.N-1 { for _, msgID := range msgIDs[:i%batchSize] { _, _, _ = d.Set(msgID) - keys = append(keys, msgID.Key) } - err := d.Commit(keys) + err := d.Commit(messages) require.NoError(b, err) - keys = nil + messages = nil } } b.ReportMetric(float64(b.N), "events") diff --git a/services/dedup/mirrorBadger/mirrorBadger.go b/services/dedup/mirrorBadger/mirrorBadger.go new file mode 100644 index 0000000000..b1d7045e5f --- /dev/null +++ b/services/dedup/mirrorBadger/mirrorBadger.go @@ -0,0 +1,40 @@ +package mirrorBadger + +import ( + "github.com/rudderlabs/rudder-go-kit/config" + "github.com/rudderlabs/rudder-go-kit/stats" + "github.com/rudderlabs/rudder-server/services/dedup/badger" + "github.com/rudderlabs/rudder-server/services/dedup/scylla" + "github.com/rudderlabs/rudder-server/services/dedup/types" +) + +type MirrorBadger struct { + badger *badger.Dedup + scylla *scylla.ScyllaDB +} + +func NewMirrorBadger(conf *config.Config, stats stats.Stats) (*MirrorBadger, error) { + badger := badger.NewBadgerDB(conf, stats, badger.DefaultPath()) + scylla, err := scylla.New(conf, stats) + if err != nil { + return nil, err + } + return &MirrorBadger{ + badger: badger, + scylla: scylla, + }, nil +} + +func (mb *MirrorBadger) Close() { + mb.scylla.Close() + mb.badger.Close() +} + +func (mb *MirrorBadger) Set(kv types.KeyValue) (bool, int64, error) { + return mb.badger.Set(kv) +} + +func (mb *MirrorBadger) Commit(keys map[string]types.KeyValue) error { + _ = mb.scylla.Commit(keys) + return mb.badger.Commit(keys) +} diff --git a/services/dedup/mirrorScylla/mirrorScylla.go b/services/dedup/mirrorScylla/mirrorScylla.go new file mode 100644 index 0000000000..3a6b9e7062 --- /dev/null +++ b/services/dedup/mirrorScylla/mirrorScylla.go @@ -0,0 +1,40 @@ +package mirrorScylla + +import ( + "github.com/rudderlabs/rudder-go-kit/config" + "github.com/rudderlabs/rudder-go-kit/stats" + "github.com/rudderlabs/rudder-server/services/dedup/badger" + "github.com/rudderlabs/rudder-server/services/dedup/scylla" + "github.com/rudderlabs/rudder-server/services/dedup/types" +) + +type MirrorScylla struct { + badger *badger.Dedup + scylla *scylla.ScyllaDB +} + +func NewMirrorScylla(conf *config.Config, stats stats.Stats) (*MirrorScylla, error) { + badger := badger.NewBadgerDB(conf, stats, badger.DefaultPath()) + scylla, err := scylla.New(conf, stats) + if err != nil { + return nil, err + } + return &MirrorScylla{ + badger: badger, + scylla: scylla, + }, nil +} + +func (ms *MirrorScylla) Close() { + ms.scylla.Close() + ms.badger.Close() +} + +func (ms *MirrorScylla) Set(kv types.KeyValue) (bool, int64, error) { + return ms.scylla.Set(kv) +} + +func (ms *MirrorScylla) Commit(keys map[string]types.KeyValue) error { + _ = ms.badger.Commit(keys) + return ms.scylla.Commit(keys) +} diff --git a/services/dedup/scylla/scylla.go b/services/dedup/scylla/scylla.go index 204fafcd23..7671b08c15 100644 --- a/services/dedup/scylla/scylla.go +++ b/services/dedup/scylla/scylla.go @@ -1,131 +1,93 @@ package scylla import ( - "context" "fmt" - "github.com/samber/lo" - "sync" - "sync/atomic" "time" "github.com/gocql/gocql" - "github.com/google/uuid" + "github.com/samber/lo" "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/stats" "github.com/rudderlabs/rudder-server/services/dedup/types" ) -var ( - keySpace = "scylla" - loadTable = "dedup" -) - -type scyllaDB struct { +type ScyllaDB struct { scylla *gocql.Session conf *config.Config stat stats.Stats - timeout time.Duration ttl int - createTableMap map[string]sync.Once + batchSize int + createTableMap map[string]struct{} } -func (d *scyllaDB) PopulateBatch() (time.Duration, error) { - startTime := time.Now() - if err := d.scylla.Query(fmt.Sprintf("CREATE KEYSPACE IF NOT EXISTS %s with replication = {'class': 'SimpleStrategy', 'replication_factor': 3}", keySpace)).Exec(); err != nil { - return 0, err - } - fmt.Println("Created Keyspace") - tableName := fmt.Sprintf("%s.%s", keySpace, loadTable) - if err := d.scylla.Query(fmt.Sprintf( - `CREATE TABLE IF NOT EXISTS %s ( - id uuid PRIMARY KEY) WITH bloom_filter_fp_chance = 0.005 and default_time_to_live = %d;`, tableName, d.ttl, - )).Exec(); err != nil { - return 0, err - } - fmt.Println("Created Table") - totalWorkspaces := d.conf.GetInt("ScyllaDB.totalWorkspaces", 10) - batchSize := d.conf.GetInt("ScyllaDB.batchSize", 20) - var totalCount atomic.Int64 - var wg sync.WaitGroup - for i := 0; i < totalWorkspaces; i++ { - fmt.Println("Starting workspace", i) - errorStat := d.stat.NewTaggedStat("scylla_batch_write_error", stats.CountType, stats.Tags{}) - countStat := d.stat.NewTaggedStat("scylla_batch_write_count", stats.CountType, stats.Tags{}) - wg.Add(1) - go func() { - defer wg.Done() - totalParallelGoRoutines := d.conf.GetInt("ScyllaDB.totalParallelGoRoutines", 1) - for j := 0; j < totalParallelGoRoutines; j++ { - batch := d.scylla.NewBatch(gocql.LoggedBatch).WithContext(context.Background()) - fmt.Println("Starting parallel go routine", j) - go func() { - for { - time.Sleep(4 * time.Millisecond) - uid := uuid.New().String() - batch.Entries = append(batch.Entries, gocql.BatchEntry{ - Stmt: fmt.Sprintf("INSERT INTO %s.%s (id) VALUES (?)", keySpace, loadTable), - Args: []interface{}{uid}, - }) - totalCount.Add(1) - if len(batch.Entries) == batchSize { - if err := d.scylla.ExecuteBatch(batch); err != nil { - fmt.Printf("failed executing batch into scylla with %v and batch size %d", err, batch.Size()) - errorStat.Increment() - continue - } - countStat.Count(len(batch.Entries)) - batch = d.scylla.NewBatch(gocql.LoggedBatch).WithContext(context.Background()) - } - } - }() - } - }() - } - wg.Wait() - return time.Since(startTime), nil +func (d *ScyllaDB) Close() { + d.scylla.Close() } -func (d *scyllaDB) BenchmarkNewKeys() { - var key string - err := d.scylla.Query("SELECT id FROM scylla.dedup WHERE id = ?", uuid.New().String()).Scan(&key) +func (d *ScyllaDB) Set(kv types.KeyValue) (bool, int64, error) { + // Create the table if it doesn't exist + if _, ok := d.createTableMap[kv.WorkspaceId]; !ok { + err := d.scylla.Query(fmt.Sprintf("CREATE TABLE IF NOT EXISTS %s (id text PRIMARY KEY, value bigint) WITH bloom_filter_fp_chance = 0.005", kv.WorkspaceId)).Exec() + if err != nil { + return false, 0, err + } + d.createTableMap[kv.WorkspaceId] = struct{}{} + } + + // Check if the key exists in the DB + var value int64 + err := d.scylla.Query(fmt.Sprintf("SELECT value FROM %s WHERE id = ?", kv.WorkspaceId), kv.Key).Scan(&value) if err != nil && err != gocql.ErrNotFound { - fmt.Println("Error closing iterator with", err) + return false, 0, err } - if key != "" { - fmt.Println("key exists") + if err == gocql.ErrNotFound { + return true, 0, nil } + return false, kv.Value, nil } -func (d *scyllaDB) Close() { - d.scylla.Close() -} - -func (d *scyllaDB) Set(kv types.KeyValue) (bool, int64, error) { - return false, 0, nil -} - -func (d *scyllaDB) Commit(keys []types.KeyValue) error { - keyList := lo.PartitionBy(keys, func(kv types.KeyValue) string { +func (d *ScyllaDB) Commit(keys map[string]types.KeyValue) error { + keysList := lo.PartitionBy(lo.Values(keys), func(kv types.KeyValue) string { return kv.WorkspaceId }) - d.scylla.NewBatch(gocql.LoggedBatch).WithContext(context.Background()) + for _, keys := range keysList { + batches := lo.Chunk(keys, d.batchSize) + for _, batch := range batches { + scyllaBatch := d.scylla.NewBatch(gocql.LoggedBatch) + for _, key := range batch { + scyllaBatch.Entries = append(scyllaBatch.Entries, gocql.BatchEntry{ + Stmt: fmt.Sprintf("INSERT INTO %s (id) VALUES (?,?) USING TTL %d", key.WorkspaceId, d.ttl), + Args: []interface{}{key.Key, key.Value}, + }) + } + if err := d.scylla.ExecuteBatch(scyllaBatch); err != nil { + return err + } + } + } return nil } -func New(conf *config.Config, stats stats.Stats) (*scyllaDB, error) { +func New(conf *config.Config, stats stats.Stats) (*ScyllaDB, error) { cluster := gocql.NewCluster(conf.GetString("Scylla.Hosts", "localhost:9042")) cluster.Consistency = gocql.Quorum - + cluster.Keyspace = config.GetString("Scylla.Keyspace", "dedup") + cluster.RetryPolicy = &gocql.ExponentialBackoffRetryPolicy{ + NumRetries: config.GetInt("Scylla.NumRetries", 3), + Min: conf.GetDuration("Scylla.MinRetry", 100, time.Millisecond), + Max: conf.GetDuration("Scylla.MaxRetry", 2000, time.Second), + } + cluster.Timeout = conf.GetDuration("Scylla.Timeout", 10, time.Second) session, err := cluster.CreateSession() if err != nil { return nil, err } - return &scyllaDB{ - scylla: session, - conf: conf, - stat: stats, - timeout: conf.GetDuration("Scylla.Timeout", 10, time.Second), - ttl: conf.GetInt("Scylla.TTL", 1209600), + return &ScyllaDB{ + scylla: session, + conf: conf, + stat: stats, + ttl: conf.GetInt("Scylla.TTL", 1209600), // TTL is defaulted to seconds + batchSize: conf.GetInt("Scylla.BatchSize", 100), }, nil } From 0dfb323879cd3583e8c7e9b3cb176d16f0a94cb3 Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Mon, 29 Jul 2024 14:38:24 +0530 Subject: [PATCH 07/16] chore: review comments --- mocks/services/dedup/mock_dedup.go | 14 +++--- processor/processor.go | 13 +++--- processor/processor_test.go | 22 ++++----- processor/worker.go | 4 +- services/dedup/badger/badger.go | 8 ++-- services/dedup/dedup.go | 6 +-- services/dedup/dedup_test.go | 42 ++++++++--------- services/dedup/mirrorBadger/mirrorBadger.go | 6 +-- services/dedup/mirrorScylla/mirrorScylla.go | 6 +-- services/dedup/scylla/scylla.go | 50 ++++++++++++++++----- 10 files changed, 92 insertions(+), 79 deletions(-) diff --git a/mocks/services/dedup/mock_dedup.go b/mocks/services/dedup/mock_dedup.go index 17050566b7..18d2cba3af 100644 --- a/mocks/services/dedup/mock_dedup.go +++ b/mocks/services/dedup/mock_dedup.go @@ -52,7 +52,7 @@ func (mr *MockDedupMockRecorder) Close() *gomock.Call { } // Commit mocks base method. -func (m *MockDedup) Commit(arg0 map[string]types.KeyValue) error { +func (m *MockDedup) Commit(arg0 []string) error { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "Commit", arg0) ret0, _ := ret[0].(error) @@ -65,18 +65,18 @@ func (mr *MockDedupMockRecorder) Commit(arg0 any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Commit", reflect.TypeOf((*MockDedup)(nil).Commit), arg0) } -// Set mocks base method. -func (m *MockDedup) Set(arg0 types.KeyValue) (bool, int64, error) { +// Get mocks base method. +func (m *MockDedup) Get(arg0 types.KeyValue) (bool, int64, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Set", arg0) + ret := m.ctrl.Call(m, "Get", arg0) ret0, _ := ret[0].(bool) ret1, _ := ret[1].(int64) ret2, _ := ret[2].(error) return ret0, ret1, ret2 } -// Set indicates an expected call of Set. -func (mr *MockDedupMockRecorder) Set(arg0 any) *gomock.Call { +// Get indicates an expected call of Get. +func (mr *MockDedupMockRecorder) Get(arg0 any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Set", reflect.TypeOf((*MockDedup)(nil).Set), arg0) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Get", reflect.TypeOf((*MockDedup)(nil).Get), arg0) } diff --git a/processor/processor.go b/processor/processor.go index 55818feb91..3783a33a71 100644 --- a/processor/processor.go +++ b/processor/processor.go @@ -1603,7 +1603,7 @@ func (proc *Handle) processJobsForDest(partition string, subJobs subJob) *transf proc.logger.Debug("[Processor] Total jobs picked up : ", len(jobList)) marshalStart := time.Now() - dedupKeys := make(map[string]dedupTypes.KeyValue) + dedupKeys := make(map[string]struct{}) uniqueMessageIdsBySrcDestKey := make(map[string]map[string]struct{}) sourceDupStats := make(map[dupStatKey]int) @@ -1713,8 +1713,7 @@ func (proc *Handle) processJobsForDest(partition string, subJobs subJob) *transf p := payloadFunc() messageSize := int64(len(p)) dedupKey := fmt.Sprintf("%v%v", messageId, eventParams.SourceJobRunId) - dedupVal := dedupTypes.KeyValue{Key: dedupKey, Value: messageSize, WorkspaceId: batchEvent.WorkspaceId} - ok, previousSize, err := proc.dedup.Set(dedupTypes.KeyValue{Key: dedupKey, Value: messageSize, WorkspaceId: batchEvent.WorkspaceId}) + ok, previousSize, err := proc.dedup.Get(dedupTypes.KeyValue{Key: dedupKey, Value: messageSize, WorkspaceId: batchEvent.WorkspaceId}) if err != nil { panic(err) } @@ -1723,7 +1722,7 @@ func (proc *Handle) processJobsForDest(partition string, subJobs subJob) *transf sourceDupStats[dupStatKey{sourceID: source.ID, equalSize: messageSize == previousSize}] += 1 continue } - dedupKeys[dedupKey] = dedupVal + dedupKeys[dedupKey] = struct{}{} } proc.updateSourceEventStatsDetailed(singularEvent, sourceID) @@ -2087,7 +2086,7 @@ type transformationMessage struct { statusList []*jobsdb.JobStatusT procErrorJobs []*jobsdb.JobT sourceDupStats map[dupStatKey]int - dedupKeys map[string]dedupTypes.KeyValue + dedupKeys map[string]struct{} totalEvents int start time.Time @@ -2224,7 +2223,7 @@ type storeMessage struct { reportMetrics []*types.PUReportedMetric sourceDupStats map[dupStatKey]int - dedupKeys map[string]dedupTypes.KeyValue + dedupKeys map[string]struct{} totalEvents int start time.Time @@ -2440,7 +2439,7 @@ func (proc *Handle) Store(partition string, in *storeMessage) { if proc.config.enableDedup { proc.updateSourceStats(in.sourceDupStats, "processor_write_key_duplicate_events") if len(in.dedupKeys) > 0 { - if err := proc.dedup.Commit(in.dedupKeys); err != nil { + if err := proc.dedup.Commit(lo.Keys(in.dedupKeys)); err != nil { panic(err) } } diff --git a/processor/processor_test.go b/processor/processor_test.go index 7d2e950bb7..dd4b7f3109 100644 --- a/processor/processor_test.go +++ b/processor/processor_test.go @@ -12,16 +12,11 @@ import ( "testing" "time" - dedupTypes "github.com/rudderlabs/rudder-server/services/dedup/types" - - "github.com/samber/lo" - - "github.com/rudderlabs/rudder-server/enterprise/trackedusers" - "github.com/google/uuid" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/onsi/gomega/format" + "github.com/samber/lo" "github.com/stretchr/testify/require" "github.com/tidwall/gjson" "github.com/tidwall/sjson" @@ -34,6 +29,7 @@ import ( "github.com/rudderlabs/rudder-server/admin" backendconfig "github.com/rudderlabs/rudder-server/backend-config" + "github.com/rudderlabs/rudder-server/enterprise/trackedusers" "github.com/rudderlabs/rudder-server/internal/enricher" "github.com/rudderlabs/rudder-server/jobsdb" mocksBackendConfig "github.com/rudderlabs/rudder-server/mocks/backend-config" @@ -80,7 +76,7 @@ type mockTrackedUsersReporter struct { } } -func (m *mockTrackedUsersReporter) ReportUsers(ctx context.Context, reports []*trackedusers.UsersReport, tx *Tx) error { +func (m *mockTrackedUsersReporter) ReportUsers(_ context.Context, reports []*trackedusers.UsersReport, _ *Tx) error { m.reportCalls = append(m.reportCalls, struct { reportedReports []*trackedusers.UsersReport }{reportedReports: reports}) @@ -2538,7 +2534,7 @@ var _ = Describe("Processor", Ordered, func() { mockTransformer := mocksTransformer.NewMockTransformer(c.mockCtrl) callUnprocessed := c.mockGatewayJobsDB.EXPECT().GetUnprocessed(gomock.Any(), gomock.Any()).Return(jobsdb.JobsResult{Jobs: unprocessedJobsList}, nil).Times(1) - c.MockDedup.EXPECT().Set(gomock.Any()).Return(true, int64(0), nil).After(callUnprocessed).Times(3) + c.MockDedup.EXPECT().Get(gomock.Any()).Return(true, int64(0), nil).After(callUnprocessed).Times(3) c.MockDedup.EXPECT().Commit(gomock.Any()).Times(1) // We expect one transform call to destination A, after callUnprocessed. @@ -5543,7 +5539,7 @@ func TestStoreMessageMerge(t *testing.T) { routerDestIDs: []string{"1"}, reportMetrics: []*types.PUReportedMetric{{}}, sourceDupStats: map[dupStatKey]int{{sourceID: "1"}: 1}, - dedupKeys: map[string]dedupTypes.KeyValue{"1": {}}, + dedupKeys: map[string]struct{}{"1": {}}, totalEvents: 1, trackedUsersReports: []*trackedusers.UsersReport{{WorkspaceID: sampleWorkspaceID}}, } @@ -5559,7 +5555,7 @@ func TestStoreMessageMerge(t *testing.T) { routerDestIDs: []string{"2"}, reportMetrics: []*types.PUReportedMetric{{}}, sourceDupStats: map[dupStatKey]int{{sourceID: "1"}: 2}, - dedupKeys: map[string]dedupTypes.KeyValue{"2": {}}, + dedupKeys: map[string]struct{}{"2": {}}, totalEvents: 1, trackedUsersReports: []*trackedusers.UsersReport{{WorkspaceID: sampleWorkspaceID}, {WorkspaceID: sampleWorkspaceID}}, } @@ -5577,7 +5573,7 @@ func TestStoreMessageMerge(t *testing.T) { []string{"3"}, []*types.PUReportedMetric{{}}, map[dupStatKey]int{{sourceID: "1"}: 3}, - map[string]dedupTypes.KeyValue{"3": {}}, + map[string]struct{}{"3": {}}, 1, time.Time{}, false, @@ -5588,7 +5584,7 @@ func TestStoreMessageMerge(t *testing.T) { merged := storeMessage{ procErrorJobsByDestID: map[string][]*jobsdb.JobT{}, sourceDupStats: map[dupStatKey]int{}, - dedupKeys: map[string]dedupTypes.KeyValue{}, + dedupKeys: map[string]struct{}{}, start: time.UnixMicro(99999999), } @@ -5641,7 +5637,7 @@ func TestStoreMessageMerge(t *testing.T) { routerDestIDs: []string{"1", "2", "3"}, reportMetrics: []*types.PUReportedMetric{{}, {}, {}}, sourceDupStats: map[dupStatKey]int{{sourceID: "1"}: 6}, - dedupKeys: map[string]dedupTypes.KeyValue{"1": {}, "2": {}, "3": {}}, + dedupKeys: map[string]struct{}{"1": {}, "2": {}, "3": {}}, totalEvents: 3, start: time.UnixMicro(99999999), }) diff --git a/processor/worker.go b/processor/worker.go index 30c0f79816..6192eb663f 100644 --- a/processor/worker.go +++ b/processor/worker.go @@ -5,8 +5,6 @@ import ( "sync" "time" - dedupTypes "github.com/rudderlabs/rudder-server/services/dedup/types" - "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-server/jobsdb" "github.com/rudderlabs/rudder-server/rruntime" @@ -106,7 +104,7 @@ func (w *worker) start() { if firstSubJob { mergedJob = &storeMessage{ rsourcesStats: subJob.rsourcesStats, - dedupKeys: make(map[string]dedupTypes.KeyValue), + dedupKeys: make(map[string]struct{}), procErrorJobsByDestID: make(map[string][]*jobsdb.JobT), sourceDupStats: make(map[dupStatKey]int), start: subJob.start, diff --git a/services/dedup/badger/badger.go b/services/dedup/badger/badger.go index 430768724f..a61dd9fe72 100644 --- a/services/dedup/badger/badger.go +++ b/services/dedup/badger/badger.go @@ -8,7 +8,6 @@ import ( "github.com/dgraph-io/badger/v4" "github.com/dgraph-io/badger/v4/options" - "github.com/samber/lo" "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" @@ -180,7 +179,7 @@ type Dedup struct { cache map[string]int64 } -func (d *Dedup) Set(kv types.KeyValue) (bool, int64, error) { +func (d *Dedup) Get(kv types.KeyValue) (bool, int64, error) { d.cacheMu.Lock() defer d.cacheMu.Unlock() if previous, found := d.cache[kv.Key]; found { @@ -196,12 +195,11 @@ func (d *Dedup) Set(kv types.KeyValue) (bool, int64, error) { return !found, previous, nil } -func (d *Dedup) Commit(keys map[string]types.KeyValue) error { +func (d *Dedup) Commit(keys []string) error { d.cacheMu.Lock() defer d.cacheMu.Unlock() - kvs := make([]types.KeyValue, len(keys)) - for i, key := range lo.Keys(keys) { + for i, key := range keys { value, ok := d.cache[key] if !ok { return fmt.Errorf("key %v has not been previously set", key) diff --git a/services/dedup/dedup.go b/services/dedup/dedup.go index 93e8f73bb5..c602d471ab 100644 --- a/services/dedup/dedup.go +++ b/services/dedup/dedup.go @@ -48,11 +48,11 @@ func New(conf *config.Config, stats stats.Stats) (Dedup, error) { // Dedup is the interface for deduplication service type Dedup interface { - // Set returns [true] if it was the first time the key was encountered, otherwise it returns [false] along with the previous value - Set(kv types.KeyValue) (bool, int64, error) + // Get returns [true] if it was the first time the key was encountered, otherwise it returns [false] along with the previous value + Get(kv types.KeyValue) (bool, int64, error) // Commit commits a list of previously set keys to the DB - Commit(keys map[string]types.KeyValue) error + Commit(keys []string) error // Close closes the deduplication service Close() diff --git a/services/dedup/dedup_test.go b/services/dedup/dedup_test.go index b113a2fbdb..82808c9db7 100644 --- a/services/dedup/dedup_test.go +++ b/services/dedup/dedup_test.go @@ -35,37 +35,37 @@ func Test_Dedup(t *testing.T) { defer d.Close() t.Run("if message id is not present in cache and badger db", func(t *testing.T) { - found, _, err := d.Set(types.KeyValue{Key: "a", Value: 1}) + found, _, err := d.Get(types.KeyValue{Key: "a", Value: 1}) require.Nil(t, err) require.Equal(t, true, found) // Checking it again should give us the previous value from the cache - found, value, err := d.Set(types.KeyValue{Key: "a", Value: 2}) + found, value, err := d.Get(types.KeyValue{Key: "a", Value: 2}) require.Nil(t, err) require.Equal(t, false, found) require.Equal(t, int64(1), value) }) t.Run("if message is committed, previous value should always return", func(t *testing.T) { - found, _, err := d.Set(types.KeyValue{Key: "b", Value: 1}) + found, _, err := d.Get(types.KeyValue{Key: "b", Value: 1}) require.Nil(t, err) require.Equal(t, true, found) - err = d.Commit(map[string]types.KeyValue{"a": {Key: "a", Value: 1}}) + err = d.Commit([]string{"a"}) require.NoError(t, err) - found, value, err := d.Set(types.KeyValue{Key: "b", Value: 2}) + found, value, err := d.Get(types.KeyValue{Key: "b", Value: 2}) require.Nil(t, err) require.Equal(t, false, found) require.Equal(t, int64(1), value) }) t.Run("committing a messageid not present in cache", func(t *testing.T) { - found, _, err := d.Set(types.KeyValue{Key: "c", Value: 1}) + found, _, err := d.Get(types.KeyValue{Key: "c", Value: 1}) require.Nil(t, err) require.Equal(t, true, found) - err = d.Commit(map[string]types.KeyValue{"d": {Key: "d", Value: 2}}) + err = d.Commit([]string{"d"}) require.NotNil(t, err) }) } @@ -85,21 +85,19 @@ func Test_Dedup_Window(t *testing.T) { require.Nil(t, err) defer d.Close() - found, _, err := d.Set(types.KeyValue{Key: "to be deleted", Value: 1}) + found, _, err := d.Get(types.KeyValue{Key: "to be deleted", Value: 1}) require.Nil(t, err) require.Equal(t, true, found) - key := "to be deleted" - value := types.KeyValue{Key: key, Value: 2} - err = d.Commit(map[string]types.KeyValue{key: value}) + err = d.Commit([]string{"to be deleted"}) require.NoError(t, err) - found, _, err = d.Set(types.KeyValue{Key: "to be deleted", Value: 2}) + found, _, err = d.Get(types.KeyValue{Key: "to be deleted", Value: 2}) require.Nil(t, err) require.Equal(t, false, found) require.Eventually(t, func() bool { - found, _, err = d.Set(types.KeyValue{Key: "to be deleted", Value: 3}) + found, _, err = d.Get(types.KeyValue{Key: "to be deleted", Value: 3}) require.Nil(t, err) return found }, 2*time.Second, 100*time.Millisecond) @@ -112,7 +110,6 @@ func Test_Dedup_ErrTxnTooBig(t *testing.T) { dbPath := os.TempDir() + "/dedup_test_errtxntoobig" defer os.RemoveAll(dbPath) - os.RemoveAll(dbPath) conf := config.New() t.Setenv("RUDDER_TMPDIR", dbPath) d, err := dedup.New(conf, stats.Default) @@ -120,11 +117,11 @@ func Test_Dedup_ErrTxnTooBig(t *testing.T) { defer d.Close() size := 105_000 - messages := map[string]types.KeyValue{} + messages := make([]string, size) for i := 0; i < size; i++ { key := uuid.New().String() - messages[key] = types.KeyValue{Key: key, Value: int64(i + 1)} - _, _, _ = d.Set(messages[key]) + messages[i] = key + _, _, _ = d.Get(types.KeyValue{Key: key, Value: int64(i + 1)}) } err = d.Commit(messages) require.NoError(t, err) @@ -148,22 +145,21 @@ func Benchmark_Dedup(b *testing.B) { batchSize := 1000 msgIDs := make([]types.KeyValue, batchSize) - messages := map[string]types.KeyValue{} - + keys := make([]string, 0) for i := 0; i < b.N; i++ { key := uuid.New().String() msgIDs[i%batchSize] = types.KeyValue{ Key: key, Value: int64(i + 1), } - messages[key] = msgIDs[i%batchSize] + keys = append(keys, key) if i%batchSize == batchSize-1 || i == b.N-1 { for _, msgID := range msgIDs[:i%batchSize] { - _, _, _ = d.Set(msgID) + _, _, _ = d.Get(msgID) } - err := d.Commit(messages) + err := d.Commit(keys) require.NoError(b, err) - messages = nil + keys = nil } } b.ReportMetric(float64(b.N), "events") diff --git a/services/dedup/mirrorBadger/mirrorBadger.go b/services/dedup/mirrorBadger/mirrorBadger.go index b1d7045e5f..f67b5476f9 100644 --- a/services/dedup/mirrorBadger/mirrorBadger.go +++ b/services/dedup/mirrorBadger/mirrorBadger.go @@ -30,11 +30,11 @@ func (mb *MirrorBadger) Close() { mb.badger.Close() } -func (mb *MirrorBadger) Set(kv types.KeyValue) (bool, int64, error) { - return mb.badger.Set(kv) +func (mb *MirrorBadger) Get(kv types.KeyValue) (bool, int64, error) { + return mb.badger.Get(kv) } -func (mb *MirrorBadger) Commit(keys map[string]types.KeyValue) error { +func (mb *MirrorBadger) Commit(keys []string) error { _ = mb.scylla.Commit(keys) return mb.badger.Commit(keys) } diff --git a/services/dedup/mirrorScylla/mirrorScylla.go b/services/dedup/mirrorScylla/mirrorScylla.go index 3a6b9e7062..f7117d4518 100644 --- a/services/dedup/mirrorScylla/mirrorScylla.go +++ b/services/dedup/mirrorScylla/mirrorScylla.go @@ -30,11 +30,11 @@ func (ms *MirrorScylla) Close() { ms.badger.Close() } -func (ms *MirrorScylla) Set(kv types.KeyValue) (bool, int64, error) { - return ms.scylla.Set(kv) +func (ms *MirrorScylla) Get(kv types.KeyValue) (bool, int64, error) { + return ms.scylla.Get(kv) } -func (ms *MirrorScylla) Commit(keys map[string]types.KeyValue) error { +func (ms *MirrorScylla) Commit(keys []string) error { _ = ms.badger.Commit(keys) return ms.scylla.Commit(keys) } diff --git a/services/dedup/scylla/scylla.go b/services/dedup/scylla/scylla.go index 7671b08c15..2f5ae76ecb 100644 --- a/services/dedup/scylla/scylla.go +++ b/services/dedup/scylla/scylla.go @@ -2,6 +2,7 @@ package scylla import ( "fmt" + "sync" "time" "github.com/gocql/gocql" @@ -13,19 +14,24 @@ import ( ) type ScyllaDB struct { - scylla *gocql.Session - conf *config.Config - stat stats.Stats - ttl int + scylla *gocql.Session + conf *config.Config + stat stats.Stats + // Time to live in seconds for an entry in the DB + ttl int + // Maximum number of keys to commit in a single batch batchSize int createTableMap map[string]struct{} + + cacheMu sync.Mutex + cache map[string]types.KeyValue } func (d *ScyllaDB) Close() { d.scylla.Close() } -func (d *ScyllaDB) Set(kv types.KeyValue) (bool, int64, error) { +func (d *ScyllaDB) Get(kv types.KeyValue) (bool, int64, error) { // Create the table if it doesn't exist if _, ok := d.createTableMap[kv.WorkspaceId]; !ok { err := d.scylla.Query(fmt.Sprintf("CREATE TABLE IF NOT EXISTS %s (id text PRIMARY KEY, value bigint) WITH bloom_filter_fp_chance = 0.005", kv.WorkspaceId)).Exec() @@ -35,24 +41,41 @@ func (d *ScyllaDB) Set(kv types.KeyValue) (bool, int64, error) { d.createTableMap[kv.WorkspaceId] = struct{}{} } + d.cacheMu.Lock() + defer d.cacheMu.Unlock() + // Check if the key exists in the cache + // This is essential if we get the same key multiple times in the same batch + // Since we are not committing the keys immediately, we need to keep track of the keys in the cache + if previous, found := d.cache[kv.Key]; found { + return false, previous.Value, nil + } + // Check if the key exists in the DB var value int64 err := d.scylla.Query(fmt.Sprintf("SELECT value FROM %s WHERE id = ?", kv.WorkspaceId), kv.Key).Scan(&value) if err != nil && err != gocql.ErrNotFound { return false, 0, err } - if err == gocql.ErrNotFound { - return true, 0, nil - } + d.cache[kv.Key] = kv return false, kv.Value, nil } -func (d *ScyllaDB) Commit(keys map[string]types.KeyValue) error { - keysList := lo.PartitionBy(lo.Values(keys), func(kv types.KeyValue) string { +func (d *ScyllaDB) Commit(keys []string) error { + d.cacheMu.Lock() + defer d.cacheMu.Unlock() + kvs := make([]types.KeyValue, len(keys)) + for i, key := range keys { + value, ok := d.cache[key] + if !ok { + return fmt.Errorf("key %v has not been previously set", key) + } + kvs[i] = types.KeyValue{Key: key, Value: value.Value, WorkspaceId: value.WorkspaceId} + } + keysList := lo.PartitionBy(kvs, func(kv types.KeyValue) string { return kv.WorkspaceId }) - for _, keys := range keysList { - batches := lo.Chunk(keys, d.batchSize) + for _, keysPerWorkspace := range keysList { + batches := lo.Chunk(keysPerWorkspace, d.batchSize) for _, batch := range batches { scyllaBatch := d.scylla.NewBatch(gocql.LoggedBatch) for _, key := range batch { @@ -64,6 +87,9 @@ func (d *ScyllaDB) Commit(keys map[string]types.KeyValue) error { if err := d.scylla.ExecuteBatch(scyllaBatch); err != nil { return err } + for _, key := range batch { + delete(d.cache, key.Key) + } } } return nil From 6f24adba823258b635ea0bb38c6ad879b6229ba7 Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Tue, 30 Jul 2024 12:11:01 +0530 Subject: [PATCH 08/16] chore: unit tests --- go.mod | 5 +- go.sum | 4 +- services/dedup/badger/badger.go | 22 +++-- services/dedup/dedup.go | 2 +- services/dedup/dedup_test.go | 141 ++++++++++++++++++++------------ services/dedup/scylla/scylla.go | 46 ++++++----- 6 files changed, 131 insertions(+), 89 deletions(-) diff --git a/go.mod b/go.mod index 0141a253fd..f3d2ff1ce8 100644 --- a/go.mod +++ b/go.mod @@ -78,7 +78,7 @@ require ( github.com/rudderlabs/analytics-go v3.3.3+incompatible github.com/rudderlabs/bing-ads-go-sdk v0.2.3 github.com/rudderlabs/compose-test v0.1.3 - github.com/rudderlabs/rudder-go-kit v0.35.1 + github.com/rudderlabs/rudder-go-kit v0.36.1 github.com/rudderlabs/rudder-observability-kit v0.0.3 github.com/rudderlabs/rudder-schemas v0.5.0 github.com/rudderlabs/sql-tunnels v0.1.7 @@ -115,8 +115,6 @@ require ( google.golang.org/protobuf v1.34.2 ) -require github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed // indirect - require ( cloud.google.com/go v0.115.0 // indirect cloud.google.com/go/auth v0.7.2 // indirect @@ -226,6 +224,7 @@ require ( github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect github.com/googleapis/gax-go/v2 v2.12.5 // indirect github.com/gsterjov/go-libsecret v0.0.0-20161001094733-a6f4afe4910c // indirect + github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed // indirect github.com/hamba/avro/v2 v2.22.2-0.20240625062549-66aad10411d9 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/go-cleanhttp v0.5.2 // indirect diff --git a/go.sum b/go.sum index ac949f3e38..3c1f10a187 100644 --- a/go.sum +++ b/go.sum @@ -1133,8 +1133,8 @@ github.com/rudderlabs/compose-test v0.1.3 h1:uyep6jDCIF737sfv4zIaMsKRQKX95IDz5Xb github.com/rudderlabs/compose-test v0.1.3/go.mod h1:tuvS1eQdSfwOYv1qwyVAcpdJxPLQXJgy5xGDd/9XmMg= github.com/rudderlabs/parquet-go v0.0.2 h1:ZXRdZdimB0PdJtmxeSSxfI0fDQ3kZjwzBxRi6Ut1J8k= github.com/rudderlabs/parquet-go v0.0.2/go.mod h1:g6guum7o8uhj/uNhunnt7bw5Vabu/goI5i21/3fnxWQ= -github.com/rudderlabs/rudder-go-kit v0.35.1 h1:BJRfo9xfly265pKO8WXaMAPAUbl4gtQPHod4lBvzRH8= -github.com/rudderlabs/rudder-go-kit v0.35.1/go.mod h1:B8jlxe/kaQ009iQnw2Ba8ZKmeXwwREZTRpsE9Nhq+qU= +github.com/rudderlabs/rudder-go-kit v0.36.1 h1:6yP1gVAhfPiIvNqJRSt8I7lOzNJCBw8a7BSjwpbcmUM= +github.com/rudderlabs/rudder-go-kit v0.36.1/go.mod h1:ujLsnvECNg013S6Wv8r24vyN/hkhv6De1ydugnPKy/o= github.com/rudderlabs/rudder-observability-kit v0.0.3 h1:vZtuZRkGX+6rjaeKtxxFE2YYP6QlmAcVcgecTOjvz+Q= github.com/rudderlabs/rudder-observability-kit v0.0.3/go.mod h1:6UjAh3H6rkE0fFLh7z8ZGQEQbKtUkRfhWOf/OUhfqW8= github.com/rudderlabs/rudder-schemas v0.5.0 h1:+140Amou92sB4ZXwsX3jgRIdjNfERJJgYOx0bcUkxzg= diff --git a/services/dedup/badger/badger.go b/services/dedup/badger/badger.go index a61dd9fe72..6923dd976d 100644 --- a/services/dedup/badger/badger.go +++ b/services/dedup/badger/badger.go @@ -75,12 +75,7 @@ func NewBadgerDB(conf *config.Config, stats stats.Stats, path string) *Dedup { func (d *BadgerDB) Get(key string) (int64, bool, error) { var payloadSize int64 var found bool - var err error - err = d.init() - if err != nil { - return 0, false, err - } - err = d.badgerDB.View(func(txn *badger.Txn) error { + err := d.badgerDB.View(func(txn *badger.Txn) error { item, err := txn.Get([]byte(key)) if err != nil { return err @@ -98,10 +93,6 @@ func (d *BadgerDB) Get(key string) (int64, bool, error) { } func (d *BadgerDB) Set(kvs []types.KeyValue) error { - err := d.init() - if err != nil { - return err - } txn := d.badgerDB.NewTransaction(true) for _, message := range kvs { value := strconv.FormatInt(message.Value, 10) @@ -169,7 +160,6 @@ func (d *BadgerDB) gcLoop() { d.stats.NewTaggedStat("badger_db_size", stats.GaugeType, stats.Tags{"name": statName, "type": "lsm"}).Gauge(lsmSize) d.stats.NewTaggedStat("badger_db_size", stats.GaugeType, stats.Tags{"name": statName, "type": "vlog"}).Gauge(vlogSize) d.stats.NewTaggedStat("badger_db_size", stats.GaugeType, stats.Tags{"name": statName, "type": "total"}).Gauge(totSize) - } } @@ -182,6 +172,10 @@ type Dedup struct { func (d *Dedup) Get(kv types.KeyValue) (bool, int64, error) { d.cacheMu.Lock() defer d.cacheMu.Unlock() + err := d.badgerDB.init() + if err != nil { + return false, 0, err + } if previous, found := d.cache[kv.Key]; found { return false, previous, nil } @@ -198,6 +192,10 @@ func (d *Dedup) Get(kv types.KeyValue) (bool, int64, error) { func (d *Dedup) Commit(keys []string) error { d.cacheMu.Lock() defer d.cacheMu.Unlock() + err := d.badgerDB.init() + if err != nil { + return err + } kvs := make([]types.KeyValue, len(keys)) for i, key := range keys { value, ok := d.cache[key] @@ -207,7 +205,7 @@ func (d *Dedup) Commit(keys []string) error { kvs[i] = types.KeyValue{Key: key, Value: value} } - err := d.badgerDB.Set(kvs) + err = d.badgerDB.Set(kvs) if err == nil { for _, kv := range kvs { delete(d.cache, kv.Key) diff --git a/services/dedup/dedup.go b/services/dedup/dedup.go index c602d471ab..4805ab11d6 100644 --- a/services/dedup/dedup.go +++ b/services/dedup/dedup.go @@ -23,7 +23,7 @@ const ( // New creates a new deduplication service. The service needs to be closed after use. func New(conf *config.Config, stats stats.Stats) (Dedup, error) { - mode := Mode(config.GetString("Dedup.Mode", string(Badger))) + mode := Mode(conf.GetString("Dedup.Mode", string(Badger))) switch mode { case Badger: return badger.NewBadgerDB(conf, stats, badger.DefaultPath()), nil diff --git a/services/dedup/dedup_test.go b/services/dedup/dedup_test.go index 82808c9db7..c6a8aee66b 100644 --- a/services/dedup/dedup_test.go +++ b/services/dedup/dedup_test.go @@ -4,15 +4,19 @@ import ( "os" "os/exec" "path" + "strings" "testing" "time" + "github.com/ory/dockertest/v3" + "github.com/google/uuid" "github.com/stretchr/testify/require" "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats" + "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource/scylla" "github.com/rudderlabs/rudder-go-kit/testhelper/rand" "github.com/rudderlabs/rudder-server/services/dedup" @@ -21,53 +25,85 @@ import ( ) func Test_Dedup(t *testing.T) { - config.Reset() - logger.Reset() - misc.Init() - - dbPath := os.TempDir() + "/dedup_test" - defer func() { _ = os.RemoveAll(dbPath) }() - _ = os.RemoveAll(dbPath) - conf := config.New() - t.Setenv("RUDDER_TMPDIR", dbPath) - d, err := dedup.New(conf, stats.Default) - require.Nil(t, err) - defer d.Close() - - t.Run("if message id is not present in cache and badger db", func(t *testing.T) { - found, _, err := d.Get(types.KeyValue{Key: "a", Value: 1}) - require.Nil(t, err) - require.Equal(t, true, found) - - // Checking it again should give us the previous value from the cache - found, value, err := d.Get(types.KeyValue{Key: "a", Value: 2}) - require.Nil(t, err) - require.Equal(t, false, found) - require.Equal(t, int64(1), value) - }) - - t.Run("if message is committed, previous value should always return", func(t *testing.T) { - found, _, err := d.Get(types.KeyValue{Key: "b", Value: 1}) - require.Nil(t, err) - require.Equal(t, true, found) - - err = d.Commit([]string{"a"}) - require.NoError(t, err) - - found, value, err := d.Get(types.KeyValue{Key: "b", Value: 2}) - require.Nil(t, err) - require.Equal(t, false, found) - require.Equal(t, int64(1), value) - }) - - t.Run("committing a messageid not present in cache", func(t *testing.T) { - found, _, err := d.Get(types.KeyValue{Key: "c", Value: 1}) - require.Nil(t, err) - require.Equal(t, true, found) + testCases := []struct { + name string + }{ + { + name: "Badger", + }, + { + name: "Scylla", + }, + { + name: "MirrorScylla", + }, + { + name: "MirrorBadger", + }, + { + name: "Random", + }, + } - err = d.Commit([]string{"d"}) - require.NotNil(t, err) - }) + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + config.Reset() + logger.Reset() + misc.Init() + + dbPath := os.TempDir() + "/dedup_test" + defer func() { _ = os.RemoveAll(dbPath) }() + _ = os.RemoveAll(dbPath) + conf := config.New() + t.Setenv("RUDDER_TMPDIR", dbPath) + pool, err := dockertest.NewPool("") + require.NoError(t, err) + keySpace := strings.ToUpper(rand.String(5)) + resource, err := scylla.Setup(pool, t, scylla.WithKeyspace(keySpace)) + require.NoError(t, err) + conf.Set("Scylla.Hosts", resource.URL) + conf.Set("Scylla.Keyspace", keySpace) + conf.Set("Dedup.Mode", tc.name) + d, err := dedup.New(conf, stats.Default) + require.Nil(t, err) + defer d.Close() + + t.Run("if message id is not present in cache and badger db", func(t *testing.T) { + found, _, err := d.Get(types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"}) + require.Nil(t, err) + require.Equal(t, true, found) + + // Checking it again should give us the previous value from the cache + found, value, err := d.Get(types.KeyValue{Key: "a", Value: 2, WorkspaceId: "test"}) + require.Nil(t, err) + require.Equal(t, false, found) + require.Equal(t, int64(1), value) + }) + + t.Run("if message is committed, previous value should always return", func(t *testing.T) { + found, _, err := d.Get(types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"}) + require.Nil(t, err) + require.Equal(t, true, found) + + err = d.Commit([]string{"a"}) + require.NoError(t, err) + + found, value, err := d.Get(types.KeyValue{Key: "b", Value: 2, WorkspaceId: "test"}) + require.Nil(t, err) + require.Equal(t, false, found) + require.Equal(t, int64(1), value) + }) + + t.Run("committing a messageid not present in cache", func(t *testing.T) { + found, _, err := d.Get(types.KeyValue{Key: "c", Value: 1, WorkspaceId: "test"}) + require.Nil(t, err) + require.Equal(t, true, found) + + err = d.Commit([]string{"d"}) + require.NotNil(t, err) + }) + }) + } } func Test_Dedup_Window(t *testing.T) { @@ -85,19 +121,19 @@ func Test_Dedup_Window(t *testing.T) { require.Nil(t, err) defer d.Close() - found, _, err := d.Get(types.KeyValue{Key: "to be deleted", Value: 1}) + found, _, err := d.Get(types.KeyValue{Key: "to be deleted", Value: 1, WorkspaceId: "test"}) require.Nil(t, err) require.Equal(t, true, found) err = d.Commit([]string{"to be deleted"}) require.NoError(t, err) - found, _, err = d.Get(types.KeyValue{Key: "to be deleted", Value: 2}) + found, _, err = d.Get(types.KeyValue{Key: "to be deleted", Value: 2, WorkspaceId: "test"}) require.Nil(t, err) require.Equal(t, false, found) require.Eventually(t, func() bool { - found, _, err = d.Get(types.KeyValue{Key: "to be deleted", Value: 3}) + found, _, err = d.Get(types.KeyValue{Key: "to be deleted", Value: 3, WorkspaceId: "test"}) require.Nil(t, err) return found }, 2*time.Second, 100*time.Millisecond) @@ -121,7 +157,7 @@ func Test_Dedup_ErrTxnTooBig(t *testing.T) { for i := 0; i < size; i++ { key := uuid.New().String() messages[i] = key - _, _, _ = d.Get(types.KeyValue{Key: key, Value: int64(i + 1)}) + _, _, _ = d.Get(types.KeyValue{Key: key, Value: int64(i + 1), WorkspaceId: "test"}) } err = d.Commit(messages) require.NoError(t, err) @@ -149,8 +185,9 @@ func Benchmark_Dedup(b *testing.B) { for i := 0; i < b.N; i++ { key := uuid.New().String() msgIDs[i%batchSize] = types.KeyValue{ - Key: key, - Value: int64(i + 1), + Key: key, + Value: int64(i + 1), + WorkspaceId: "test", } keys = append(keys, key) if i%batchSize == batchSize-1 || i == b.N-1 { diff --git a/services/dedup/scylla/scylla.go b/services/dedup/scylla/scylla.go index 2f5ae76ecb..b6bcc064fd 100644 --- a/services/dedup/scylla/scylla.go +++ b/services/dedup/scylla/scylla.go @@ -23,8 +23,9 @@ type ScyllaDB struct { batchSize int createTableMap map[string]struct{} - cacheMu sync.Mutex - cache map[string]types.KeyValue + cacheMu sync.Mutex + cache map[string]types.KeyValue + keyspace string } func (d *ScyllaDB) Close() { @@ -34,9 +35,10 @@ func (d *ScyllaDB) Close() { func (d *ScyllaDB) Get(kv types.KeyValue) (bool, int64, error) { // Create the table if it doesn't exist if _, ok := d.createTableMap[kv.WorkspaceId]; !ok { - err := d.scylla.Query(fmt.Sprintf("CREATE TABLE IF NOT EXISTS %s (id text PRIMARY KEY, value bigint) WITH bloom_filter_fp_chance = 0.005", kv.WorkspaceId)).Exec() + query := fmt.Sprintf("CREATE TABLE IF NOT EXISTS %s.%s (id text PRIMARY KEY,size bigint) WITH bloom_filter_fp_chance = 0.005;", d.keyspace, kv.WorkspaceId) + err := d.scylla.Query(query).Exec() if err != nil { - return false, 0, err + return false, 0, fmt.Errorf("error creating table %s: %v", kv.WorkspaceId, err) } d.createTableMap[kv.WorkspaceId] = struct{}{} } @@ -52,12 +54,15 @@ func (d *ScyllaDB) Get(kv types.KeyValue) (bool, int64, error) { // Check if the key exists in the DB var value int64 - err := d.scylla.Query(fmt.Sprintf("SELECT value FROM %s WHERE id = ?", kv.WorkspaceId), kv.Key).Scan(&value) + err := d.scylla.Query(fmt.Sprintf("SELECT size FROM %s.%s WHERE id = ?", d.keyspace, kv.WorkspaceId), kv.Key).Scan(&value) if err != nil && err != gocql.ErrNotFound { - return false, 0, err + return false, 0, fmt.Errorf("error getting key %s: %v", kv.Key, err) } - d.cache[kv.Key] = kv - return false, kv.Value, nil + found := !(err == gocql.ErrNotFound) + if !found { + d.cache[kv.Key] = kv + } + return !found, kv.Value, nil } func (d *ScyllaDB) Commit(keys []string) error { @@ -80,12 +85,12 @@ func (d *ScyllaDB) Commit(keys []string) error { scyllaBatch := d.scylla.NewBatch(gocql.LoggedBatch) for _, key := range batch { scyllaBatch.Entries = append(scyllaBatch.Entries, gocql.BatchEntry{ - Stmt: fmt.Sprintf("INSERT INTO %s (id) VALUES (?,?) USING TTL %d", key.WorkspaceId, d.ttl), + Stmt: fmt.Sprintf("INSERT INTO %s.%s (id,size) VALUES (?,?) USING TTL %d", d.keyspace, key.WorkspaceId, d.ttl), Args: []interface{}{key.Key, key.Value}, }) } if err := d.scylla.ExecuteBatch(scyllaBatch); err != nil { - return err + return fmt.Errorf("error committing keys: %v", err) } for _, key := range batch { delete(d.cache, key.Key) @@ -98,9 +103,8 @@ func (d *ScyllaDB) Commit(keys []string) error { func New(conf *config.Config, stats stats.Stats) (*ScyllaDB, error) { cluster := gocql.NewCluster(conf.GetString("Scylla.Hosts", "localhost:9042")) cluster.Consistency = gocql.Quorum - cluster.Keyspace = config.GetString("Scylla.Keyspace", "dedup") cluster.RetryPolicy = &gocql.ExponentialBackoffRetryPolicy{ - NumRetries: config.GetInt("Scylla.NumRetries", 3), + NumRetries: conf.GetInt("Scylla.NumRetries", 3), Min: conf.GetDuration("Scylla.MinRetry", 100, time.Millisecond), Max: conf.GetDuration("Scylla.MaxRetry", 2000, time.Second), } @@ -109,11 +113,15 @@ func New(conf *config.Config, stats stats.Stats) (*ScyllaDB, error) { if err != nil { return nil, err } - return &ScyllaDB{ - scylla: session, - conf: conf, - stat: stats, - ttl: conf.GetInt("Scylla.TTL", 1209600), // TTL is defaulted to seconds - batchSize: conf.GetInt("Scylla.BatchSize", 100), - }, nil + scylla := &ScyllaDB{ + scylla: session, + conf: conf, + keyspace: conf.GetString("Scylla.Keyspace", "rudder"), + stat: stats, + ttl: conf.GetInt("Scylla.TTL", 1209600), // TTL is defaulted to seconds + batchSize: conf.GetInt("Scylla.BatchSize", 100), + cache: make(map[string]types.KeyValue), + createTableMap: make(map[string]struct{}), + } + return scylla, nil } From d1470a71c915638ac9648d431d45f6596009f158 Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Fri, 2 Aug 2024 11:43:20 +0530 Subject: [PATCH 09/16] chore: debug test --- services/dedup/dedup_test.go | 1 + services/dedup/scylla/scylla.go | 1 + 2 files changed, 2 insertions(+) diff --git a/services/dedup/dedup_test.go b/services/dedup/dedup_test.go index c6a8aee66b..7f57810b51 100644 --- a/services/dedup/dedup_test.go +++ b/services/dedup/dedup_test.go @@ -64,6 +64,7 @@ func Test_Dedup(t *testing.T) { conf.Set("Scylla.Hosts", resource.URL) conf.Set("Scylla.Keyspace", keySpace) conf.Set("Dedup.Mode", tc.name) + conf.Set("Scylla.DisableInitialHostLookup", "true") d, err := dedup.New(conf, stats.Default) require.Nil(t, err) defer d.Close() diff --git a/services/dedup/scylla/scylla.go b/services/dedup/scylla/scylla.go index b6bcc064fd..0dea313f1f 100644 --- a/services/dedup/scylla/scylla.go +++ b/services/dedup/scylla/scylla.go @@ -109,6 +109,7 @@ func New(conf *config.Config, stats stats.Stats) (*ScyllaDB, error) { Max: conf.GetDuration("Scylla.MaxRetry", 2000, time.Second), } cluster.Timeout = conf.GetDuration("Scylla.Timeout", 10, time.Second) + cluster.DisableInitialHostLookup = conf.GetBool("Scylla.DisableInitialHostLookup", false) session, err := cluster.CreateSession() if err != nil { return nil, err From 41015c6fed307fcf3e11eac5fee04057e582428b Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Mon, 5 Aug 2024 12:30:27 +0530 Subject: [PATCH 10/16] chore: run on ubuntu latest --- .github/workflows/tests.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 26a7ad97e0..5e8b5ec1bd 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -12,7 +12,7 @@ concurrency: jobs: integration: name: Integration - runs-on: 'ubuntu-20.04' + runs-on: ubuntu-latest strategy: matrix: FEATURES: [ oss ,enterprise ] @@ -42,7 +42,7 @@ jobs: RSERVER_ENABLE_MULTITENANCY: false warehouse-integration: name: Warehouse Integration - runs-on: 'ubuntu-20.04' + runs-on: ubuntu-latest strategy: fail-fast: false matrix: @@ -107,7 +107,7 @@ jobs: path: coverage.txt unit: name: Unit - runs-on: 'ubuntu-20.04' + runs-on: ubuntu-latest steps: - name: Disable IPv6 (temporary fix) run: | @@ -127,7 +127,7 @@ jobs: path: coverage.txt package-unit: name: Package Unit - runs-on: 'ubuntu-20.04' + runs-on: ubuntu-latest strategy: fail-fast: false matrix: @@ -186,7 +186,7 @@ jobs: path: coverage.txt coverage: name: Coverage - runs-on: 'ubuntu-20.04' + runs-on: ubuntu-latest needs: - warehouse-integration - unit From 8a1aca735ebb010af053e063541be982db68ed6d Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Mon, 5 Aug 2024 14:33:38 +0530 Subject: [PATCH 11/16] chore: disable ipv6 --- .github/workflows/tests.yaml | 16 ---------------- services/dedup/scylla/scylla.go | 32 ++++++++++++++++++++------------ 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 5e8b5ec1bd..02d424b70c 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -17,10 +17,6 @@ jobs: matrix: FEATURES: [ oss ,enterprise ] steps: - - name: Disable IPv6 (temporary fix) - run: | - sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1 - sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1 - name: Checkout uses: actions/checkout@v4 - uses: actions/setup-go@v5 @@ -67,10 +63,6 @@ jobs: - package: warehouse/integrations/snowflake destination: snowflake steps: - - name: Disable IPv6 (temporary fix) - run: | - sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1 - sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1 - name: Checkout uses: actions/checkout@v4 - name: Setup Go @@ -109,10 +101,6 @@ jobs: name: Unit runs-on: ubuntu-latest steps: - - name: Disable IPv6 (temporary fix) - run: | - sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1 - sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1 - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: @@ -154,10 +142,6 @@ jobs: exclude: services/rsources steps: - - name: Disable IPv6 (temporary fix) - run: | - sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1 - sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1 - uses: actions/checkout@v4 - uses: actions/setup-go@v5 with: diff --git a/services/dedup/scylla/scylla.go b/services/dedup/scylla/scylla.go index b6bcc064fd..d6a412edb0 100644 --- a/services/dedup/scylla/scylla.go +++ b/services/dedup/scylla/scylla.go @@ -21,7 +21,8 @@ type ScyllaDB struct { ttl int // Maximum number of keys to commit in a single batch batchSize int - createTableMap map[string]struct{} + createTableMu sync.Mutex + createTableMap map[string]*sync.Once cacheMu sync.Mutex cache map[string]types.KeyValue @@ -34,13 +35,20 @@ func (d *ScyllaDB) Close() { func (d *ScyllaDB) Get(kv types.KeyValue) (bool, int64, error) { // Create the table if it doesn't exist - if _, ok := d.createTableMap[kv.WorkspaceId]; !ok { + var err error + d.createTableMu.Lock() + once, found := d.createTableMap[kv.WorkspaceId] + if !found { + d.createTableMap[kv.WorkspaceId] = &sync.Once{} + once = d.createTableMap[kv.WorkspaceId] + } + d.createTableMu.Unlock() + once.Do(func() { query := fmt.Sprintf("CREATE TABLE IF NOT EXISTS %s.%s (id text PRIMARY KEY,size bigint) WITH bloom_filter_fp_chance = 0.005;", d.keyspace, kv.WorkspaceId) - err := d.scylla.Query(query).Exec() - if err != nil { - return false, 0, fmt.Errorf("error creating table %s: %v", kv.WorkspaceId, err) - } - d.createTableMap[kv.WorkspaceId] = struct{}{} + err = d.scylla.Query(query).Exec() + }) + if err != nil { + return false, 0, fmt.Errorf("error creating table %s: %v", kv.WorkspaceId, err) } d.cacheMu.Lock() @@ -54,15 +62,15 @@ func (d *ScyllaDB) Get(kv types.KeyValue) (bool, int64, error) { // Check if the key exists in the DB var value int64 - err := d.scylla.Query(fmt.Sprintf("SELECT size FROM %s.%s WHERE id = ?", d.keyspace, kv.WorkspaceId), kv.Key).Scan(&value) + err = d.scylla.Query(fmt.Sprintf("SELECT size FROM %s.%s WHERE id = ?", d.keyspace, kv.WorkspaceId), kv.Key).Scan(&value) if err != nil && err != gocql.ErrNotFound { return false, 0, fmt.Errorf("error getting key %s: %v", kv.Key, err) } - found := !(err == gocql.ErrNotFound) - if !found { + exists := !(err == gocql.ErrNotFound) + if !exists { d.cache[kv.Key] = kv } - return !found, kv.Value, nil + return !exists, kv.Value, nil } func (d *ScyllaDB) Commit(keys []string) error { @@ -121,7 +129,7 @@ func New(conf *config.Config, stats stats.Stats) (*ScyllaDB, error) { ttl: conf.GetInt("Scylla.TTL", 1209600), // TTL is defaulted to seconds batchSize: conf.GetInt("Scylla.BatchSize", 100), cache: make(map[string]types.KeyValue), - createTableMap: make(map[string]struct{}), + createTableMap: make(map[string]*sync.Once), } return scylla, nil } From 1933c5691cb13f23d0fcec78048ef040ba14411d Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Mon, 5 Aug 2024 20:02:31 +0530 Subject: [PATCH 12/16] chore: add more tests --- services/dedup/badger/badger_test.go | 51 ++++++++++++ .../dedup/mirrorBadger/mirrorBadger_test.go | 63 +++++++++++++++ .../dedup/mirrorScylla/mirrorScylla_test.go | 77 +++++++++++++++++++ services/dedup/scylla/scylla_test.go | 66 ++++++++++++++++ 4 files changed, 257 insertions(+) create mode 100644 services/dedup/badger/badger_test.go create mode 100644 services/dedup/mirrorBadger/mirrorBadger_test.go create mode 100644 services/dedup/mirrorScylla/mirrorScylla_test.go create mode 100644 services/dedup/scylla/scylla_test.go diff --git a/services/dedup/badger/badger_test.go b/services/dedup/badger/badger_test.go new file mode 100644 index 0000000000..05d7521a8b --- /dev/null +++ b/services/dedup/badger/badger_test.go @@ -0,0 +1,51 @@ +package badger + +import ( + "os" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/rudderlabs/rudder-go-kit/config" + "github.com/rudderlabs/rudder-go-kit/logger" + "github.com/rudderlabs/rudder-go-kit/stats" + "github.com/rudderlabs/rudder-server/services/dedup/types" + "github.com/rudderlabs/rudder-server/utils/misc" +) + +func Test_Badger(t *testing.T) { + config.Reset() + logger.Reset() + misc.Init() + + dbPath := os.TempDir() + "/dedup_test" + defer func() { _ = os.RemoveAll(dbPath) }() + _ = os.RemoveAll(dbPath) + conf := config.New() + t.Setenv("RUDDER_TMPDIR", dbPath) + badger := NewBadgerDB(conf, stats.NOP, DefaultPath()) + require.NotNil(t, badger) + defer badger.Close() + t.Run("Same messageID should be deduped from badger", func(t *testing.T) { + key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} + key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} + found, _, err := badger.Get(key1) + require.Nil(t, err) + require.True(t, found) + err = badger.Commit([]string{key1.Key}) + require.NoError(t, err) + found, _, err = badger.Get(key2) + require.Nil(t, err) + require.False(t, found) + }) + t.Run("Same messageID should be deduped from cache", func(t *testing.T) { + key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} + key2 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} + found, _, err := badger.Get(key1) + require.Nil(t, err) + require.True(t, found) + found, _, err = badger.Get(key2) + require.Nil(t, err) + require.False(t, found) + }) +} diff --git a/services/dedup/mirrorBadger/mirrorBadger_test.go b/services/dedup/mirrorBadger/mirrorBadger_test.go new file mode 100644 index 0000000000..0159f230ec --- /dev/null +++ b/services/dedup/mirrorBadger/mirrorBadger_test.go @@ -0,0 +1,63 @@ +package mirrorBadger + +import ( + "os" + "strings" + "testing" + + "github.com/ory/dockertest/v3" + "github.com/stretchr/testify/require" + + "github.com/rudderlabs/rudder-go-kit/config" + "github.com/rudderlabs/rudder-go-kit/logger" + "github.com/rudderlabs/rudder-go-kit/stats" + "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource/scylla" + "github.com/rudderlabs/rudder-go-kit/testhelper/rand" + "github.com/rudderlabs/rudder-server/services/dedup/types" + "github.com/rudderlabs/rudder-server/utils/misc" +) + +func Test_MirrorBadger(t *testing.T) { + config.Reset() + logger.Reset() + misc.Init() + + dbPath := os.TempDir() + "/dedup_test" + defer func() { _ = os.RemoveAll(dbPath) }() + _ = os.RemoveAll(dbPath) + conf := config.New() + t.Setenv("RUDDER_TMPDIR", dbPath) + pool, err := dockertest.NewPool("") + require.NoError(t, err) + keySpace := strings.ToUpper(rand.String(5)) + resource, err := scylla.Setup(pool, t, scylla.WithKeyspace(keySpace)) + require.NoError(t, err) + conf.Set("Scylla.Hosts", resource.URL) + conf.Set("Scylla.Keyspace", keySpace) + mirrorBadger, err := NewMirrorBadger(conf, stats.NOP) + require.Nil(t, err) + require.NotNil(t, mirrorBadger) + defer mirrorBadger.Close() + t.Run("Same messageID should be deduped from badger", func(t *testing.T) { + key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} + key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} + found, _, err := mirrorBadger.Get(key1) + require.Nil(t, err) + require.True(t, found) + err = mirrorBadger.Commit([]string{key1.Key}) + require.NoError(t, err) + found, _, err = mirrorBadger.Get(key2) + require.Nil(t, err) + require.False(t, found) + }) + t.Run("Same messageID should be deduped from cache", func(t *testing.T) { + key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} + key2 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} + found, _, err := mirrorBadger.Get(key1) + require.Nil(t, err) + require.True(t, found) + found, _, err = mirrorBadger.Get(key2) + require.Nil(t, err) + require.False(t, found) + }) +} diff --git a/services/dedup/mirrorScylla/mirrorScylla_test.go b/services/dedup/mirrorScylla/mirrorScylla_test.go new file mode 100644 index 0000000000..57b041429c --- /dev/null +++ b/services/dedup/mirrorScylla/mirrorScylla_test.go @@ -0,0 +1,77 @@ +package mirrorScylla + +import ( + "os" + "strings" + "testing" + + "github.com/ory/dockertest/v3" + "github.com/stretchr/testify/require" + + "github.com/rudderlabs/rudder-go-kit/config" + "github.com/rudderlabs/rudder-go-kit/logger" + "github.com/rudderlabs/rudder-go-kit/stats" + "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource/scylla" + "github.com/rudderlabs/rudder-go-kit/testhelper/rand" + "github.com/rudderlabs/rudder-server/services/dedup/types" + "github.com/rudderlabs/rudder-server/utils/misc" +) + +func Test_MirrorBadger(t *testing.T) { + config.Reset() + logger.Reset() + misc.Init() + + dbPath := os.TempDir() + "/dedup_test" + defer func() { _ = os.RemoveAll(dbPath) }() + _ = os.RemoveAll(dbPath) + conf := config.New() + t.Setenv("RUDDER_TMPDIR", dbPath) + pool, err := dockertest.NewPool("") + require.NoError(t, err) + keySpace := strings.ToUpper(rand.String(5)) + resource, err := scylla.Setup(pool, t, scylla.WithKeyspace(keySpace)) + require.NoError(t, err) + conf.Set("Scylla.Hosts", resource.URL) + conf.Set("Scylla.Keyspace", keySpace) + mirrorScylla, err := NewMirrorScylla(conf, stats.NOP) + require.Nil(t, err) + require.NotNil(t, mirrorScylla) + defer mirrorScylla.Close() + t.Run("Same messageID should not be deduped for different workspace", func(t *testing.T) { + key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test1"} + key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test2"} + found, _, err := mirrorScylla.Get(key1) + require.Nil(t, err) + require.True(t, found) + err = mirrorScylla.Commit([]string{key1.Key}) + require.NoError(t, err) + found, _, err = mirrorScylla.Get(key2) + require.Nil(t, err) + require.True(t, found) + err = mirrorScylla.Commit([]string{key2.Key}) + require.NoError(t, err) + }) + t.Run("Same messageID should be deduped for same workspace", func(t *testing.T) { + key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} + key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} + found, _, err := mirrorScylla.Get(key1) + require.Nil(t, err) + require.True(t, found) + err = mirrorScylla.Commit([]string{key1.Key}) + require.NoError(t, err) + found, _, err = mirrorScylla.Get(key2) + require.Nil(t, err) + require.False(t, found) + }) + t.Run("Same messageID should be deduped for same workspace from cache", func(t *testing.T) { + key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} + key2 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} + found, _, err := mirrorScylla.Get(key1) + require.Nil(t, err) + require.True(t, found) + found, _, err = mirrorScylla.Get(key2) + require.Nil(t, err) + require.False(t, found) + }) +} diff --git a/services/dedup/scylla/scylla_test.go b/services/dedup/scylla/scylla_test.go new file mode 100644 index 0000000000..d695a8c366 --- /dev/null +++ b/services/dedup/scylla/scylla_test.go @@ -0,0 +1,66 @@ +package scylla + +import ( + "testing" + + "github.com/ory/dockertest/v3" + "github.com/stretchr/testify/require" + + "github.com/rudderlabs/rudder-go-kit/config" + "github.com/rudderlabs/rudder-go-kit/stats" + "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource/scylla" + "github.com/rudderlabs/rudder-go-kit/testhelper/rand" + "github.com/rudderlabs/rudder-server/services/dedup/types" +) + +func Test_Scylla(t *testing.T) { + pool, err := dockertest.NewPool("") + require.NoError(t, err) + keySpace := rand.String(5) + scyllaContainer, err := scylla.Setup(pool, t, scylla.WithKeyspace(keySpace)) + require.NoError(t, err) + require.NotNil(t, scyllaContainer) + conf := config.New() + conf.Set("Scylla.Hosts", scyllaContainer.URL) + conf.Set("Scylla.Keyspace", keySpace) + scylla, err := New(conf, stats.NOP) + require.NoError(t, err) + require.NotNil(t, scylla) + defer scylla.Close() + t.Run("Same messageID should not be deduped for different workspace", func(t *testing.T) { + key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test1"} + key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test2"} + found, _, err := scylla.Get(key1) + require.Nil(t, err) + require.True(t, found) + err = scylla.Commit([]string{key1.Key}) + require.NoError(t, err) + found, _, err = scylla.Get(key2) + require.Nil(t, err) + require.True(t, found) + err = scylla.Commit([]string{key2.Key}) + require.NoError(t, err) + }) + t.Run("Same messageID should be deduped for same workspace", func(t *testing.T) { + key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} + key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} + found, _, err := scylla.Get(key1) + require.Nil(t, err) + require.True(t, found) + err = scylla.Commit([]string{key1.Key}) + require.NoError(t, err) + found, _, err = scylla.Get(key2) + require.Nil(t, err) + require.False(t, found) + }) + t.Run("Same messageID should be deduped for same workspace from cache", func(t *testing.T) { + key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} + key2 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} + found, _, err := scylla.Get(key1) + require.Nil(t, err) + require.True(t, found) + found, _, err = scylla.Get(key2) + require.Nil(t, err) + require.False(t, found) + }) +} From 69f170e48b7e82de3aa19db6fb27085b36bea110 Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Thu, 8 Aug 2024 15:45:53 +0530 Subject: [PATCH 13/16] chore: review comments --- services/dedup/badger/badger.go | 36 ++++++++++++++++++++++----------- services/dedup/scylla/scylla.go | 8 ++++++-- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/services/dedup/badger/badger.go b/services/dedup/badger/badger.go index 6923dd976d..c776506d91 100644 --- a/services/dedup/badger/badger.go +++ b/services/dedup/badger/badger.go @@ -170,48 +170,60 @@ type Dedup struct { } func (d *Dedup) Get(kv types.KeyValue) (bool, int64, error) { - d.cacheMu.Lock() - defer d.cacheMu.Unlock() err := d.badgerDB.init() if err != nil { return false, 0, err } - if previous, found := d.cache[kv.Key]; found { + + d.cacheMu.Lock() + previous, found := d.cache[kv.Key] + d.cacheMu.Unlock() + if found { return false, previous, nil } - previous, found, err := d.badgerDB.Get(kv.Key) + + previous, found, err = d.badgerDB.Get(kv.Key) if err != nil { return false, 0, err } - if !found { + + d.cacheMu.Lock() + defer d.cacheMu.Unlock() + if !found { // still not in the cache, but it's in the DB so let's refresh the cache d.cache[kv.Key] = kv.Value } + return !found, previous, nil } func (d *Dedup) Commit(keys []string) error { - d.cacheMu.Lock() - defer d.cacheMu.Unlock() err := d.badgerDB.init() if err != nil { return err } kvs := make([]types.KeyValue, len(keys)) + d.cacheMu.Lock() for i, key := range keys { value, ok := d.cache[key] if !ok { + d.cacheMu.Unlock() return fmt.Errorf("key %v has not been previously set", key) } kvs[i] = types.KeyValue{Key: key, Value: value} } + d.cacheMu.Unlock() err = d.badgerDB.Set(kvs) - if err == nil { - for _, kv := range kvs { - delete(d.cache, kv.Key) - } + if err != nil { + return err } - return err + + d.cacheMu.Lock() + defer d.cacheMu.Unlock() + for _, kv := range kvs { + delete(d.cache, kv.Key) + } + return nil } func (d *Dedup) Close() { diff --git a/services/dedup/scylla/scylla.go b/services/dedup/scylla/scylla.go index d6a412edb0..10e80d98c6 100644 --- a/services/dedup/scylla/scylla.go +++ b/services/dedup/scylla/scylla.go @@ -1,6 +1,7 @@ package scylla import ( + "errors" "fmt" "sync" "time" @@ -63,7 +64,7 @@ func (d *ScyllaDB) Get(kv types.KeyValue) (bool, int64, error) { // Check if the key exists in the DB var value int64 err = d.scylla.Query(fmt.Sprintf("SELECT size FROM %s.%s WHERE id = ?", d.keyspace, kv.WorkspaceId), kv.Key).Scan(&value) - if err != nil && err != gocql.ErrNotFound { + if err != nil && !errors.Is(err, gocql.ErrNotFound) { return false, 0, fmt.Errorf("error getting key %s: %v", kv.Key, err) } exists := !(err == gocql.ErrNotFound) @@ -75,15 +76,16 @@ func (d *ScyllaDB) Get(kv types.KeyValue) (bool, int64, error) { func (d *ScyllaDB) Commit(keys []string) error { d.cacheMu.Lock() - defer d.cacheMu.Unlock() kvs := make([]types.KeyValue, len(keys)) for i, key := range keys { value, ok := d.cache[key] if !ok { + d.cacheMu.Unlock() return fmt.Errorf("key %v has not been previously set", key) } kvs[i] = types.KeyValue{Key: key, Value: value.Value, WorkspaceId: value.WorkspaceId} } + d.cacheMu.Unlock() keysList := lo.PartitionBy(kvs, func(kv types.KeyValue) string { return kv.WorkspaceId }) @@ -100,9 +102,11 @@ func (d *ScyllaDB) Commit(keys []string) error { if err := d.scylla.ExecuteBatch(scyllaBatch); err != nil { return fmt.Errorf("error committing keys: %v", err) } + d.cacheMu.Lock() for _, key := range batch { delete(d.cache, key.Key) } + d.cacheMu.Unlock() } } return nil From 4edc80d59cd495d54ca4f73847cf8d9fe5563ea5 Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Fri, 9 Aug 2024 14:06:06 +0530 Subject: [PATCH 14/16] Apply suggestions from code review Co-authored-by: Mihir Gandhi --- services/dedup/badger/badger_test.go | 21 +++++++++------------ services/dedup/dedup_test.go | 6 ++---- services/dedup/mirrorBadger/mirrorBadger.go | 1 + services/dedup/mirrorScylla/mirrorScylla.go | 1 + 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/services/dedup/badger/badger_test.go b/services/dedup/badger/badger_test.go index 05d7521a8b..81271b0cd7 100644 --- a/services/dedup/badger/badger_test.go +++ b/services/dedup/badger/badger_test.go @@ -1,7 +1,6 @@ package badger import ( - "os" "testing" "github.com/stretchr/testify/require" @@ -18,9 +17,7 @@ func Test_Badger(t *testing.T) { logger.Reset() misc.Init() - dbPath := os.TempDir() + "/dedup_test" - defer func() { _ = os.RemoveAll(dbPath) }() - _ = os.RemoveAll(dbPath) + dbPath := t.TempDir() conf := config.New() t.Setenv("RUDDER_TMPDIR", dbPath) badger := NewBadgerDB(conf, stats.NOP, DefaultPath()) @@ -29,23 +26,23 @@ func Test_Badger(t *testing.T) { t.Run("Same messageID should be deduped from badger", func(t *testing.T) { key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} - found, _, err := badger.Get(key1) - require.Nil(t, err) - require.True(t, found) + notAvailable, _, err := badger.Get(key1) + require.NoError(t, err) + require.True(t, notAvailable) err = badger.Commit([]string{key1.Key}) require.NoError(t, err) - found, _, err = badger.Get(key2) - require.Nil(t, err) - require.False(t, found) + notAvailable, _, err = badger.Get(key2) + require.NoError(t, err) + require.False(t, notAvailable) }) t.Run("Same messageID should be deduped from cache", func(t *testing.T) { key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} key2 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} found, _, err := badger.Get(key1) - require.Nil(t, err) + require.NoError(t, err) require.True(t, found) found, _, err = badger.Get(key2) - require.Nil(t, err) + require.NoError(t, err) require.False(t, found) }) } diff --git a/services/dedup/dedup_test.go b/services/dedup/dedup_test.go index c6a8aee66b..7b5bd79a9b 100644 --- a/services/dedup/dedup_test.go +++ b/services/dedup/dedup_test.go @@ -51,9 +51,7 @@ func Test_Dedup(t *testing.T) { logger.Reset() misc.Init() - dbPath := os.TempDir() + "/dedup_test" - defer func() { _ = os.RemoveAll(dbPath) }() - _ = os.RemoveAll(dbPath) + dbPath := t.TempDir() conf := config.New() t.Setenv("RUDDER_TMPDIR", dbPath) pool, err := dockertest.NewPool("") @@ -70,7 +68,7 @@ func Test_Dedup(t *testing.T) { t.Run("if message id is not present in cache and badger db", func(t *testing.T) { found, _, err := d.Get(types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"}) - require.Nil(t, err) + require.NoError(t, err) require.Equal(t, true, found) // Checking it again should give us the previous value from the cache diff --git a/services/dedup/mirrorBadger/mirrorBadger.go b/services/dedup/mirrorBadger/mirrorBadger.go index f67b5476f9..efcd3a6dcd 100644 --- a/services/dedup/mirrorBadger/mirrorBadger.go +++ b/services/dedup/mirrorBadger/mirrorBadger.go @@ -31,6 +31,7 @@ func (mb *MirrorBadger) Close() { } func (mb *MirrorBadger) Get(kv types.KeyValue) (bool, int64, error) { + _, _, _ = mb.scylla.Get(kv) return mb.badger.Get(kv) } diff --git a/services/dedup/mirrorScylla/mirrorScylla.go b/services/dedup/mirrorScylla/mirrorScylla.go index f7117d4518..751534f44b 100644 --- a/services/dedup/mirrorScylla/mirrorScylla.go +++ b/services/dedup/mirrorScylla/mirrorScylla.go @@ -31,6 +31,7 @@ func (ms *MirrorScylla) Close() { } func (ms *MirrorScylla) Get(kv types.KeyValue) (bool, int64, error) { + _, _, _ = ms.badger.Get(kv) return ms.scylla.Get(kv) } From d44779f4266c1a668f175063f09efa728a71e7f6 Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Mon, 12 Aug 2024 15:16:35 +0530 Subject: [PATCH 15/16] chore: add more tests --- router/network.go | 9 +++------ services/dedup/mirrorBadger/mirrorBadger_test.go | 6 ++++++ services/dedup/mirrorScylla/mirrorScylla_test.go | 6 ++++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/router/network.go b/router/network.go index ec90132b97..90fd48ca8c 100644 --- a/router/network.go +++ b/router/network.go @@ -173,10 +173,7 @@ func (network *netHandle) SendPost(ctx context.Context, structData integrations. // support of array type in params is handled if the // response from transformers are "," separated queryParams := req.URL.Query() - for key, val := range requestQueryParams { // list := strings.Split(valString, ",") - // for _, listItem := range list { - // queryParams.Add(key, fmt.Sprint(listItem)) - // } + for key, val := range requestQueryParams { formattedVal := handleQueryParam(val) queryParams.Add(key, formattedVal) } @@ -195,7 +192,7 @@ func (network *netHandle) SendPost(ctx context.Context, structData integrations. if err != nil { return &utils.SendPostResponse{ StatusCode: http.StatusGatewayTimeout, - ResponseBody: []byte(fmt.Sprintf(`504 Unable to make %q request for URL : %q. Error: %s`, requestMethod, postInfo.URL, err.Error())), + ResponseBody: []byte(fmt.Sprintf(`504 Unable to make %q request for URL : %q. Error: %v`, requestMethod, postInfo.URL, err)), } } @@ -205,7 +202,7 @@ func (network *netHandle) SendPost(ctx context.Context, structData integrations. if err != nil { return &utils.SendPostResponse{ StatusCode: resp.StatusCode, - ResponseBody: []byte(fmt.Sprintf(`Failed to read response body for request for URL : %q. Error: %s`, postInfo.URL, err.Error())), + ResponseBody: []byte(fmt.Sprintf(`Failed to read response body for request for URL : %q. Error: %v`, postInfo.URL, err)), } } network.logger.Debug(postInfo.URL, " : ", req.Proto, " : ", resp.Proto, resp.ProtoMajor, resp.ProtoMinor, resp.ProtoAtLeast) diff --git a/services/dedup/mirrorBadger/mirrorBadger_test.go b/services/dedup/mirrorBadger/mirrorBadger_test.go index 0159f230ec..d37315a605 100644 --- a/services/dedup/mirrorBadger/mirrorBadger_test.go +++ b/services/dedup/mirrorBadger/mirrorBadger_test.go @@ -49,6 +49,12 @@ func Test_MirrorBadger(t *testing.T) { found, _, err = mirrorBadger.Get(key2) require.Nil(t, err) require.False(t, found) + found, _, err = mirrorBadger.scylla.Get(key1) + require.Nil(t, err) + require.False(t, found) + found, _, err = mirrorBadger.badger.Get(key1) + require.Nil(t, err) + require.False(t, found) }) t.Run("Same messageID should be deduped from cache", func(t *testing.T) { key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} diff --git a/services/dedup/mirrorScylla/mirrorScylla_test.go b/services/dedup/mirrorScylla/mirrorScylla_test.go index 57b041429c..fed7d70206 100644 --- a/services/dedup/mirrorScylla/mirrorScylla_test.go +++ b/services/dedup/mirrorScylla/mirrorScylla_test.go @@ -63,6 +63,12 @@ func Test_MirrorBadger(t *testing.T) { found, _, err = mirrorScylla.Get(key2) require.Nil(t, err) require.False(t, found) + found, _, err = mirrorScylla.scylla.Get(key1) + require.Nil(t, err) + require.False(t, found) + found, _, err = mirrorScylla.badger.Get(key1) + require.Nil(t, err) + require.False(t, found) }) t.Run("Same messageID should be deduped for same workspace from cache", func(t *testing.T) { key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} From 1eba0cbda30e9a8b2e092e377d7574f953a23f0c Mon Sep 17 00:00:00 2001 From: Rohith BCS Date: Tue, 13 Aug 2024 12:23:04 +0530 Subject: [PATCH 16/16] chore: review comments --- processor/processor.go | 2 +- services/dedup/badger/badger_test.go | 8 ++++---- services/dedup/dedup_test.go | 20 +++++++++---------- services/dedup/mirrorBadger/mirrorBadger.go | 7 ++++++- .../dedup/mirrorBadger/mirrorBadger_test.go | 10 +++++----- services/dedup/mirrorScylla/mirrorScylla.go | 7 ++++++- .../dedup/mirrorScylla/mirrorScylla_test.go | 14 ++++++------- services/dedup/scylla/scylla.go | 18 ++++++++--------- services/dedup/scylla/scylla_test.go | 20 +++++++++---------- services/dedup/types/types.go | 2 +- 10 files changed, 59 insertions(+), 49 deletions(-) diff --git a/processor/processor.go b/processor/processor.go index 3783a33a71..575737af21 100644 --- a/processor/processor.go +++ b/processor/processor.go @@ -1713,7 +1713,7 @@ func (proc *Handle) processJobsForDest(partition string, subJobs subJob) *transf p := payloadFunc() messageSize := int64(len(p)) dedupKey := fmt.Sprintf("%v%v", messageId, eventParams.SourceJobRunId) - ok, previousSize, err := proc.dedup.Get(dedupTypes.KeyValue{Key: dedupKey, Value: messageSize, WorkspaceId: batchEvent.WorkspaceId}) + ok, previousSize, err := proc.dedup.Get(dedupTypes.KeyValue{Key: dedupKey, Value: messageSize, WorkspaceID: batchEvent.WorkspaceId}) if err != nil { panic(err) } diff --git a/services/dedup/badger/badger_test.go b/services/dedup/badger/badger_test.go index 81271b0cd7..f97bb4ced7 100644 --- a/services/dedup/badger/badger_test.go +++ b/services/dedup/badger/badger_test.go @@ -24,8 +24,8 @@ func Test_Badger(t *testing.T) { require.NotNil(t, badger) defer badger.Close() t.Run("Same messageID should be deduped from badger", func(t *testing.T) { - key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} - key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} + key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceID: "test"} + key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceID: "test"} notAvailable, _, err := badger.Get(key1) require.NoError(t, err) require.True(t, notAvailable) @@ -36,8 +36,8 @@ func Test_Badger(t *testing.T) { require.False(t, notAvailable) }) t.Run("Same messageID should be deduped from cache", func(t *testing.T) { - key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} - key2 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} + key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceID: "test"} + key2 := types.KeyValue{Key: "b", Value: 1, WorkspaceID: "test"} found, _, err := badger.Get(key1) require.NoError(t, err) require.True(t, found) diff --git a/services/dedup/dedup_test.go b/services/dedup/dedup_test.go index 7b5bd79a9b..afe98ae369 100644 --- a/services/dedup/dedup_test.go +++ b/services/dedup/dedup_test.go @@ -67,33 +67,33 @@ func Test_Dedup(t *testing.T) { defer d.Close() t.Run("if message id is not present in cache and badger db", func(t *testing.T) { - found, _, err := d.Get(types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"}) + found, _, err := d.Get(types.KeyValue{Key: "a", Value: 1, WorkspaceID: "test"}) require.NoError(t, err) require.Equal(t, true, found) // Checking it again should give us the previous value from the cache - found, value, err := d.Get(types.KeyValue{Key: "a", Value: 2, WorkspaceId: "test"}) + found, value, err := d.Get(types.KeyValue{Key: "a", Value: 2, WorkspaceID: "test"}) require.Nil(t, err) require.Equal(t, false, found) require.Equal(t, int64(1), value) }) t.Run("if message is committed, previous value should always return", func(t *testing.T) { - found, _, err := d.Get(types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"}) + found, _, err := d.Get(types.KeyValue{Key: "b", Value: 1, WorkspaceID: "test"}) require.Nil(t, err) require.Equal(t, true, found) err = d.Commit([]string{"a"}) require.NoError(t, err) - found, value, err := d.Get(types.KeyValue{Key: "b", Value: 2, WorkspaceId: "test"}) + found, value, err := d.Get(types.KeyValue{Key: "b", Value: 2, WorkspaceID: "test"}) require.Nil(t, err) require.Equal(t, false, found) require.Equal(t, int64(1), value) }) t.Run("committing a messageid not present in cache", func(t *testing.T) { - found, _, err := d.Get(types.KeyValue{Key: "c", Value: 1, WorkspaceId: "test"}) + found, _, err := d.Get(types.KeyValue{Key: "c", Value: 1, WorkspaceID: "test"}) require.Nil(t, err) require.Equal(t, true, found) @@ -119,19 +119,19 @@ func Test_Dedup_Window(t *testing.T) { require.Nil(t, err) defer d.Close() - found, _, err := d.Get(types.KeyValue{Key: "to be deleted", Value: 1, WorkspaceId: "test"}) + found, _, err := d.Get(types.KeyValue{Key: "to be deleted", Value: 1, WorkspaceID: "test"}) require.Nil(t, err) require.Equal(t, true, found) err = d.Commit([]string{"to be deleted"}) require.NoError(t, err) - found, _, err = d.Get(types.KeyValue{Key: "to be deleted", Value: 2, WorkspaceId: "test"}) + found, _, err = d.Get(types.KeyValue{Key: "to be deleted", Value: 2, WorkspaceID: "test"}) require.Nil(t, err) require.Equal(t, false, found) require.Eventually(t, func() bool { - found, _, err = d.Get(types.KeyValue{Key: "to be deleted", Value: 3, WorkspaceId: "test"}) + found, _, err = d.Get(types.KeyValue{Key: "to be deleted", Value: 3, WorkspaceID: "test"}) require.Nil(t, err) return found }, 2*time.Second, 100*time.Millisecond) @@ -155,7 +155,7 @@ func Test_Dedup_ErrTxnTooBig(t *testing.T) { for i := 0; i < size; i++ { key := uuid.New().String() messages[i] = key - _, _, _ = d.Get(types.KeyValue{Key: key, Value: int64(i + 1), WorkspaceId: "test"}) + _, _, _ = d.Get(types.KeyValue{Key: key, Value: int64(i + 1), WorkspaceID: "test"}) } err = d.Commit(messages) require.NoError(t, err) @@ -185,7 +185,7 @@ func Benchmark_Dedup(b *testing.B) { msgIDs[i%batchSize] = types.KeyValue{ Key: key, Value: int64(i + 1), - WorkspaceId: "test", + WorkspaceID: "test", } keys = append(keys, key) if i%batchSize == batchSize-1 || i == b.N-1 { diff --git a/services/dedup/mirrorBadger/mirrorBadger.go b/services/dedup/mirrorBadger/mirrorBadger.go index efcd3a6dcd..a3c5f4fe45 100644 --- a/services/dedup/mirrorBadger/mirrorBadger.go +++ b/services/dedup/mirrorBadger/mirrorBadger.go @@ -11,6 +11,7 @@ import ( type MirrorBadger struct { badger *badger.Dedup scylla *scylla.ScyllaDB + stat stats.Stats } func NewMirrorBadger(conf *config.Config, stats stats.Stats) (*MirrorBadger, error) { @@ -22,6 +23,7 @@ func NewMirrorBadger(conf *config.Config, stats stats.Stats) (*MirrorBadger, err return &MirrorBadger{ badger: badger, scylla: scylla, + stat: stats, }, nil } @@ -31,7 +33,10 @@ func (mb *MirrorBadger) Close() { } func (mb *MirrorBadger) Get(kv types.KeyValue) (bool, int64, error) { - _, _, _ = mb.scylla.Get(kv) + _, _, err := mb.scylla.Get(kv) + if err != nil { + mb.stat.NewTaggedStat("dedup_mirror_badger_get_error", stats.CountType, stats.Tags{}).Increment() + } return mb.badger.Get(kv) } diff --git a/services/dedup/mirrorBadger/mirrorBadger_test.go b/services/dedup/mirrorBadger/mirrorBadger_test.go index d37315a605..bff74475b3 100644 --- a/services/dedup/mirrorBadger/mirrorBadger_test.go +++ b/services/dedup/mirrorBadger/mirrorBadger_test.go @@ -22,7 +22,7 @@ func Test_MirrorBadger(t *testing.T) { logger.Reset() misc.Init() - dbPath := os.TempDir() + "/dedup_test" + dbPath := t.TempDir() + "/dedup_test" defer func() { _ = os.RemoveAll(dbPath) }() _ = os.RemoveAll(dbPath) conf := config.New() @@ -39,8 +39,8 @@ func Test_MirrorBadger(t *testing.T) { require.NotNil(t, mirrorBadger) defer mirrorBadger.Close() t.Run("Same messageID should be deduped from badger", func(t *testing.T) { - key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} - key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} + key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceID: "test"} + key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceID: "test"} found, _, err := mirrorBadger.Get(key1) require.Nil(t, err) require.True(t, found) @@ -57,8 +57,8 @@ func Test_MirrorBadger(t *testing.T) { require.False(t, found) }) t.Run("Same messageID should be deduped from cache", func(t *testing.T) { - key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} - key2 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} + key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceID: "test"} + key2 := types.KeyValue{Key: "b", Value: 1, WorkspaceID: "test"} found, _, err := mirrorBadger.Get(key1) require.Nil(t, err) require.True(t, found) diff --git a/services/dedup/mirrorScylla/mirrorScylla.go b/services/dedup/mirrorScylla/mirrorScylla.go index 751534f44b..918a0850aa 100644 --- a/services/dedup/mirrorScylla/mirrorScylla.go +++ b/services/dedup/mirrorScylla/mirrorScylla.go @@ -11,6 +11,7 @@ import ( type MirrorScylla struct { badger *badger.Dedup scylla *scylla.ScyllaDB + stat stats.Stats } func NewMirrorScylla(conf *config.Config, stats stats.Stats) (*MirrorScylla, error) { @@ -20,6 +21,7 @@ func NewMirrorScylla(conf *config.Config, stats stats.Stats) (*MirrorScylla, err return nil, err } return &MirrorScylla{ + stat: stats, badger: badger, scylla: scylla, }, nil @@ -31,7 +33,10 @@ func (ms *MirrorScylla) Close() { } func (ms *MirrorScylla) Get(kv types.KeyValue) (bool, int64, error) { - _, _, _ = ms.badger.Get(kv) + _, _, err := ms.badger.Get(kv) + if err != nil { + ms.stat.NewTaggedStat("dedup_mirror_scylla_get_error", stats.CountType, stats.Tags{}).Increment() + } return ms.scylla.Get(kv) } diff --git a/services/dedup/mirrorScylla/mirrorScylla_test.go b/services/dedup/mirrorScylla/mirrorScylla_test.go index fed7d70206..6d8e28f823 100644 --- a/services/dedup/mirrorScylla/mirrorScylla_test.go +++ b/services/dedup/mirrorScylla/mirrorScylla_test.go @@ -22,7 +22,7 @@ func Test_MirrorBadger(t *testing.T) { logger.Reset() misc.Init() - dbPath := os.TempDir() + "/dedup_test" + dbPath := t.TempDir() + "/dedup_test" defer func() { _ = os.RemoveAll(dbPath) }() _ = os.RemoveAll(dbPath) conf := config.New() @@ -39,8 +39,8 @@ func Test_MirrorBadger(t *testing.T) { require.NotNil(t, mirrorScylla) defer mirrorScylla.Close() t.Run("Same messageID should not be deduped for different workspace", func(t *testing.T) { - key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test1"} - key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test2"} + key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceID: "test1"} + key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceID: "test2"} found, _, err := mirrorScylla.Get(key1) require.Nil(t, err) require.True(t, found) @@ -53,8 +53,8 @@ func Test_MirrorBadger(t *testing.T) { require.NoError(t, err) }) t.Run("Same messageID should be deduped for same workspace", func(t *testing.T) { - key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} - key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} + key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceID: "test"} + key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceID: "test"} found, _, err := mirrorScylla.Get(key1) require.Nil(t, err) require.True(t, found) @@ -71,8 +71,8 @@ func Test_MirrorBadger(t *testing.T) { require.False(t, found) }) t.Run("Same messageID should be deduped for same workspace from cache", func(t *testing.T) { - key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} - key2 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} + key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceID: "test"} + key2 := types.KeyValue{Key: "b", Value: 1, WorkspaceID: "test"} found, _, err := mirrorScylla.Get(key1) require.Nil(t, err) require.True(t, found) diff --git a/services/dedup/scylla/scylla.go b/services/dedup/scylla/scylla.go index 10e80d98c6..51ca33ca97 100644 --- a/services/dedup/scylla/scylla.go +++ b/services/dedup/scylla/scylla.go @@ -38,18 +38,18 @@ func (d *ScyllaDB) Get(kv types.KeyValue) (bool, int64, error) { // Create the table if it doesn't exist var err error d.createTableMu.Lock() - once, found := d.createTableMap[kv.WorkspaceId] + once, found := d.createTableMap[kv.WorkspaceID] if !found { - d.createTableMap[kv.WorkspaceId] = &sync.Once{} - once = d.createTableMap[kv.WorkspaceId] + d.createTableMap[kv.WorkspaceID] = &sync.Once{} + once = d.createTableMap[kv.WorkspaceID] } d.createTableMu.Unlock() once.Do(func() { - query := fmt.Sprintf("CREATE TABLE IF NOT EXISTS %s.%s (id text PRIMARY KEY,size bigint) WITH bloom_filter_fp_chance = 0.005;", d.keyspace, kv.WorkspaceId) + query := fmt.Sprintf("CREATE TABLE IF NOT EXISTS %s.%s (id text PRIMARY KEY,size bigint) WITH bloom_filter_fp_chance = 0.005;", d.keyspace, kv.WorkspaceID) err = d.scylla.Query(query).Exec() }) if err != nil { - return false, 0, fmt.Errorf("error creating table %s: %v", kv.WorkspaceId, err) + return false, 0, fmt.Errorf("creating table %s: %v", kv.WorkspaceID, err) } d.cacheMu.Lock() @@ -63,7 +63,7 @@ func (d *ScyllaDB) Get(kv types.KeyValue) (bool, int64, error) { // Check if the key exists in the DB var value int64 - err = d.scylla.Query(fmt.Sprintf("SELECT size FROM %s.%s WHERE id = ?", d.keyspace, kv.WorkspaceId), kv.Key).Scan(&value) + err = d.scylla.Query(fmt.Sprintf("SELECT size FROM %s.%s WHERE id = ?", d.keyspace, kv.WorkspaceID), kv.Key).Scan(&value) if err != nil && !errors.Is(err, gocql.ErrNotFound) { return false, 0, fmt.Errorf("error getting key %s: %v", kv.Key, err) } @@ -83,11 +83,11 @@ func (d *ScyllaDB) Commit(keys []string) error { d.cacheMu.Unlock() return fmt.Errorf("key %v has not been previously set", key) } - kvs[i] = types.KeyValue{Key: key, Value: value.Value, WorkspaceId: value.WorkspaceId} + kvs[i] = types.KeyValue{Key: key, Value: value.Value, WorkspaceID: value.WorkspaceID} } d.cacheMu.Unlock() keysList := lo.PartitionBy(kvs, func(kv types.KeyValue) string { - return kv.WorkspaceId + return kv.WorkspaceID }) for _, keysPerWorkspace := range keysList { batches := lo.Chunk(keysPerWorkspace, d.batchSize) @@ -95,7 +95,7 @@ func (d *ScyllaDB) Commit(keys []string) error { scyllaBatch := d.scylla.NewBatch(gocql.LoggedBatch) for _, key := range batch { scyllaBatch.Entries = append(scyllaBatch.Entries, gocql.BatchEntry{ - Stmt: fmt.Sprintf("INSERT INTO %s.%s (id,size) VALUES (?,?) USING TTL %d", d.keyspace, key.WorkspaceId, d.ttl), + Stmt: fmt.Sprintf("INSERT INTO %s.%s (id,size) VALUES (?,?) USING TTL %d", d.keyspace, key.WorkspaceID, d.ttl), Args: []interface{}{key.Key, key.Value}, }) } diff --git a/services/dedup/scylla/scylla_test.go b/services/dedup/scylla/scylla_test.go index d695a8c366..f4acbb7d56 100644 --- a/services/dedup/scylla/scylla_test.go +++ b/services/dedup/scylla/scylla_test.go @@ -28,8 +28,8 @@ func Test_Scylla(t *testing.T) { require.NotNil(t, scylla) defer scylla.Close() t.Run("Same messageID should not be deduped for different workspace", func(t *testing.T) { - key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test1"} - key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test2"} + key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceID: "test1"} + key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceID: "test2"} found, _, err := scylla.Get(key1) require.Nil(t, err) require.True(t, found) @@ -42,25 +42,25 @@ func Test_Scylla(t *testing.T) { require.NoError(t, err) }) t.Run("Same messageID should be deduped for same workspace", func(t *testing.T) { - key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} - key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceId: "test"} + key1 := types.KeyValue{Key: "a", Value: 1, WorkspaceID: "test"} + key2 := types.KeyValue{Key: "a", Value: 1, WorkspaceID: "test"} found, _, err := scylla.Get(key1) - require.Nil(t, err) + require.NoError(t, err) require.True(t, found) err = scylla.Commit([]string{key1.Key}) require.NoError(t, err) found, _, err = scylla.Get(key2) - require.Nil(t, err) + require.NoError(t, err) require.False(t, found) }) t.Run("Same messageID should be deduped for same workspace from cache", func(t *testing.T) { - key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} - key2 := types.KeyValue{Key: "b", Value: 1, WorkspaceId: "test"} + key1 := types.KeyValue{Key: "b", Value: 1, WorkspaceID: "test"} + key2 := types.KeyValue{Key: "b", Value: 1, WorkspaceID: "test"} found, _, err := scylla.Get(key1) - require.Nil(t, err) + require.NoError(t, err) require.True(t, found) found, _, err = scylla.Get(key2) - require.Nil(t, err) + require.NoError(t, err) require.False(t, found) }) } diff --git a/services/dedup/types/types.go b/services/dedup/types/types.go index 7ca5fcceec..96f313fa1f 100644 --- a/services/dedup/types/types.go +++ b/services/dedup/types/types.go @@ -3,5 +3,5 @@ package types type KeyValue struct { Key string Value int64 - WorkspaceId string + WorkspaceID string }