Skip to content

Commit

Permalink
fix flaky test: TestSpacemeshApp_NodeService (#4728)
Browse files Browse the repository at this point in the history
## Motivation
Closes: #4729

## Changes
- The app during initialization for set the logging level to `zapcore.InfoLevel` which overwrites what ever logging configuration is passed in via `WithLog`. This was fixed, the app still starts up with `zapcore.InfoLevel` by default.
- There has been an issue with logging in tests because the logging middleware for GRPC wasn't unset when the test ended which could result in components logging after the test ended and thereby failing a test. This is now done in `app.stopServices` (since it is set up in `app.startServices`)
- After making logs visible the issue was identified: the node is shut down while it verifies its own post proof. This only happens on the `macos-latest` runner because other runners might not have completed their initial post proof yet or are already done verifying it by that time.
   - Changed verifying to only emit and log errors when an error other than `context.Cancelled` is returned.
- These changes allowed me to run the test 100 times in a row without it failing a single time:

```
go test -v -timeout 150s -run ^TestSpacemeshApp_NodeService$ github.com/spacemeshos/go-spacemesh/node -count 100
```

passes without a single iteration of the test failing.

## Test Plan
n/a

## TODO
<!-- This section should be removed when all items are complete -->
- [x] Explain motivation or link existing issue(s)
- [x] Test changes and document test plan
- [x] Update documentation as needed

## DevOps Notes
<!-- Please uncheck these items as applicable to make DevOps aware of changes that may affect releases -->
- [x] This PR does not require configuration changes (e.g., environment variables, GitHub secrets, VM resources)
- [x] This PR does not affect public APIs
- [x] This PR does not rely on a new version of external services (PoET, elasticsearch, etc.)
- [x] This PR does not make changes to log messages (which monitoring infrastructure may rely on)
  • Loading branch information
fasmat committed Jul 20, 2023
1 parent 68d196b commit e91d735
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 14 deletions.
12 changes: 9 additions & 3 deletions activation/activation.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,13 +320,19 @@ func (b *Builder) verifyInitialPost(ctx context.Context, post *types.Post, metad
if err != nil {
b.log.With().Panic("failed to fetch commitment ATX ID.", log.Err(err))
}
if err := b.validator.Post(ctx, types.EpochID(0), b.nodeID, commitmentAtxId, post, metadata, b.postSetupProvider.LastOpts().NumUnits); err != nil {
err = b.validator.Post(ctx, types.EpochID(0), b.nodeID, commitmentAtxId, post, metadata, b.postSetupProvider.LastOpts().NumUnits)
switch {
case errors.Is(err, context.Canceled):
// If the context was canceled, we don't want to emit or log errors just propagate the cancellation signal.
return err
case err != nil:
events.EmitInvalidPostProof()
b.log.With().Fatal("initial POST proof is invalid. Probably the initialized POST data is corrupted. Please verify the data with postcli and regenerate the corrupted files.", log.Err(err))
return err
default:
b.initialPost = post
return nil
}
b.initialPost = post
return nil
}

func (b *Builder) receivePendingPoetClients() *[]PoetProvingServiceClient {
Expand Down
3 changes: 2 additions & 1 deletion node/bad_peer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ func NewApp(conf *config.Config) (*App, error) {
WithConfig(conf),
WithLog(log.RegisterHooks(
log.NewWithLevel("", zap.NewAtomicLevelAt(zapcore.DebugLevel)),
events.EventHook())),
events.EventHook()),
),
)

var err error
Expand Down
12 changes: 8 additions & 4 deletions node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,9 @@ func GetCommand() *cobra.Command {
// NOTE(dshulyak) this needs to be max level so that child logger can can be current level or below.
// otherwise it will fail later when child logger will try to increase level.
WithLog(log.RegisterHooks(
log.NewWithLevel("", zap.NewAtomicLevelAt(zapcore.DebugLevel)),
events.EventHook())),
log.NewWithLevel("node", zap.NewAtomicLevelAt(zap.InfoLevel)),
events.EventHook()),
),
)

run := func(ctx context.Context) error {
Expand Down Expand Up @@ -293,8 +294,7 @@ func New(opts ...Option) *App {
for _, opt := range opts {
opt(app)
}
lvl := zap.NewAtomicLevelAt(zap.InfoLevel)
log.SetupGlobal(app.log.SetLevel(&lvl))
log.SetupGlobal(app.log)
types.SetNetworkHRP(app.Config.NetworkHRP)
return app
}
Expand Down Expand Up @@ -1192,6 +1192,10 @@ func (app *App) stopServices(ctx context.Context) {
}

events.CloseEventReporter()
// SetGrpcLogger unfortunately is global
// this ensures that a test-logger isn't used after the app shuts down
// by e.g. a grpc connection to the node that is still open - like in TestSpacemeshApp_NodeService
grpczap.SetGrpcLoggerV2(grpclog, log.NewNop().Zap())
}

// LoadOrCreateEdSigner either loads a previously created ed identity for the node or creates a new one if not exists.
Expand Down
10 changes: 4 additions & 6 deletions node/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -410,24 +410,22 @@ func TestSpacemeshApp_JsonService(t *testing.T) {

// E2E app test of the stream endpoints in the NodeService.
func TestSpacemeshApp_NodeService(t *testing.T) {
t.Skip("flaky on macos-latest: https://github.com/spacemeshos/go-spacemesh/issues/4729")
// errlog should be used only for testing.
logger := logtest.New(t)
errlog := log.RegisterHooks(logtest.New(t, zap.ErrorLevel), events.EventHook())
logger := logtest.New(t, zapcore.ErrorLevel)
errlog := log.RegisterHooks(logger, events.EventHook()) // errlog is used to simulate errors in the app

// Use a unique port
port := 1240

app := New(WithLog(logger))
app.Config = getTestDefaultConfig(t)
app.Config.SMESHING.CoinbaseAccount = types.GenerateAddress([]byte{1}).String()
app.Config.SMESHING.Opts.DataDir, _ = os.MkdirTemp("", "sm-app-test-post-datadir")
app.Config.SMESHING.Opts.DataDir = t.TempDir()

clock, err := timesync.NewClock(
timesync.WithLayerDuration(1*time.Second),
timesync.WithTickInterval(100*time.Millisecond),
timesync.WithGenesisTime(time.Now()),
timesync.WithLogger(logtest.New(t)),
timesync.WithLogger(logger),
)
require.NoError(t, err)
app.clock = clock
Expand Down

0 comments on commit e91d735

Please sign in to comment.