From 7284c0a47afcad991314cfc0336a914e6b9d65cf Mon Sep 17 00:00:00 2001 From: Cristian Nitescu Date: Mon, 9 Jan 2023 14:49:21 +0100 Subject: [PATCH] retry with backoff requests to CAPI (#1957) * backoff on refresh token error * fix tls communication with lapi and user/pw auth (#1956) allow self-signed TLS encryption with user/pw auth docker: - remove defaults for certificate file locations - new envvar INSECURE_SKIP_VERIFY - register agent before TLS settings (cscli machine add removes them from the credentials file) * separate cscli cobra constructors: lapi, machines, bouncers, postoverflows (#1945) * use feature toggling to improve testability with http retry backoff * Add parse unix to dateparse enricher (#1958) Add parse unix is we do have a strTime but wasnt parsed using convential golang time * func tests: redirect stderr to filter extra logs (#1961) * backoff on refresh token error * use feature toggling to improve testability with http retry backoff * refactor feature backoff toggle for tests Co-authored-by: mmetc <92726601+mmetc@users.noreply.github.com> Co-authored-by: Laurence Jones --- .github/workflows/bats-mysql.yml | 1 + .github/workflows/go-tests-windows.yml | 1 + .github/workflows/go-tests.yml | 1 + pkg/apiclient/auth.go | 120 ++++++++++++++++++++----- pkg/apiclient/auth_service_test.go | 2 +- pkg/apiclient/client.go | 1 - pkg/fflag/crowdsec.go | 5 ++ tests/lib/setup.sh | 1 + 8 files changed, 107 insertions(+), 25 deletions(-) diff --git a/.github/workflows/bats-mysql.yml b/.github/workflows/bats-mysql.yml index 023a128fb..f398fd886 100644 --- a/.github/workflows/bats-mysql.yml +++ b/.github/workflows/bats-mysql.yml @@ -10,6 +10,7 @@ on: env: PREFIX_TEST_NAMES_WITH_FILE: true + jobs: build: diff --git a/.github/workflows/go-tests-windows.yml b/.github/workflows/go-tests-windows.yml index eb45f7e48..4d9e176af 100644 --- a/.github/workflows/go-tests-windows.yml +++ b/.github/workflows/go-tests-windows.yml @@ -16,6 +16,7 @@ on: env: RICHGO_FORCE_COLOR: 1 + CROWDSEC_FEATURE_DISABLE_HTTP_RETRY_BACKOFF: true jobs: diff --git a/.github/workflows/go-tests.yml b/.github/workflows/go-tests.yml index 6af27ada8..09f076afa 100644 --- a/.github/workflows/go-tests.yml +++ b/.github/workflows/go-tests.yml @@ -30,6 +30,7 @@ env: # and to override our endpoint in aws sdk AWS_ENDPOINT_FORCE: http://localhost:4566 KINESIS_INITIALIZE_STREAMS: "stream-1-shard:1,stream-2-shards:2" + CROWDSEC_FEATURE_DISABLE_HTTP_RETRY_BACKOFF: true jobs: diff --git a/pkg/apiclient/auth.go b/pkg/apiclient/auth.go index 747fa76d3..dc140f3ac 100644 --- a/pkg/apiclient/auth.go +++ b/pkg/apiclient/auth.go @@ -3,6 +3,8 @@ package apiclient import ( "bytes" "encoding/json" + "math/rand" + "sync" "time" //"errors" @@ -12,6 +14,7 @@ import ( "net/http/httputil" "net/url" + "github.com/crowdsecurity/crowdsec/pkg/fflag" "github.com/crowdsecurity/crowdsec/pkg/models" "github.com/go-openapi/strfmt" "github.com/pkg/errors" @@ -75,6 +78,53 @@ func (t *APIKeyTransport) transport() http.RoundTripper { return http.DefaultTransport } +type retryRoundTripper struct { + next http.RoundTripper + maxAttempts int + retryStatusCodes []int + withBackOff bool + onBeforeRequest func(attempt int) +} + +func (r retryRoundTripper) ShouldRetry(statusCode int) bool { + for _, code := range r.retryStatusCodes { + if code == statusCode { + return true + } + } + return false +} + +func (r retryRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { + var resp *http.Response + var err error + backoff := 0 + for i := 0; i < r.maxAttempts; i++ { + if i > 0 { + if r.withBackOff && !fflag.DisableHttpRetryBackoff.IsEnabled() { + backoff += 10 + rand.Intn(20) + } + log.Infof("retrying in %d seconds (attempt %d of %d)", backoff, i+1, r.maxAttempts) + select { + case <-req.Context().Done(): + return resp, req.Context().Err() + case <-time.After(time.Duration(backoff) * time.Second): + } + } + if r.onBeforeRequest != nil { + r.onBeforeRequest(i) + } + clonedReq := cloneRequest(req) + resp, err = r.next.RoundTrip(clonedReq) + if err == nil { + if !r.ShouldRetry(resp.StatusCode) { + return resp, nil + } + } + } + return resp, err +} + type JWTTransport struct { MachineID *string Password *strfmt.Password @@ -86,9 +136,9 @@ type JWTTransport struct { UserAgent string // Transport is the underlying HTTP transport to use when making requests. // It will default to http.DefaultTransport if nil. - Transport http.RoundTripper - UpdateScenario func() ([]string, error) - NbRetry int + Transport http.RoundTripper + UpdateScenario func() ([]string, error) + refreshTokenMutex sync.Mutex } func (t *JWTTransport) refreshJwtToken() error { @@ -124,7 +174,14 @@ func (t *JWTTransport) refreshJwtToken() error { return errors.Wrap(err, "could not create request") } req.Header.Add("Content-Type", "application/json") - client := &http.Client{} + client := &http.Client{ + Transport: &retryRoundTripper{ + next: http.DefaultTransport, + maxAttempts: 5, + withBackOff: true, + retryStatusCodes: []int{http.StatusTooManyRequests, http.StatusServiceUnavailable, http.StatusGatewayTimeout, http.StatusInternalServerError}, + }, + } if t.UserAgent != "" { req.Header.Add("User-Agent", t.UserAgent) } @@ -150,6 +207,7 @@ func (t *JWTTransport) refreshJwtToken() error { if resp.StatusCode < 200 || resp.StatusCode >= 300 { log.Debugf("received response status %q when fetching %v", resp.Status, req.URL) + err = CheckResponse(resp) if err != nil { return err @@ -170,25 +228,21 @@ func (t *JWTTransport) refreshJwtToken() error { // RoundTrip implements the RoundTripper interface. func (t *JWTTransport) RoundTrip(req *http.Request) (*http.Response, error) { - if t.NbRetry > 1 { - t.NbRetry = 0 - return nil, fmt.Errorf("unable to refresh JWT token multiple times") - } + // in a few occasions several goroutines will execute refreshJwtToken concurrently which is useless and will cause overload on CAPI + // we use a mutex to avoid this + t.refreshTokenMutex.Lock() if t.Token == "" || t.Expiration.Add(-time.Minute).Before(time.Now().UTC()) { if err := t.refreshJwtToken(); err != nil { + t.refreshTokenMutex.Unlock() return nil, err } } + t.refreshTokenMutex.Unlock() if t.UserAgent != "" { req.Header.Add("User-Agent", t.UserAgent) } - // We must make a copy of the Request so - // that we don't modify the Request we were given. This is required by the - // specification of http.RoundTripper. - clonedReq := cloneRequest(req) - req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", t.Token)) if log.GetLevel() >= log.TraceLevel { @@ -209,14 +263,6 @@ func (t *JWTTransport) RoundTrip(req *http.Request) (*http.Response, error) { return resp, errors.Wrapf(err, "performing jwt auth") } - if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { - t.Token = "" - t.NbRetry++ - return t.RoundTrip(clonedReq) - } - - t.NbRetry = 0 - log.Debugf("resp-jwt: %d", resp.StatusCode) return resp, nil @@ -226,11 +272,39 @@ func (t *JWTTransport) Client() *http.Client { return &http.Client{Transport: t} } +func (t *JWTTransport) ResetToken() { + log.Debug("resetting jwt token") + t.refreshTokenMutex.Lock() + t.Token = "" + t.refreshTokenMutex.Unlock() +} + func (t *JWTTransport) transport() http.RoundTripper { + var transport http.RoundTripper if t.Transport != nil { - return t.Transport + transport = t.Transport + } else { + transport = http.DefaultTransport + } + // a round tripper that retries once when the status is unauthorized and 5 times when infrastructure is overloaded + return &retryRoundTripper{ + next: &retryRoundTripper{ + next: transport, + maxAttempts: 5, + withBackOff: true, + retryStatusCodes: []int{http.StatusTooManyRequests, http.StatusServiceUnavailable, http.StatusGatewayTimeout}, + }, + maxAttempts: 2, + withBackOff: false, + retryStatusCodes: []int{http.StatusUnauthorized, http.StatusForbidden}, + onBeforeRequest: func(attempt int) { + // reset the token only in the second attempt as this is when we know we had a 401 or 403 + // the second attempt is supposed to refresh the token + if attempt > 0 { + t.ResetToken() + } + }, } - return http.DefaultTransport } // cloneRequest returns a clone of the provided *http.Request. The clone is a diff --git a/pkg/apiclient/auth_service_test.go b/pkg/apiclient/auth_service_test.go index f671868a8..515b684cd 100644 --- a/pkg/apiclient/auth_service_test.go +++ b/pkg/apiclient/auth_service_test.go @@ -234,5 +234,5 @@ func TestWatcherEnroll(t *testing.T) { } _, err = client.Auth.EnrollWatcher(context.Background(), "badkey", "", []string{}, false) - assert.Contains(t, err.Error(), "unable to refresh JWT token multiple times", "got %s", err.Error()) + assert.Contains(t, err.Error(), "the attachment key provided is not valid", "got %s", err.Error()) } diff --git a/pkg/apiclient/client.go b/pkg/apiclient/client.go index 4cead4ec2..9f431b49e 100644 --- a/pkg/apiclient/client.go +++ b/pkg/apiclient/client.go @@ -51,7 +51,6 @@ func NewClient(config *Config) (*ApiClient, error) { UserAgent: config.UserAgent, VersionPrefix: config.VersionPrefix, UpdateScenario: config.UpdateScenario, - NbRetry: 0, } tlsconfig := tls.Config{InsecureSkipVerify: InsecureSkipVerify} tlsconfig.RootCAs = CaCertPool diff --git a/pkg/fflag/crowdsec.go b/pkg/fflag/crowdsec.go index 89b1fed90..25e654682 100644 --- a/pkg/fflag/crowdsec.go +++ b/pkg/fflag/crowdsec.go @@ -3,12 +3,17 @@ package fflag var Crowdsec = FeatureRegister{EnvPrefix: "CROWDSEC_FEATURE_"} var CscliSetup = &Feature{Name: "cscli_setup"} +var DisableHttpRetryBackoff = &Feature{Name: "disable_http_retry_backoff", Description: "Disable http retry backoff"} func RegisterAllFeatures() error { err := Crowdsec.RegisterFeature(CscliSetup) if err != nil { return err } + err = Crowdsec.RegisterFeature(DisableHttpRetryBackoff) + if err != nil { + return err + } return nil } diff --git a/tests/lib/setup.sh b/tests/lib/setup.sh index 03d207d87..f322c2353 100755 --- a/tests/lib/setup.sh +++ b/tests/lib/setup.sh @@ -9,3 +9,4 @@ load "../lib/bats-assert/load.bash" # mark the start of each test in the logs, beware crowdsec might be running # echo "time=\"$(date +"%d-%m-%Y %H:%M:%S")\" level=info msg=\"TEST: ${BATS_TEST_DESCRIPTION}\"" >> /var/log/crowdsec.log +export CROWDSEC_FEATURE_DISABLE_HTTP_RETRY_BACKOFF=true \ No newline at end of file