retry with backoff requests to CAPI (#1957)

* backoff on refresh token error

* fix tls communication with lapi and user/pw auth (#1956)

allow self-signed TLS encryption with user/pw auth

docker:
 - remove defaults for certificate file locations
 - new envvar INSECURE_SKIP_VERIFY
 - register agent before TLS settings (cscli machine add removes them
   from the credentials file)

* separate cscli cobra constructors:  lapi, machines, bouncers, postoverflows (#1945)

* use feature toggling to improve testability with http retry backoff

* Add parse unix to dateparse enricher (#1958)

Add parse unix is we do have a strTime but wasnt parsed using convential golang time

* func tests: redirect stderr to filter extra logs (#1961)

* backoff on refresh token error

* use feature toggling to improve testability with http retry backoff

* refactor feature backoff toggle for tests

Co-authored-by: mmetc <92726601+mmetc@users.noreply.github.com>
Co-authored-by: Laurence Jones <laurence.jones@live.co.uk>
This commit is contained in:
Cristian Nitescu 2023-01-09 14:49:21 +01:00 committed by GitHub
parent a84e4b6b15
commit 7284c0a47a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 107 additions and 25 deletions

View file

@ -10,6 +10,7 @@ on:
env:
PREFIX_TEST_NAMES_WITH_FILE: true
jobs:
build:

View file

@ -16,6 +16,7 @@ on:
env:
RICHGO_FORCE_COLOR: 1
CROWDSEC_FEATURE_DISABLE_HTTP_RETRY_BACKOFF: true
jobs:

View file

@ -30,6 +30,7 @@ env:
# and to override our endpoint in aws sdk
AWS_ENDPOINT_FORCE: http://localhost:4566
KINESIS_INITIALIZE_STREAMS: "stream-1-shard:1,stream-2-shards:2"
CROWDSEC_FEATURE_DISABLE_HTTP_RETRY_BACKOFF: true
jobs:

View file

@ -3,6 +3,8 @@ package apiclient
import (
"bytes"
"encoding/json"
"math/rand"
"sync"
"time"
//"errors"
@ -12,6 +14,7 @@ import (
"net/http/httputil"
"net/url"
"github.com/crowdsecurity/crowdsec/pkg/fflag"
"github.com/crowdsecurity/crowdsec/pkg/models"
"github.com/go-openapi/strfmt"
"github.com/pkg/errors"
@ -75,6 +78,53 @@ func (t *APIKeyTransport) transport() http.RoundTripper {
return http.DefaultTransport
}
type retryRoundTripper struct {
next http.RoundTripper
maxAttempts int
retryStatusCodes []int
withBackOff bool
onBeforeRequest func(attempt int)
}
func (r retryRoundTripper) ShouldRetry(statusCode int) bool {
for _, code := range r.retryStatusCodes {
if code == statusCode {
return true
}
}
return false
}
func (r retryRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
var resp *http.Response
var err error
backoff := 0
for i := 0; i < r.maxAttempts; i++ {
if i > 0 {
if r.withBackOff && !fflag.DisableHttpRetryBackoff.IsEnabled() {
backoff += 10 + rand.Intn(20)
}
log.Infof("retrying in %d seconds (attempt %d of %d)", backoff, i+1, r.maxAttempts)
select {
case <-req.Context().Done():
return resp, req.Context().Err()
case <-time.After(time.Duration(backoff) * time.Second):
}
}
if r.onBeforeRequest != nil {
r.onBeforeRequest(i)
}
clonedReq := cloneRequest(req)
resp, err = r.next.RoundTrip(clonedReq)
if err == nil {
if !r.ShouldRetry(resp.StatusCode) {
return resp, nil
}
}
}
return resp, err
}
type JWTTransport struct {
MachineID *string
Password *strfmt.Password
@ -86,9 +136,9 @@ type JWTTransport struct {
UserAgent string
// Transport is the underlying HTTP transport to use when making requests.
// It will default to http.DefaultTransport if nil.
Transport http.RoundTripper
UpdateScenario func() ([]string, error)
NbRetry int
Transport http.RoundTripper
UpdateScenario func() ([]string, error)
refreshTokenMutex sync.Mutex
}
func (t *JWTTransport) refreshJwtToken() error {
@ -124,7 +174,14 @@ func (t *JWTTransport) refreshJwtToken() error {
return errors.Wrap(err, "could not create request")
}
req.Header.Add("Content-Type", "application/json")
client := &http.Client{}
client := &http.Client{
Transport: &retryRoundTripper{
next: http.DefaultTransport,
maxAttempts: 5,
withBackOff: true,
retryStatusCodes: []int{http.StatusTooManyRequests, http.StatusServiceUnavailable, http.StatusGatewayTimeout, http.StatusInternalServerError},
},
}
if t.UserAgent != "" {
req.Header.Add("User-Agent", t.UserAgent)
}
@ -150,6 +207,7 @@ func (t *JWTTransport) refreshJwtToken() error {
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
log.Debugf("received response status %q when fetching %v", resp.Status, req.URL)
err = CheckResponse(resp)
if err != nil {
return err
@ -170,25 +228,21 @@ func (t *JWTTransport) refreshJwtToken() error {
// RoundTrip implements the RoundTripper interface.
func (t *JWTTransport) RoundTrip(req *http.Request) (*http.Response, error) {
if t.NbRetry > 1 {
t.NbRetry = 0
return nil, fmt.Errorf("unable to refresh JWT token multiple times")
}
// in a few occasions several goroutines will execute refreshJwtToken concurrently which is useless and will cause overload on CAPI
// we use a mutex to avoid this
t.refreshTokenMutex.Lock()
if t.Token == "" || t.Expiration.Add(-time.Minute).Before(time.Now().UTC()) {
if err := t.refreshJwtToken(); err != nil {
t.refreshTokenMutex.Unlock()
return nil, err
}
}
t.refreshTokenMutex.Unlock()
if t.UserAgent != "" {
req.Header.Add("User-Agent", t.UserAgent)
}
// We must make a copy of the Request so
// that we don't modify the Request we were given. This is required by the
// specification of http.RoundTripper.
clonedReq := cloneRequest(req)
req.Header.Add("Authorization", fmt.Sprintf("Bearer %s", t.Token))
if log.GetLevel() >= log.TraceLevel {
@ -209,14 +263,6 @@ func (t *JWTTransport) RoundTrip(req *http.Request) (*http.Response, error) {
return resp, errors.Wrapf(err, "performing jwt auth")
}
if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
t.Token = ""
t.NbRetry++
return t.RoundTrip(clonedReq)
}
t.NbRetry = 0
log.Debugf("resp-jwt: %d", resp.StatusCode)
return resp, nil
@ -226,11 +272,39 @@ func (t *JWTTransport) Client() *http.Client {
return &http.Client{Transport: t}
}
func (t *JWTTransport) ResetToken() {
log.Debug("resetting jwt token")
t.refreshTokenMutex.Lock()
t.Token = ""
t.refreshTokenMutex.Unlock()
}
func (t *JWTTransport) transport() http.RoundTripper {
var transport http.RoundTripper
if t.Transport != nil {
return t.Transport
transport = t.Transport
} else {
transport = http.DefaultTransport
}
// a round tripper that retries once when the status is unauthorized and 5 times when infrastructure is overloaded
return &retryRoundTripper{
next: &retryRoundTripper{
next: transport,
maxAttempts: 5,
withBackOff: true,
retryStatusCodes: []int{http.StatusTooManyRequests, http.StatusServiceUnavailable, http.StatusGatewayTimeout},
},
maxAttempts: 2,
withBackOff: false,
retryStatusCodes: []int{http.StatusUnauthorized, http.StatusForbidden},
onBeforeRequest: func(attempt int) {
// reset the token only in the second attempt as this is when we know we had a 401 or 403
// the second attempt is supposed to refresh the token
if attempt > 0 {
t.ResetToken()
}
},
}
return http.DefaultTransport
}
// cloneRequest returns a clone of the provided *http.Request. The clone is a

View file

@ -234,5 +234,5 @@ func TestWatcherEnroll(t *testing.T) {
}
_, err = client.Auth.EnrollWatcher(context.Background(), "badkey", "", []string{}, false)
assert.Contains(t, err.Error(), "unable to refresh JWT token multiple times", "got %s", err.Error())
assert.Contains(t, err.Error(), "the attachment key provided is not valid", "got %s", err.Error())
}

View file

@ -51,7 +51,6 @@ func NewClient(config *Config) (*ApiClient, error) {
UserAgent: config.UserAgent,
VersionPrefix: config.VersionPrefix,
UpdateScenario: config.UpdateScenario,
NbRetry: 0,
}
tlsconfig := tls.Config{InsecureSkipVerify: InsecureSkipVerify}
tlsconfig.RootCAs = CaCertPool

View file

@ -3,12 +3,17 @@ package fflag
var Crowdsec = FeatureRegister{EnvPrefix: "CROWDSEC_FEATURE_"}
var CscliSetup = &Feature{Name: "cscli_setup"}
var DisableHttpRetryBackoff = &Feature{Name: "disable_http_retry_backoff", Description: "Disable http retry backoff"}
func RegisterAllFeatures() error {
err := Crowdsec.RegisterFeature(CscliSetup)
if err != nil {
return err
}
err = Crowdsec.RegisterFeature(DisableHttpRetryBackoff)
if err != nil {
return err
}
return nil
}

View file

@ -9,3 +9,4 @@ load "../lib/bats-assert/load.bash"
# mark the start of each test in the logs, beware crowdsec might be running
# echo "time=\"$(date +"%d-%m-%Y %H:%M:%S")\" level=info msg=\"TEST: ${BATS_TEST_DESCRIPTION}\"" >> /var/log/crowdsec.log
export CROWDSEC_FEATURE_DISABLE_HTTP_RETRY_BACKOFF=true