Add bayesian bucket type (#2290)

2023-06-21 15:08:27 +02:00 · 2023-06-21 15:08:27 +02:00 · 40e6b205bc
parent da6106bd23
commit 40e6b205bc
14 changed files with 471 additions and 0 deletions
--- a/pkg/leakybucket/README.md
+++ b/pkg/leakybucket/README.md
@ -48,6 +48,15 @@ infinite leakspeed (it never overflows, nor leaks). Nevertheless,
 the event is raised after a fixed duration. The option is called
 duration.
 ## Bayesian
 A Bayesian bucket is a special bucket that runs Bayesian inference instead of
 counting events. Each condition must have its likelihoods specified in the
 yaml file under `prob_given_benign` and `prob_given_evil`. The bucket
 will continue evaluating events until the posterior goes above the
 threshold (triggering the overflow) or the duration (specified by leakspeed)
 expires.
 ## Available configuration options for buckets
 ### Fields for standard buckets
@ -102,6 +111,22 @@ Capacity and leakspeed are not relevant for this kind of bucket.
   Nevertheless, this kind of bucket is often used with an infinite
   leakspeed and an infinite capacity [capacity set to -1 for now].
 #### Bayesian
 * bayesian_prior: The prior to start with
 * bayesian_threshold: The threshold for the posterior to trigger the overflow.
 * bayesian_conditions: List of Bayesian conditions with likelihoods
 Bayesian Conditions are built from:
 * condition: The expr for this specific condition to be true
 * prob_given_evil: The likelihood an IP satisfies the condition given the fact
   that it is a malicious IP
 * prob_given_benign: The likelihood an IP satisfies the condition given the fact
   that it is a benign IP
 * guillotine: Bool to stop the condition from being evaluated again once it
   has evaluated to true; from then on the condition is counted as true.
   This should be used if evaluating the condition is computationally expensive.
 ## Add examples here
--- a/pkg/leakybucket/bayesian.go
+++ b/pkg/leakybucket/bayesian.go
@ -0,0 +1,163 @@
 package leakybucket
 import (
 	"fmt"
 	"github.com/antonmedv/expr"
 	"github.com/antonmedv/expr/vm"
 	"github.com/crowdsecurity/crowdsec/pkg/exprhelpers"
 	"github.com/crowdsecurity/crowdsec/pkg/types"
 )
 // RawBayesianCondition is one entry of the `bayesian_conditions` list as it is
 // written in the bucket YAML file.
 type RawBayesianCondition struct {
 	// ConditionalFilterName holds the expr source of the condition (YAML key `condition`).
 	ConditionalFilterName string  `yaml:"condition"`
 	// ProbGivenEvil is the likelihood that the condition is satisfied by a malicious IP.
 	ProbGivenEvil         float32 `yaml:"prob_given_evil"`
 	// ProbGivenBenign is the likelihood that the condition is satisfied by a benign IP.
 	ProbGivenBenign       float32 `yaml:"prob_given_benign"`
 	// Guillotine, when true, stops the condition from being evaluated again once it has evaluated to true.
 	Guillotine            bool    `yaml:"guillotine,omitempty"`
 }
 // BayesianEvent pairs a configured condition with its compiled program and the
 // runtime state of its guillotine.
 type BayesianEvent struct {
 	// rawCondition is the condition exactly as configured.
 	rawCondition             RawBayesianCondition
 	// conditionalFilterRuntime is the compiled form of rawCondition.ConditionalFilterName; set by compileCondition.
 	conditionalFilterRuntime *vm.Program
 	// guillotineState becomes true the first time the condition evaluates to true (see triggerGuillotine).
 	guillotineState          bool
 }
 // BayesianBucket is the processor that updates a posterior probability from a
 // list of conditions and overflows the bucket when it exceeds the threshold.
 type BayesianBucket struct {
 	// bayesianEventArray holds one compiled event per configured condition, in configuration order.
 	bayesianEventArray []*BayesianEvent
 	// prior is the starting probability, copied from the factory's BayesianPrior.
 	prior              float32
 	// threshold is the overflow threshold, copied from the factory's BayesianThreshold.
 	threshold          float32
 	// posterior is the running probability; it is reset to prior at the start of every pour.
 	posterior          float32
 	DumbProcessor
 }
 // updateProbability applies one step of Bayes' rule and returns the posterior
 // probability of the "evil" hypothesis.
 //
 // prior is the probability of "evil" before the observation; probGivenEvil and
 // probGivenBenign are the likelihoods of the observation under the "evil" and
 // "benign" hypotheses respectively.
 //
 // When the denominator is zero (for instance both likelihoods are 0) Bayes'
 // rule is undefined. The prior is then returned unchanged instead of NaN, so a
 // single degenerate condition cannot poison the updates that follow it.
 func updateProbability(prior, probGivenEvil, probGivenBenign float32) float32 {
 	numerator := probGivenEvil * prior
 	denominator := numerator + probGivenBenign*(1-prior)
 	if denominator == 0 {
 		return prior
 	}
 	return numerator / denominator
 }
 // OnBucketInit compiles every bayesian condition declared on the factory and
 // stores the resulting events, the prior and the threshold on the processor.
 //
 // It returns the first compilation error encountered, in which case the
 // processor is left unmodified, or nil on success.
 func (c *BayesianBucket) OnBucketInit(g *BucketFactory) error {
 	events := make([]*BayesianEvent, len(g.BayesianConditions))
 	conditionalExprCacheLock.Lock()
 	// Create the shared cache lazily while holding the lock, so the nil check
 	// and the assignment cannot race with another initialisation.
 	if conditionalExprCache == nil {
 		conditionalExprCache = make(map[string]vm.Program)
 	}
 	for index, bcond := range g.BayesianConditions {
 		event := &BayesianEvent{rawCondition: bcond}
 		// compileCondition is entered with the lock held. It returns with the
 		// lock held on success, but has already released it when compilation
 		// fails, so the error path must not unlock a second time.
 		if err := event.compileCondition(); err != nil {
 			return err
 		}
 		events[index] = event
 	}
 	conditionalExprCacheLock.Unlock()
 	c.bayesianEventArray = events
 	c.prior = g.BayesianPrior
 	c.threshold = g.BayesianThreshold
 	return nil
 }
 // AfterBucketPour builds the hook that is run once an event has been poured.
 //
 // The hook restarts from the prior, lets every bayesian event update the
 // posterior (a failing update is logged and skipped), and then compares the
 // posterior with the threshold. Above the threshold the bucket overflows: the
 // overflow timestamp is set, the queue is sent on l.Out and nil is returned.
 // Otherwise a pointer to the hook's copy of the event is returned.
 func (c *BayesianBucket) AfterBucketPour(b *BucketFactory) func(types.Event, *Leaky) *types.Event {
 	hook := func(msg types.Event, l *Leaky) *types.Event {
 		c.posterior = c.prior
 		l.logger.Debugf("starting bayesian evaluation with prior: %v", c.posterior)
 		for i := range c.bayesianEventArray {
 			current := c.bayesianEventArray[i]
 			if updateErr := current.bayesianUpdate(c, msg, l); updateErr != nil {
 				l.logger.Errorf("bayesian update failed for %s with %s", current.rawCondition.ConditionalFilterName, updateErr)
 			}
 		}
 		l.logger.Debugf("value of posterior after events : %v", c.posterior)
 		// Written as a negated ">" on purpose: a NaN posterior must not overflow.
 		if !(c.posterior > c.threshold) {
 			return &msg
 		}
 		l.logger.Debugf("Bayesian bucket overflow")
 		l.Ovflw_ts = l.Last_ts
 		l.Out <- l.Queue
 		return nil
 	}
 	return hook
 }
 // bayesianUpdate folds the outcome of this condition for msg into c.posterior.
 //
 // Nothing happens when no program was compiled for the condition. When the
 // guillotine is enabled and has already fired, the expression is not run and
 // the condition is counted as true. Otherwise the expression is run with
 // "evt", "queue" and "leaky" in its environment: a true result applies the
 // configured likelihoods and fires the guillotine, a false result applies
 // their complements.
 //
 // An error is returned when the expression fails to run or does not return a
 // bool; c.posterior is left untouched in that case.
 func (b *BayesianEvent) bayesianUpdate(c *BayesianBucket, msg types.Event, l *Leaky) error {
 	name := b.rawCondition.ConditionalFilterName
 	if b.conditionalFilterRuntime == nil {
 		l.logger.Tracef("empty conditional filter runtime for %s", name)
 		return nil
 	}
 	guillotined := b.getGuillotineState()
 	l.logger.Tracef("guillotine value for %s :  %v", name, guillotined)
 	if guillotined {
 		l.logger.Tracef("guillotine already triggered for %s", name)
 		l.logger.Tracef("condition true updating prior for: %s", name)
 		c.posterior = updateProbability(c.posterior, b.rawCondition.ProbGivenEvil, b.rawCondition.ProbGivenBenign)
 		l.logger.Tracef("new value of posterior : %v", c.posterior)
 		return nil
 	}
 	l.logger.Debugf("running condition expression: %s", name)
 	ret, err := expr.Run(b.conditionalFilterRuntime, map[string]interface{}{"evt": &msg, "queue": l.Queue, "leaky": l})
 	if err != nil {
 		// %w keeps the underlying error reachable through errors.Is / errors.As.
 		return fmt.Errorf("unable to run conditional filter: %w", err)
 	}
 	l.logger.Tracef("bayesian bucket expression %s returned : %v", name, ret)
 	condition, ok := ret.(bool)
 	if !ok {
 		return fmt.Errorf("bayesian condition unexpected non-bool return: %T", ret)
 	}
 	// Log the value of the condition (%v); %T would only ever print "bool".
 	l.logger.Tracef("condition %v updating prior for: %s", condition, name)
 	if condition {
 		c.posterior = updateProbability(c.posterior, b.rawCondition.ProbGivenEvil, b.rawCondition.ProbGivenBenign)
 		b.triggerGuillotine()
 	} else {
 		// The condition did not hold: use the complementary likelihoods.
 		c.posterior = updateProbability(c.posterior, 1-b.rawCondition.ProbGivenEvil, 1-b.rawCondition.ProbGivenBenign)
 	}
 	l.logger.Tracef("new value of posterior: %v", c.posterior)
 	return nil
 }
 // getGuillotineState reports whether the guillotine has fired for this event.
 // It is always false when the guillotine option is disabled for the condition.
 func (b *BayesianEvent) getGuillotineState() bool {
 	return b.rawCondition.Guillotine && b.guillotineState
 }
 // triggerGuillotine records that the condition has evaluated to true once.
 // The flag is set unconditionally; getGuillotineState only honours it when the
 // guillotine option is enabled for the condition.
 func (b *BayesianEvent) triggerGuillotine() {
 	b.guillotineState = true
 }
 // compileCondition sets b.conditionalFilterRuntime, either from the program
 // cached in conditionalExprCache under the same expression text or by compiling
 // the expression and storing the result in the cache.
 //
 // Locking: the function reads the cache and unlocks conditionalExprCacheLock
 // without locking it first, so it must be entered with the lock held. It
 // returns with the lock held on a cache hit and after a successful
 // compilation, but with the lock RELEASED when compilation fails.
 func (b *BayesianEvent) compileCondition() error {
 	var err error
 	var compiledExpr *vm.Program
 	// Cache hit: reuse the stored program; the lock stays held.
 	if compiled, ok := conditionalExprCache[b.rawCondition.ConditionalFilterName]; ok {
 		b.conditionalFilterRuntime = &compiled
 		return nil
 	}
 	conditionalExprCacheLock.Unlock()
 	// release the lock during compile, same as the conditional bucket
 	compiledExpr, err = expr.Compile(b.rawCondition.ConditionalFilterName, exprhelpers.GetExprOptions(map[string]interface{}{"queue": &Queue{}, "leaky": &Leaky{}, "evt": &types.Event{}})...)
 	if err != nil {
 		// The lock is not re-acquired on this path.
 		return fmt.Errorf("bayesian condition compile error: %w", err)
 	}
 	b.conditionalFilterRuntime = compiledExpr
 	// Re-acquire the lock before writing to the shared cache; it is still held on return.
 	conditionalExprCacheLock.Lock()
 	conditionalExprCache[b.rawCondition.ConditionalFilterName] = *compiledExpr
 	return nil
 }
--- a/pkg/leakybucket/bucket.go
+++ b/pkg/leakybucket/bucket.go
@ -191,6 +191,10 @@ func FromFactory(bucketFactory BucketFactory) *Leaky {
 		l.conditionalOverflow = true
 		l.Duration = l.BucketConfig.leakspeed
 	}
 	if l.BucketConfig.Type == "bayesian" {
 		l.Duration = l.BucketConfig.leakspeed
 	}
 	return l
 }
--- a/pkg/leakybucket/manager_load.go
+++ b/pkg/leakybucket/manager_load.go
@ -51,6 +51,9 @@ type BucketFactory struct {
 	Profiling           bool                      `yaml:"profiling"`           //Profiling, if true, will make the bucket record pours/overflows/etc.
 	OverflowFilter      string                    `yaml:"overflow_filter"`     //OverflowFilter if present, is a filter that must return true for the overflow to go through
 	ConditionalOverflow string                    `yaml:"condition"`           //condition if present, is an expression that must return true for the bucket to overflow
 	BayesianPrior       float32                   `yaml:"bayesian_prior"`
 	BayesianThreshold   float32                   `yaml:"bayesian_threshold"`
 	BayesianConditions  []RawBayesianCondition    `yaml:"bayesian_conditions"` //conditions for the bayesian bucket
 	ScopeType           types.ScopeType           `yaml:"scope,omitempty"`     //to enforce a different remediation than blocking an IP. Will default this to IP
 	BucketName          string                    `yaml:"-"`
 	Filename            string                    `yaml:"-"`
@ -120,6 +123,25 @@ func ValidateFactory(bucketFactory *BucketFactory) error {
 		if bucketFactory.leakspeed == 0 {
 			return fmt.Errorf("bad leakspeed for conditional bucket '%s'", bucketFactory.LeakSpeed)
 		}
 	} else if bucketFactory.Type == "bayesian" {
 		if bucketFactory.BayesianConditions == nil {
 			return fmt.Errorf("bayesian bucket must have bayesian conditions")
 		}
 		if bucketFactory.BayesianPrior == 0 {
 			return fmt.Errorf("bayesian bucket must have a valid, non-zero prior")
 		}
 		if bucketFactory.BayesianThreshold == 0 {
 			return fmt.Errorf("bayesian bucket must have a valid, non-zero threshold")
 		}
 		if bucketFactory.BayesianPrior > 1 {
 			return fmt.Errorf("bayesian bucket must have a valid, non-zero prior")
 		}
 		if bucketFactory.BayesianThreshold > 1 {
 			return fmt.Errorf("bayesian bucket must have a valid, non-zero threshold")
 		}
 		if bucketFactory.Capacity != -1 {
 			return fmt.Errorf("bayesian bucket must have capacity -1")
 		}
 	} else {
 		return fmt.Errorf("unknown bucket type '%s'", bucketFactory.Type)
 	}
@ -316,6 +338,8 @@ func LoadBucket(bucketFactory *BucketFactory, tomb *tomb.Tomb) error {
 		bucketFactory.processors = append(bucketFactory.processors, &DumbProcessor{})
 	case "conditional":
 		bucketFactory.processors = append(bucketFactory.processors, &DumbProcessor{})
 	case "bayesian":
 		bucketFactory.processors = append(bucketFactory.processors, &DumbProcessor{})
 	default:
 		return fmt.Errorf("invalid type '%s' in %s : %v", bucketFactory.Type, bucketFactory.Filename, err)
 	}
@ -355,6 +379,11 @@ func LoadBucket(bucketFactory *BucketFactory, tomb *tomb.Tomb) error {
 		bucketFactory.processors = append(bucketFactory.processors, &ConditionalOverflow{})
 	}
 	if bucketFactory.BayesianThreshold != 0 {
 		bucketFactory.logger.Tracef("Adding bayesian processor")
 		bucketFactory.processors = append(bucketFactory.processors, &BayesianBucket{})
 	}
 	if len(bucketFactory.Data) > 0 {
 		for _, data := range bucketFactory.Data {
 			if data.DestPath == "" {
--- a/pkg/leakybucket/manager_load_test.go
+++ b/pkg/leakybucket/manager_load_test.go
@ -119,3 +119,25 @@ func TestCounterBucketsConfig(t *testing.T) {
 	}
 }
 func TestBayesianBucketsConfig(t *testing.T) {
 	var CfgTests = []cfgTest{
 		//basic valid counter
 		{BucketFactory{Name: "test", Description: "test1", Type: "bayesian", Capacity: -1, Filter: "true", BayesianPrior: 0.5, BayesianThreshold: 0.5, BayesianConditions: []RawBayesianCondition{{ConditionalFilterName: "true", ProbGivenEvil: 0.5, ProbGivenBenign: 0.5}}}, true, true},
 		//bad capacity
 		{BucketFactory{Name: "test", Description: "test1", Type: "bayesian", Capacity: 1, Filter: "true", BayesianPrior: 0.5, BayesianThreshold: 0.5, BayesianConditions: []RawBayesianCondition{{ConditionalFilterName: "true", ProbGivenEvil: 0.5, ProbGivenBenign: 0.5}}}, false, false},
 		//missing prior
 		{BucketFactory{Name: "test", Description: "test1", Type: "bayesian", Capacity: -1, Filter: "true", BayesianThreshold: 0.5, BayesianConditions: []RawBayesianCondition{{ConditionalFilterName: "true", ProbGivenEvil: 0.5, ProbGivenBenign: 0.5}}}, false, false},
 		//missing threshold
 		{BucketFactory{Name: "test", Description: "test1", Type: "bayesian", Capacity: -1, Filter: "true", BayesianPrior: 0.5, BayesianConditions: []RawBayesianCondition{{ConditionalFilterName: "true", ProbGivenEvil: 0.5, ProbGivenBenign: 0.5}}}, false, false},
 		//bad prior
 		{BucketFactory{Name: "test", Description: "test1", Type: "bayesian", Capacity: -1, Filter: "true", BayesianPrior: 1.5, BayesianThreshold: 0.5, BayesianConditions: []RawBayesianCondition{{ConditionalFilterName: "true", ProbGivenEvil: 0.5, ProbGivenBenign: 0.5}}}, false, false},
 		//bad threshold
 		{BucketFactory{Name: "test", Description: "test1", Type: "bayesian", Capacity: -1, Filter: "true", BayesianPrior: 0.5, BayesianThreshold: 1.5, BayesianConditions: []RawBayesianCondition{{ConditionalFilterName: "true", ProbGivenEvil: 0.5, ProbGivenBenign: 0.5}}}, false, false},
 	}
 	if err := runTest(CfgTests); err != nil {
 		t.Fatalf("%s", err)
 	}
 }
--- a/pkg/leakybucket/tests/guillotine-bayesian-bucket/bucket.yaml
+++ b/pkg/leakybucket/tests/guillotine-bayesian-bucket/bucket.yaml
@ -0,0 +1,21 @@
 type: bayesian
 name: test/guillotine-bayesian
 debug: true
 description: "bayesian bucket"
 filter: "evt.Meta.log_type == 'http_access-log' || evt.Meta.log_type == 'ssh_access-log'"
 groupby: evt.Meta.source_ip
 bayesian_prior: 0.5
 bayesian_threshold: 0.8
 bayesian_conditions:
 - condition: evt.Meta.http_path == "/"
  prob_given_evil: 0.8
  prob_given_benign: 0.2
  guillotine : true
 - condition: evt.Meta.ssh_user == "admin"
  prob_given_evil: 0.9
  prob_given_benign: 0.5
  guillotine : true
 leakspeed: 30s
 capacity: -1
 labels:
  type: overflow_1
--- a/pkg/leakybucket/tests/guillotine-bayesian-bucket/scenarios.yaml
+++ b/pkg/leakybucket/tests/guillotine-bayesian-bucket/scenarios.yaml
@ -0,0 +1 @@
 - filename: {{.TestDirectory}}/bucket.yaml
--- a/pkg/leakybucket/tests/guillotine-bayesian-bucket/test.json
+++ b/pkg/leakybucket/tests/guillotine-bayesian-bucket/test.json
@ -0,0 +1,50 @@
 {
 	"lines": [
 	   {
 		  "Line": {
 			 "Labels": {
 				"type": "nginx"
 			 },
 			 "Raw": "don't care"
 		  },
 		  "MarshaledTime": "2020-01-01T10:00:00.000Z",
 		  "Meta": {
 			 "source_ip": "2a00:1450:4007:816::200e",
 			 "log_type": "http_access-log",
 			 "http_path": "/"
 		  }
 	   },
 	   {
 		"Line": {
 		   "Labels": {
 			  "type": "nginx"
 		   },
 		   "Raw": "don't care"
 		},
 		"MarshaledTime": "2020-01-01T10:00:00.000Z",
 		"Meta": {
 		   "source_ip": "2a00:1450:4007:816::200e",
 		   "log_type": "ssh_access-log",
 		   "ssh_user": "admin"
 		}
 	 }
 	],
 	"results": [
 	  {
 		"Type" : 1,
 		"Alert": {
 		  "sources" : {
 			"2a00:1450:4007:816::200e": {
 			  "ip": "2a00:1450:4007:816::200e",
 			  "scope": "Ip",
 			  "value": "2a00:1450:4007:816::200e"
 			}
 		  },
 		  "Alert" : {
 			"scenario": "test/guillotine-bayesian",
 			"events_count": 2
 		  }
 		}
 	  }
 	]
  }
--- a/pkg/leakybucket/tests/multiple-bayesian-bucket/bucket.yaml
+++ b/pkg/leakybucket/tests/multiple-bayesian-bucket/bucket.yaml
@ -0,0 +1,21 @@
 type: bayesian
 name: test/multiple-bayesian
 debug: true
 description: "bayesian bucket"
 filter: "evt.Meta.log_type == 'http_access-log' || evt.Meta.log_type == 'ssh_access-log'"
 groupby: evt.Meta.source_ip
 bayesian_prior: 0.5
 bayesian_threshold: 0.8
 bayesian_conditions:
 - condition: evt.Meta.http_path == "/"
  prob_given_evil: 0.8
  prob_given_benign: 0.2
  guillotine : true
 - condition: evt.Meta.ssh_user == "admin"
  prob_given_evil: 0.9
  prob_given_benign: 0.5
  guillotine : true
 leakspeed: 30s
 capacity: -1
 labels:
  type: overflow_1
--- a/pkg/leakybucket/tests/multiple-bayesian-bucket/scenarios.yaml
+++ b/pkg/leakybucket/tests/multiple-bayesian-bucket/scenarios.yaml
@ -0,0 +1 @@
 - filename: {{.TestDirectory}}/bucket.yaml
--- a/pkg/leakybucket/tests/multiple-bayesian-bucket/test.json
+++ b/pkg/leakybucket/tests/multiple-bayesian-bucket/test.json
@ -0,0 +1,64 @@
 {
 	"lines": [
 	   {
 		  "Line": {
 			 "Labels": {
 				"type": "nginx"
 			 },
 			 "Raw": "don't care"
 		  },
 		  "MarshaledTime": "2020-01-01T10:00:00.000Z",
 		  "Meta": {
 			 "source_ip": "2a00:1450:4007:816::200e",
 			 "log_type": "http_access-log",
 			 "http_path": "/"
 		  }
 	   },
 	   {
 		"Line": {
 		   "Labels": {
 			  "type": "nginx"
 		   },
 		   "Raw": "don't care"
 		},
 		"MarshaledTime": "2020-01-01T10:00:00.000Z",
 		"Meta": {
 		   "source_ip": "1.2.3.4",
 		   "log_type": "ssh_access-log",
 		   "ssh_user": "admin"
 		}
 	 },
 	 {
 	  "Line": {
 		 "Labels": {
 			"type": "nginx"
 		 },
 		 "Raw": "don't care"
 	  },
 	  "MarshaledTime": "2020-01-01T10:00:00.000Z",
 	  "Meta": {
 		 "source_ip": "2a00:1450:4007:816::200e",
 		 "log_type": "ssh_access-log",
 		 "ssh_user": "admin"
 	  }
   }
 	],
 	"results": [
 	  {
 		"Type" : 1,
 		"Alert": {
 		  "sources" : {
 			"2a00:1450:4007:816::200e": {
 			  "ip": "2a00:1450:4007:816::200e",
 			  "scope": "Ip",
 			  "value": "2a00:1450:4007:816::200e"
 			}
 		  },
 		  "Alert" : {
 			"scenario": "test/multiple-bayesian",
 			"events_count": 2
 		  }
 		}
 	  }
 	]
  }
--- a/pkg/leakybucket/tests/simple-bayesian-bucket/bucket.yaml
+++ b/pkg/leakybucket/tests/simple-bayesian-bucket/bucket.yaml
@ -0,0 +1,19 @@
 type: bayesian
 name: test/simple-bayesian
 debug: true
 description: "bayesian bucket"
 filter: "evt.Meta.log_type == 'http_access-log' || evt.Meta.log_type == 'ssh_access-log'"
 groupby: evt.Meta.source_ip
 bayesian_prior: 0.5
 bayesian_threshold: 0.8
 bayesian_conditions:
 - condition: any(queue.Queue, {.Meta.http_path == "/"})
  prob_given_evil: 0.8
  prob_given_benign: 0.2
 - condition: any(queue.Queue, {.Meta.ssh_user == "admin"})
  prob_given_evil: 0.9
  prob_given_benign: 0.5
 leakspeed: 30s
 capacity: -1
 labels:
  type: overflow_1
--- a/pkg/leakybucket/tests/simple-bayesian-bucket/scenarios.yaml
+++ b/pkg/leakybucket/tests/simple-bayesian-bucket/scenarios.yaml
@ -0,0 +1 @@
 - filename: {{.TestDirectory}}/bucket.yaml
--- a/pkg/leakybucket/tests/simple-bayesian-bucket/test.json
+++ b/pkg/leakybucket/tests/simple-bayesian-bucket/test.json
@ -0,0 +1,50 @@
 {
 	"lines": [
 	   {
 		  "Line": {
 			 "Labels": {
 				"type": "nginx"
 			 },
 			 "Raw": "don't care"
 		  },
 		  "MarshaledTime": "2020-01-01T10:00:00.000Z",
 		  "Meta": {
 			 "source_ip": "2a00:1450:4007:816::200e",
 			 "log_type": "http_access-log",
 			 "http_path": "/"
 		  }
 	   },
 	   {
 		"Line": {
 		   "Labels": {
 			  "type": "nginx"
 		   },
 		   "Raw": "don't care"
 		},
 		"MarshaledTime": "2020-01-01T10:00:00.000Z",
 		"Meta": {
 		   "source_ip": "2a00:1450:4007:816::200e",
 		   "log_type": "ssh_access-log",
 		   "ssh_user": "admin"
 		}
 	 }
 	],
 	"results": [
 	  {
 		"Type" : 1,
 		"Alert": {
 		  "sources" : {
 			"2a00:1450:4007:816::200e": {
 			  "ip": "2a00:1450:4007:816::200e",
 			  "scope": "Ip",
 			  "value": "2a00:1450:4007:816::200e"
 			}
 		  },
 		  "Alert" : {
 			"scenario": "test/simple-bayesian",
 			"events_count": 2
 		  }
 		}
 	  }
 	]
  }