Add alerts and decisions metrics, LAPI and agent timing prom metrics (#1546)

This commit is contained in:
Thibault "bui" Koechlin 2022-06-22 11:14:34 +02:00 committed by GitHub
parent 0a39066f9d
commit a6ed08b239
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 265 additions and 12 deletions

View file

@ -124,6 +124,8 @@ func ShowPrometheus(url string) {
lapi_stats := map[string]map[string]int{} lapi_stats := map[string]map[string]int{}
lapi_machine_stats := map[string]map[string]map[string]int{} lapi_machine_stats := map[string]map[string]map[string]int{}
lapi_bouncer_stats := map[string]map[string]map[string]int{} lapi_bouncer_stats := map[string]map[string]map[string]int{}
decisions_stats := map[string]map[string]map[string]int{}
alerts_stats := map[string]int{}
for idx, fam := range result { for idx, fam := range result {
if !strings.HasPrefix(fam.Name, "cs_") { if !strings.HasPrefix(fam.Name, "cs_") {
@ -131,7 +133,11 @@ func ShowPrometheus(url string) {
} }
log.Tracef("round %d", idx) log.Tracef("round %d", idx)
for _, m := range fam.Metrics { for _, m := range fam.Metrics {
metric := m.(prom2json.Metric) metric, ok := m.(prom2json.Metric)
if !ok {
log.Debugf("failed to convert metric to prom2json.Metric")
continue
}
name, ok := metric.Labels["name"] name, ok := metric.Labels["name"]
if !ok { if !ok {
log.Debugf("no name in Metric %v", metric.Labels) log.Debugf("no name in Metric %v", metric.Labels)
@ -152,6 +158,10 @@ func ShowPrometheus(url string) {
route := metric.Labels["route"] route := metric.Labels["route"]
method := metric.Labels["method"] method := metric.Labels["method"]
reason := metric.Labels["reason"]
origin := metric.Labels["origin"]
action := metric.Labels["action"]
fval, err := strconv.ParseFloat(value, 32) fval, err := strconv.ParseFloat(value, 32)
if err != nil { if err != nil {
log.Errorf("Unexpected int value %s : %s", value, err) log.Errorf("Unexpected int value %s : %s", value, err)
@ -254,6 +264,19 @@ func ShowPrometheus(url string) {
x.NonEmpty += ival x.NonEmpty += ival
} }
lapi_decisions_stats[bouncer] = x lapi_decisions_stats[bouncer] = x
case "cs_active_decisions":
if _, ok := decisions_stats[reason]; !ok {
decisions_stats[reason] = make(map[string]map[string]int)
}
if _, ok := decisions_stats[reason][origin]; !ok {
decisions_stats[reason][origin] = make(map[string]int)
}
decisions_stats[reason][origin][action] += ival
case "cs_alerts":
/*if _, ok := alerts_stats[scenario]; !ok {
alerts_stats[scenario] = make(map[string]int)
}*/
alerts_stats[reason] += ival
default: default:
continue continue
} }
@ -329,6 +352,30 @@ func ShowPrometheus(url string) {
} }
} }
decisionsTable := tablewriter.NewWriter(os.Stdout)
decisionsTable.SetHeader([]string{"Reason", "Origin", "Action", "Count"})
for reason, origins := range decisions_stats {
for origin, actions := range origins {
for action, hits := range actions {
row := []string{}
row = append(row, reason)
row = append(row, origin)
row = append(row, action)
row = append(row, fmt.Sprintf("%d", hits))
decisionsTable.Append(row)
}
}
}
alertsTable := tablewriter.NewWriter(os.Stdout)
alertsTable.SetHeader([]string{"Reason", "Count"})
for scenario, hits := range alerts_stats {
row := []string{}
row = append(row, scenario)
row = append(row, fmt.Sprintf("%d", hits))
alertsTable.Append(row)
}
if bucketsTable.NumLines() > 0 { if bucketsTable.NumLines() > 0 {
log.Printf("Buckets Metrics:") log.Printf("Buckets Metrics:")
bucketsTable.SetAlignment(tablewriter.ALIGN_LEFT) bucketsTable.SetAlignment(tablewriter.ALIGN_LEFT)
@ -366,8 +413,20 @@ func ShowPrometheus(url string) {
lapiDecisionsTable.Render() lapiDecisionsTable.Render()
} }
if decisionsTable.NumLines() > 0 {
log.Printf("Local Api Decisions:")
decisionsTable.SetAlignment(tablewriter.ALIGN_LEFT)
decisionsTable.Render()
}
if alertsTable.NumLines() > 0 {
log.Printf("Local Api Alerts:")
alertsTable.SetAlignment(tablewriter.ALIGN_LEFT)
alertsTable.Render()
}
} else if csConfig.Cscli.Output == "json" { } else if csConfig.Cscli.Output == "json" {
for _, val := range []interface{}{acquis_stats, parsers_stats, buckets_stats, lapi_stats, lapi_bouncer_stats, lapi_machine_stats, lapi_decisions_stats} { for _, val := range []interface{}{acquis_stats, parsers_stats, buckets_stats, lapi_stats, lapi_bouncer_stats, lapi_machine_stats, lapi_decisions_stats, decisions_stats, alerts_stats} {
x, err := json.MarshalIndent(val, "", " ") x, err := json.MarshalIndent(val, "", " ")
if err != nil { if err != nil {
log.Fatalf("failed to unmarshal metrics : %v", err) log.Fatalf("failed to unmarshal metrics : %v", err)
@ -375,7 +434,7 @@ func ShowPrometheus(url string) {
fmt.Printf("%s\n", string(x)) fmt.Printf("%s\n", string(x))
} }
} else if csConfig.Cscli.Output == "raw" { } else if csConfig.Cscli.Output == "raw" {
for _, val := range []interface{}{acquis_stats, parsers_stats, buckets_stats, lapi_stats, lapi_bouncer_stats, lapi_machine_stats, lapi_decisions_stats} { for _, val := range []interface{}{acquis_stats, parsers_stats, buckets_stats, lapi_stats, lapi_bouncer_stats, lapi_machine_stats, lapi_decisions_stats, decisions_stats, alerts_stats} {
x, err := yaml.Marshal(val) x, err := yaml.Marshal(val)
if err != nil { if err != nil {
log.Fatalf("failed to unmarshal metrics : %v", err) log.Fatalf("failed to unmarshal metrics : %v", err)

View file

@ -6,6 +6,7 @@ import (
v1 "github.com/crowdsecurity/crowdsec/pkg/apiserver/controllers/v1" v1 "github.com/crowdsecurity/crowdsec/pkg/apiserver/controllers/v1"
"github.com/crowdsecurity/crowdsec/pkg/csconfig" "github.com/crowdsecurity/crowdsec/pkg/csconfig"
"github.com/crowdsecurity/crowdsec/pkg/cwversion" "github.com/crowdsecurity/crowdsec/pkg/cwversion"
"github.com/crowdsecurity/crowdsec/pkg/database"
leaky "github.com/crowdsecurity/crowdsec/pkg/leakybucket" leaky "github.com/crowdsecurity/crowdsec/pkg/leakybucket"
"github.com/crowdsecurity/crowdsec/pkg/parser" "github.com/crowdsecurity/crowdsec/pkg/parser"
"github.com/crowdsecurity/crowdsec/pkg/types" "github.com/crowdsecurity/crowdsec/pkg/types"
@ -62,6 +63,81 @@ var globalCsInfo = prometheus.NewGauge(
}, },
) )
// globalActiveDecisions exposes the number of currently active decisions,
// labeled by triggering scenario ("reason"), decision origin and decision
// type ("action"). It is refreshed from the database at scrape time (see
// computeDynamicMetrics).
var globalActiveDecisions = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "cs_active_decisions",
		Help: "Number of active decisions.",
	},
	[]string{"reason", "origin", "action"},
)

// globalAlerts exposes the alert count per scenario ("reason"), excluding
// CAPI-originated alerts. Also refreshed from the database at scrape time.
var globalAlerts = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "cs_alerts",
		Help: "Number of alerts (excluding CAPI).",
	},
	[]string{"reason"},
)

// globalParsingHistogram measures per-line parsing latency, labeled by
// acquisition type and source. Buckets are sub-millisecond-centric since
// parsing a single line is expected to be very fast.
var globalParsingHistogram = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Help:    "Time spent parsing a line",
		Name:    "cs_parsing_time_seconds",
		Buckets: []float64{0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.004, 0.005, 0.0075, 0.01},
	},
	[]string{"type", "source"},
)

// globalPourHistogram measures the time spent pouring one event into the
// leaky buckets, labeled by acquisition type and source.
var globalPourHistogram = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    "cs_bucket_pour_seconds",
		Help:    "Time spent pouring an event to buckets.",
		Buckets: []float64{0.001, 0.002, 0.005, 0.01, 0.015, 0.02, 0.03, 0.04, 0.05},
	},
	[]string{"type", "source"},
)
// computeDynamicMetrics wraps the prometheus HTTP handler so that the
// decision and alert gauges are refreshed from the database on every scrape.
// When no database client is available (e.g. agent-only mode), the wrapped
// handler is served untouched. On any query error the handler logs and
// serves the previous values instead of failing the scrape.
func computeDynamicMetrics(next http.Handler, dbClient *database.Client) http.HandlerFunc {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if dbClient == nil {
			next.ServeHTTP(w, r)
			return
		}

		decisionsFilters := make(map[string][]string)
		decisions, err := dbClient.QueryDecisionCountByScenario(decisionsFilters)
		if err != nil {
			log.Errorf("Error querying decisions for metrics: %v", err)
			next.ServeHTTP(w, r)
			return
		}

		globalActiveDecisions.Reset()
		for _, d := range decisions {
			globalActiveDecisions.With(prometheus.Labels{"reason": d.Scenario, "origin": d.Origin, "action": d.Type}).Set(float64(d.Count))
		}

		// CAPI alerts are excluded to keep this metric focused on local activity.
		alertsFilter := map[string][]string{
			"include_capi": {"false"},
		}

		alerts, err := dbClient.AlertsCountPerScenario(alertsFilter)
		if err != nil {
			log.Errorf("Error querying alerts for metrics: %v", err)
			next.ServeHTTP(w, r)
			return
		}

		// Reset only after the query succeeded, so a transient DB error does
		// not wipe the previously exported alert counts.
		globalAlerts.Reset()
		for k, v := range alerts {
			globalAlerts.With(prometheus.Labels{"reason": k}).Set(float64(v))
		}

		next.ServeHTTP(w, r)
	})
}
func registerPrometheus(config *csconfig.PrometheusCfg) { func registerPrometheus(config *csconfig.PrometheusCfg) {
if !config.Enabled { if !config.Enabled {
return return
@ -75,13 +151,12 @@ func registerPrometheus(config *csconfig.PrometheusCfg) {
config.ListenPort = 6060 config.ListenPort = 6060
} }
defer types.CatchPanic("crowdsec/registerPrometheus")
/*Registering prometheus*/ /*Registering prometheus*/
/*If in aggregated mode, do not register events associated to a source, keeps cardinality low*/ /*If in aggregated mode, do not register events associated to a source, keeps cardinality low*/
if config.Level == "aggregated" { if config.Level == "aggregated" {
log.Infof("Loading aggregated prometheus collectors") log.Infof("Loading aggregated prometheus collectors")
prometheus.MustRegister(globalParserHits, globalParserHitsOk, globalParserHitsKo, prometheus.MustRegister(globalParserHits, globalParserHitsOk, globalParserHitsKo,
globalCsInfo, globalCsInfo, globalParsingHistogram, globalPourHistogram,
leaky.BucketsUnderflow, leaky.BucketsCanceled, leaky.BucketsInstanciation, leaky.BucketsOverflow, leaky.BucketsUnderflow, leaky.BucketsCanceled, leaky.BucketsInstanciation, leaky.BucketsOverflow,
v1.LapiRouteHits, v1.LapiRouteHits,
leaky.BucketsCurrentCount) leaky.BucketsCurrentCount)
@ -89,12 +164,22 @@ func registerPrometheus(config *csconfig.PrometheusCfg) {
log.Infof("Loading prometheus collectors") log.Infof("Loading prometheus collectors")
prometheus.MustRegister(globalParserHits, globalParserHitsOk, globalParserHitsKo, prometheus.MustRegister(globalParserHits, globalParserHitsOk, globalParserHitsKo,
parser.NodesHits, parser.NodesHitsOk, parser.NodesHitsKo, parser.NodesHits, parser.NodesHitsOk, parser.NodesHitsKo,
globalCsInfo, globalCsInfo, globalParsingHistogram, globalPourHistogram,
v1.LapiRouteHits, v1.LapiMachineHits, v1.LapiBouncerHits, v1.LapiNilDecisions, v1.LapiNonNilDecisions, v1.LapiRouteHits, v1.LapiMachineHits, v1.LapiBouncerHits, v1.LapiNilDecisions, v1.LapiNonNilDecisions, v1.LapiResponseTime,
leaky.BucketsPour, leaky.BucketsUnderflow, leaky.BucketsCanceled, leaky.BucketsInstanciation, leaky.BucketsOverflow, leaky.BucketsCurrentCount) leaky.BucketsPour, leaky.BucketsUnderflow, leaky.BucketsCanceled, leaky.BucketsInstanciation, leaky.BucketsOverflow, leaky.BucketsCurrentCount,
globalActiveDecisions, globalAlerts)
} }
http.Handle("/metrics", promhttp.Handler()) }
func servePrometheus(config *csconfig.PrometheusCfg, dbClient *database.Client) {
if !config.Enabled {
return
}
defer types.CatchPanic("crowdsec/servePrometheus")
http.Handle("/metrics", computeDynamicMetrics(promhttp.Handler(), dbClient))
if err := http.ListenAndServe(fmt.Sprintf("%s:%d", config.ListenAddr, config.ListenPort), nil); err != nil { if err := http.ListenAndServe(fmt.Sprintf("%s:%d", config.ListenAddr, config.ListenPort), nil); err != nil {
log.Warningf("prometheus: %s", err) log.Warningf("prometheus: %s", err)
} }

View file

@ -1,6 +1,8 @@
package main package main
import ( import (
"time"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
@ -26,11 +28,14 @@ LOOP:
} }
globalParserHits.With(prometheus.Labels{"source": event.Line.Src, "type": event.Line.Module}).Inc() globalParserHits.With(prometheus.Labels{"source": event.Line.Src, "type": event.Line.Module}).Inc()
startParsing := time.Now()
/* parse the log using magic */ /* parse the log using magic */
parsed, err := parser.Parse(parserCTX, event, nodes) parsed, err := parser.Parse(parserCTX, event, nodes)
if err != nil { if err != nil {
log.Errorf("failed parsing : %v\n", err) log.Errorf("failed parsing : %v\n", err)
} }
elapsed := time.Since(startParsing)
globalParsingHistogram.With(prometheus.Labels{"source": event.Line.Src, "type": event.Line.Module}).Observe(elapsed.Seconds())
if !parsed.Process { if !parsed.Process {
globalParserHitsKo.With(prometheus.Labels{"source": event.Line.Src, "type": event.Line.Module}).Inc() globalParserHitsKo.With(prometheus.Labels{"source": event.Line.Src, "type": event.Line.Module}).Inc()
log.Debugf("Discarding line %+v", parsed) log.Debugf("Discarding line %+v", parsed)

View file

@ -7,6 +7,7 @@ import (
"github.com/crowdsecurity/crowdsec/pkg/csconfig" "github.com/crowdsecurity/crowdsec/pkg/csconfig"
leaky "github.com/crowdsecurity/crowdsec/pkg/leakybucket" leaky "github.com/crowdsecurity/crowdsec/pkg/leakybucket"
"github.com/crowdsecurity/crowdsec/pkg/types" "github.com/crowdsecurity/crowdsec/pkg/types"
"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
@ -21,6 +22,7 @@ func runPour(input chan types.Event, holders []leaky.BucketFactory, buckets *lea
log.Infof("Bucket routine exiting") log.Infof("Bucket routine exiting")
return nil return nil
case parsed := <-input: case parsed := <-input:
startTime := time.Now()
count++ count++
if count%5000 == 0 { if count%5000 == 0 {
log.Infof("%d existing buckets", leaky.LeakyRoutineCount) log.Infof("%d existing buckets", leaky.LeakyRoutineCount)
@ -45,6 +47,8 @@ func runPour(input chan types.Event, holders []leaky.BucketFactory, buckets *lea
log.Errorf("bucketify failed for: %v", parsed) log.Errorf("bucketify failed for: %v", parsed)
return fmt.Errorf("process of event failed : %v", err) return fmt.Errorf("process of event failed : %v", err)
} }
elapsed := time.Since(startTime)
globalPourHistogram.With(prometheus.Labels{"type": parsed.Line.Module, "source": parsed.Line.Src}).Observe(elapsed.Seconds())
if poured { if poured {
globalBucketPourOk.Inc() globalBucketPourOk.Inc()
} else { } else {

View file

@ -8,6 +8,7 @@ import (
"github.com/crowdsecurity/crowdsec/pkg/csconfig" "github.com/crowdsecurity/crowdsec/pkg/csconfig"
"github.com/crowdsecurity/crowdsec/pkg/cwversion" "github.com/crowdsecurity/crowdsec/pkg/cwversion"
"github.com/crowdsecurity/crowdsec/pkg/database"
"github.com/crowdsecurity/crowdsec/pkg/types" "github.com/crowdsecurity/crowdsec/pkg/types"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"github.com/sirupsen/logrus/hooks/writer" "github.com/sirupsen/logrus/hooks/writer"
@ -48,7 +49,18 @@ func StartRunSvc() error {
// Enable profiling early // Enable profiling early
if cConfig.Prometheus != nil { if cConfig.Prometheus != nil {
go registerPrometheus(cConfig.Prometheus) var dbClient *database.Client
var err error
if cConfig.DbConfig != nil {
dbClient, err = database.NewClient(cConfig.DbConfig)
if err != nil {
log.Fatalf("unable to create database client: %s", err)
}
}
registerPrometheus(cConfig.Prometheus)
go servePrometheus(cConfig.Prometheus, dbClient)
} }
return Serve(cConfig) return Serve(cConfig)
} }

View file

@ -6,6 +6,7 @@ import (
"github.com/crowdsecurity/crowdsec/pkg/csconfig" "github.com/crowdsecurity/crowdsec/pkg/csconfig"
"github.com/crowdsecurity/crowdsec/pkg/cwversion" "github.com/crowdsecurity/crowdsec/pkg/cwversion"
"github.com/crowdsecurity/crowdsec/pkg/database"
"github.com/crowdsecurity/crowdsec/pkg/types" "github.com/crowdsecurity/crowdsec/pkg/types"
"github.com/pkg/errors" "github.com/pkg/errors"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
@ -88,7 +89,18 @@ func WindowsRun() error {
// Enable profiling early // Enable profiling early
if cConfig.Prometheus != nil { if cConfig.Prometheus != nil {
go registerPrometheus(cConfig.Prometheus) var dbClient *database.Client
var err error
if cConfig.DbConfig != nil {
dbClient, err = database.NewClient(cConfig.DbConfig)
if err != nil {
log.Fatalf("unable to create database client: %s", err)
}
}
registerPrometheus(cConfig.Prometheus)
go servePrometheus(cConfig.Prometheus, dbClient)
} }
return Serve(cConfig) return Serve(cConfig)
} }

View file

@ -1,6 +1,8 @@
package v1 package v1
import ( import (
"time"
jwt "github.com/appleboy/gin-jwt/v2" jwt "github.com/appleboy/gin-jwt/v2"
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
@ -52,6 +54,14 @@ var LapiNonNilDecisions = prometheus.NewCounterVec(
[]string{"bouncer"}, []string{"bouncer"},
) )
// LapiResponseTime measures the local API response latency per endpoint and
// HTTP method. It is observed in PrometheusMiddleware around c.Next(), so it
// covers the full handler chain for each request.
var LapiResponseTime = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    "cs_lapi_request_duration_seconds",
		Help:    "Response time of LAPI",
		Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 1},
	},
	[]string{"endpoint", "method"})
func PrometheusBouncersHasEmptyDecision(c *gin.Context) { func PrometheusBouncersHasEmptyDecision(c *gin.Context) {
name, ok := c.Get("BOUNCER_NAME") name, ok := c.Get("BOUNCER_NAME")
if ok { if ok {
@ -99,9 +109,12 @@ func PrometheusBouncersMiddleware() gin.HandlerFunc {
func PrometheusMiddleware() gin.HandlerFunc { func PrometheusMiddleware() gin.HandlerFunc {
return func(c *gin.Context) { return func(c *gin.Context) {
startTime := time.Now()
LapiRouteHits.With(prometheus.Labels{ LapiRouteHits.With(prometheus.Labels{
"route": c.Request.URL.Path, "route": c.Request.URL.Path,
"method": c.Request.Method}).Inc() "method": c.Request.Method}).Inc()
c.Next() c.Next()
elapsed := time.Since(startTime)
LapiResponseTime.With(prometheus.Labels{"method": c.Request.Method, "endpoint": c.Request.URL.Path}).Observe(elapsed.Seconds())
} }
} }

View file

@ -1,6 +1,7 @@
package database package database
import ( import (
"context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"strconv" "strconv"
@ -722,6 +723,38 @@ func BuildAlertRequestFromFilter(alerts *ent.AlertQuery, filter map[string][]str
return alerts, nil return alerts, nil
} }
// AlertsCountPerScenario returns the number of alerts stored in the database,
// grouped by scenario name, after applying the given alert filters (same
// filter syntax as BuildAlertRequestFromFilter, e.g. "include_capi").
func (c *Client) AlertsCountPerScenario(filters map[string][]string) (map[string]int, error) {
	// Anonymous struct matching the GroupBy/Aggregate scan output.
	var res []struct {
		Scenario string
		Count    int
	}

	// NOTE(review): other Client methods use c.CTX; confirm whether a fresh
	// background context is intentional here.
	ctx := context.Background()

	query := c.Ent.Alert.Query()

	query, err := BuildAlertRequestFromFilter(query, filters)
	if err != nil {
		return nil, errors.Wrap(err, "failed to build alert request")
	}

	err = query.GroupBy(alert.FieldScenario).Aggregate(ent.Count()).Scan(ctx, &res)
	if err != nil {
		return nil, errors.Wrap(err, "failed to count alerts per scenario")
	}

	// Pre-size the result map: one entry per grouped row.
	counts := make(map[string]int, len(res))
	for _, r := range res {
		counts[r.Scenario] = r.Count
	}

	return counts, nil
}
func (c *Client) TotalAlerts() (int, error) { func (c *Client) TotalAlerts() (int, error) {
return c.Ent.Alert.Query().Count(c.CTX) return c.Ent.Alert.Query().Count(c.CTX)
} }

View file

@ -15,6 +15,13 @@ import (
"github.com/pkg/errors" "github.com/pkg/errors"
) )
// DecisionsByScenario holds one row of the decision count aggregation
// produced by QueryDecisionCountByScenario: the number of active decisions
// for a given (scenario, origin, type) triple.
type DecisionsByScenario struct {
	Scenario string // scenario that triggered the decisions
	Count    int    // number of active decisions for this group
	Origin   string // decision origin (e.g. crowdsec, cscli, CAPI)
	Type     string // decision type / remediation (e.g. ban, captcha)
}
func BuildDecisionRequestWithFilter(query *ent.DecisionQuery, filter map[string][]string) (*ent.DecisionQuery, []*sql.Predicate, error) { func BuildDecisionRequestWithFilter(query *ent.DecisionQuery, filter map[string][]string) (*ent.DecisionQuery, []*sql.Predicate, error) {
var err error var err error
@ -231,6 +238,29 @@ func (c *Client) QueryAllDecisionsWithFilters(filters map[string][]string) ([]*e
return data, nil return data, nil
} }
// QueryDecisionCountByScenario returns the count of decisions that have not
// yet expired (until > now), grouped by (scenario, origin, type), after
// applying the given decision filters. Errors are wrapped around the
// QueryFail sentinel so callers can match with errors.Is/Cause.
func (c *Client) QueryDecisionCountByScenario(filters map[string][]string) ([]*DecisionsByScenario, error) {
	// Only count decisions that are still active.
	query := c.Ent.Decision.Query().Where(
		decision.UntilGT(time.Now().UTC()),
	)

	query, _, err := BuildDecisionRequestWithFilter(query, filters)
	if err != nil {
		c.Log.Warningf("QueryDecisionCountByScenario : %s", err)
		// Distinct message per failure point so logs/errors identify the stage.
		return nil, errors.Wrap(QueryFail, "building decision count query")
	}

	var r []*DecisionsByScenario

	err = query.GroupBy(decision.FieldScenario, decision.FieldOrigin, decision.FieldType).Aggregate(ent.Count()).Scan(c.CTX, &r)
	if err != nil {
		c.Log.Warningf("QueryDecisionCountByScenario : %s", err)
		return nil, errors.Wrap(QueryFail, "counting decisions by scenario")
	}

	return r, nil
}
func (c *Client) QueryExpiredDecisionsWithFilters(filters map[string][]string) ([]*ent.Decision, error) { func (c *Client) QueryExpiredDecisionsWithFilters(filters map[string][]string) ([]*ent.Decision, error) {
now := time.Now().UTC() now := time.Now().UTC()
query := c.Ent.Decision.Query().Where( query := c.Ent.Decision.Query().Where(

View file

@ -31,7 +31,7 @@ declare stderr
run -1 --separate-stderr "${BIN_DIR}/crowdsec" run -1 --separate-stderr "${BIN_DIR}/crowdsec"
refute_output refute_output
run -0 echo "${stderr}" run -0 echo "${stderr}"
assert_output --partial "api server init: unable to run local API: unable to init database client: unknown database type 'meh'" assert_output --partial "unable to create database client: unknown database type 'meh'"
} }
@test "${FILE} CS_LAPI_SECRET not strong enough" { @test "${FILE} CS_LAPI_SECRET not strong enough" {