// Miek Gieben authored
// scheduler.go 2.51 KiB
package main
import (
"context"
"math/rand"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"go.science.ru.nl/cmd/repaird/backoff"
"go.science.ru.nl/cmd/repaird/repair"
"go.science.ru.nl/log"
"go.science.ru.nl/promfmt"
)
// metricLastRunTimestamp is set by Run to the current Unix time at the start
// of every scheduler pass.
var metricLastRunTimestamp = promauto.NewGauge(
	prometheus.GaugeOpts{
		Name: "repair_last_run_time_seconds",
		Help: "Epoch timestamp of the last run.",
	},
)

// repairBrokenTimesamp carries, per repairer name, a Unix timestamp related
// to brokenness; it is written by Run while it inspects terms.
var repairBrokenTimesamp = promauto.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "repair_broken_time_seconds",
		Help: "Epoch timestamp of brokenness for name.",
	},
	[]string{"name"},
)

// repairAttemptedTotal is incremented by Run, per repairer name, each time a
// repair is attempted.
var repairAttemptedTotal = promauto.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "repair_attempted_count_total",
		Help: "Gauge of total attempted repairs.",
	},
	[]string{"name"},
)

// repairFailedTotal is incremented by Run, per repairer name, each time a
// repair attempt returns an error.
var repairFailedTotal = promauto.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "repair_failed_count_total",
		Help: "Gauge of total failed repairs.",
	},
	[]string{"name"},
)
// Run is the scheduler loop: it wakes up every 60 seconds, waits a random
// jitter of 0-9 seconds, and then walks every repairer registered in
// repair.R. It returns when ctx is cancelled, either while waiting for the
// next tick or during the jitter wait.
//
// For each repairer it asks OfInterest for the terms to look at, consults the
// repairer's backoff (one backoff.B per repairer, created up front) and, when
// allowed, runs Detect on every term. A term for which Detect returns a
// non-nil value is handed to Repair; attempts and failures are counted in the
// repair_* metrics. After each pass, when *flagWrite is set, the repair_
// metrics are written to *flagPromFile; a write failure is fatal.
func Run(ctx context.Context) {
	tick := time.NewTicker(60 * time.Second)
	// Release the ticker once the loop exits.
	defer tick.Stop()

	// Seed a private generator once; re-seeding the global source on every
	// tick is unnecessary and rand.Seed is deprecated.
	jitter := rand.New(rand.NewSource(time.Now().UnixNano()))

	timings := make(map[repair.Repairer]*backoff.B, len(repair.R))
	for r := range repair.R {
		timings[r] = backoff.New(backoff.DefaultDuration)
	}

	for {
		select {
		case <-ctx.Done():
			return
		case <-tick.C:
		}

		// some jitter, interruptible so cancellation is not delayed by the wait
		wait := time.NewTimer(time.Duration(jitter.Intn(10)) * time.Second)
		select {
		case <-ctx.Done():
			wait.Stop()
			return
		case <-wait.C:
		}
		metricLastRunTimestamp.Set(float64(time.Now().Unix()))

		for r, name := range repair.R {
			log.Infof("Starting repair for: %q", name)
			terms := r.OfInterest()
			if len(terms) == 0 {
				log.Infof("Nothing found to repair for: %q", name)
			}

			if !timings[r].OK() {
				log.Infof("Ratelimiter disallows repair for: %q, next repair allowed after %s", name, timings[r].Next().String())
				continue
			}

			for _, t := range terms {
				broken := r.Detect(t)
				if broken == nil {
					continue
				}

				// keep setting brokeness while it is detected, so we can do
				// 'time() - this' and see if it's <30s. Only set on actual
				// brokenness; otherwise the metric would claim every inspected
				// name is broken right now.
				repairBrokenTimesamp.WithLabelValues(name).Set(float64(time.Now().Unix()))

				repairAttemptedTotal.WithLabelValues(name).Inc()
				log.Warningf("Found %q in need of repair for %s: %q", t, name, broken)
				if failed := r.Repair(t); failed != nil {
					log.Warningf("Failed to repair %q for %s: %q", t, name, failed)
					repairFailedTotal.WithLabelValues(name).Inc()
				}
				// Tell the backoff a repair attempt took place (successful or not).
				timings[r].Done()
			}
		}

		if *flagWrite {
			if err := promfmt.WriteFile(*flagPromFile, promfmt.NewPrefixFilter("repair_")); err != nil {
				log.Fatalf("Failed to write to prom file: %s", err)
			}
		}
	}
}