// scheduler.go

package main

import (
	"context"
	"math/rand"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"go.science.ru.nl/cmd/repaird/backoff"
	"go.science.ru.nl/cmd/repaird/repair"
	"go.science.ru.nl/log"
	"go.science.ru.nl/promfmt"
)

// Prometheus metrics exported by the repair scheduler. All of them are
// registered through promauto when the package is initialised, and all share
// the "repair_" name prefix.
var (
	// metricLastRunTimestamp holds the Unix time at which Run most recently
	// started a pass over the repairers (set after the per-tick jitter).
	metricLastRunTimestamp = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "repair_last_run_time_seconds",
		Help: "Epoch timestamp of the last run.",
	})

	// repairBrokenTimesamp is labelled by repairer name. Run sets it to the
	// current Unix time after every Detect call it makes for that name.
	// NOTE(review): the identifier is misspelled ("Timesamp"); renaming it
	// requires updating every use in the package.
	repairBrokenTimesamp = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Name: "repair_broken_time_seconds",
		Help: "Epoch timestamp of brokenness for name.",
	}, []string{"name"})
	// repairAttemptedTotal is labelled by repairer name and incremented by Run
	// each time Detect reports a problem and a repair is attempted.
	// NOTE(review): this is a gauge that is only ever incremented and carries
	// a "_total" name, which Prometheus conventions reserve for counters —
	// confirm whether a CounterVec was intended.
	repairAttemptedTotal = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Name: "repair_attempted_count_total",
		Help: "Gauge of total attempted repairs.",
	}, []string{"name"})
	// repairFailedTotal is labelled by repairer name and incremented by Run
	// each time a Repair call returns a non-nil result.
	// NOTE(review): same gauge-versus-counter question as repairAttemptedTotal.
	repairFailedTotal = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Name: "repair_failed_count_total",
		Help: "Gauge of total failed repairs.",
	}, []string{"name"})
)

// Run drives the repair loop until ctx is cancelled.
//
// Every 60 seconds, plus zero to nine seconds of random jitter, it walks all
// repairers in repair.R. For each repairer it collects the terms of interest
// and, if that repairer's backoff allows it, calls Detect on every term and
// Repair on each term for which Detect returned a non-nil result. The repair_
// metrics are updated along the way and, when *flagWrite is set, written to
// *flagPromFile after every pass.
//
// Run returns as soon as ctx is done, including while it is waiting out the
// jitter delay; it does not interrupt a pass that is already in progress.
func Run(ctx context.Context) {
	tick := time.NewTicker(60 * time.Second)
	// Release the ticker's resources once we return on cancellation.
	defer tick.Stop()

	// A private generator seeded once: reseeding the global source on every
	// tick is unnecessary and rand.Seed is deprecated. Only this goroutine
	// uses rng, so no locking is needed.
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))

	// One backoff per repairer, created up front and kept for the lifetime of
	// the loop.
	timings := make(map[repair.Repairer]*backoff.B, len(repair.R))
	for r := range repair.R {
		timings[r] = backoff.New(backoff.DefaultDuration)
	}

	for {
		select {
		case <-ctx.Done():
			return
		case <-tick.C:
		}

		// Some jitter. Wait on a timer rather than time.Sleep so that a
		// cancelled context ends the wait immediately instead of triggering
		// one more full pass.
		jitter := time.NewTimer(time.Duration(rng.Intn(10)) * time.Second)
		select {
		case <-ctx.Done():
			jitter.Stop()
			return
		case <-jitter.C:
		}
		metricLastRunTimestamp.Set(float64(time.Now().Unix()))

		for r, name := range repair.R {
			log.Infof("Starting repair for: %q", name)
			terms := r.OfInterest()
			if len(terms) == 0 {
				// No early continue here: the rate-limiter check below still
				// runs (and logs), exactly as before.
				log.Infof("Nothing found to repair for: %q", name)
			}

			limiter := timings[r]
			if !limiter.OK() {
				log.Infof("Ratelimiter disallows repair for: %q, next repair allowed after %s", name, limiter.Next().String())
				continue
			}

			for _, t := range terms {
				broken := r.Detect(t)

				// Keep setting brokenness, so we can do 'time() - this' and
				// see if it's <30s.
				// NOTE(review): this is set for every term, also when Detect
				// found nothing wrong — confirm that is intended.
				repairBrokenTimesamp.WithLabelValues(name).Set(float64(time.Now().Unix()))
				if broken == nil {
					continue
				}

				repairAttemptedTotal.WithLabelValues(name).Inc()

				log.Warningf("Found %q in need of repair for %s: %q", t, name, broken)
				if failed := r.Repair(t); failed != nil {
					log.Warningf("Failed to repair %q for %s: %q", t, name, failed)

					repairFailedTotal.WithLabelValues(name).Inc()
				}

				// Record the attempt with the backoff, whether or not the
				// repair succeeded.
				limiter.Done()
			}
		}

		if *flagWrite {
			if err := promfmt.WriteFile(*flagPromFile, promfmt.NewPrefixFilter("repair_")); err != nil {
				log.Fatalf("Failed to write to prom file: %s", err)
			}
		}
	}
}