linstrom/storage-new/cleaners/retryFailedRequests.go
mstar 7ac4c628b8
Work on failed outbound requests and move type migrators
- DB type migrators are now in separate file, in preparation for full
  custom sql migration statements
- Start work on handling failed outbound requests stored in the db
2025-07-03 13:57:30 +02:00

113 lines
3.6 KiB
Go

package cleaners
import (
"net/http"
"strings"
"sync"
"time"
"github.com/rs/zerolog/log"
"golang.org/x/sync/errgroup"
"gorm.io/gen"
"git.mstar.dev/mstar/linstrom/storage-new/dbgen"
"git.mstar.dev/mstar/linstrom/storage-new/models"
webshared "git.mstar.dev/mstar/linstrom/web/shared"
)
const maxFailedRequestsBeforeDeath = 10
var (
reqErrStrUnknownHost = "no such host"
)
func init() {
cleanerBuilders = append(cleanerBuilders, buildRetryRequests)
}
func TickRetryRequests(now time.Time) {
batchResults := []*models.FailedOutboundRequest{}
fo := dbgen.FailedOutboundRequest
idsToDelete := []uint64{}
var idsDeleteLock sync.Mutex
err := fo.Preload(fo.TargetServer, fo.ActingUser).
// Join with server data to exclude dead servers
LeftJoin(dbgen.RemoteServer, dbgen.RemoteServer.ID.EqCol(fo.TargetServerId)).
Where(dbgen.RemoteServer.IsDead.Is(false)).
Order(fo.Id.Asc()).
FindInBatches(&batchResults, 50, func(tx gen.Dao, batch int) error {
var g errgroup.Group
for _, failedRequest := range batchResults {
g.Go(func() (err error) {
defer func() {
failedRequest.NrOfAttempts += 1
err = dbgen.FailedOutboundRequest.Save(failedRequest)
if err != nil {
return
}
if failedRequest.NrOfAttempts >= maxFailedRequestsBeforeDeath {
_, err = dbgen.RemoteServer.Where(dbgen.RemoteServer.ID.Eq(failedRequest.TargetServerId)).
UpdateColumn(dbgen.RemoteServer.IsDead, true)
}
}()
var res *http.Response
res, _, err = webshared.RequestSigned(
"POST",
failedRequest.Target,
failedRequest.RawData,
failedRequest.ActingUser,
)
if err != nil {
failedRequest.NrOfAttempts += 1
errString := err.Error()
// FIXME: Use the actual error types instead of error string
// Using substring matching is awful and probably not reliable. Using error type is likely more reliable
if strings.Contains(errString, reqErrStrUnknownHost) {
failedRequest.LastFailureReason = string(
models.RequestFailureRequestError,
)
} else {
failedRequest.LastFailureReason = string(models.RequestFailureRequestError)
}
return
}
if res.StatusCode < 400 {
idsDeleteLock.Lock()
idsToDelete = append(idsToDelete, failedRequest.Id)
idsDeleteLock.Unlock()
// Defer func will always add one (to make the expected failure case easier)
// Sub one here to prevent a potential server kill if it was at maxFailedRequestsBeforeDeath-1 failed requests before
failedRequest.NrOfAttempts -= 1
return nil
}
switch res.StatusCode {
case http.StatusInternalServerError:
failedRequest.LastFailureReason = string(models.RequestFailureInternalError)
case http.StatusForbidden:
failedRequest.LastFailureReason = string(models.RequestFailureRejected)
case http.StatusTooManyRequests:
// TODO: Check Timeout headers and write apropriate message
}
return nil
})
}
return nil
})
if err != nil {
log.Error().Err(err).Msg("Failed to batch-process all failed outbound requests")
}
_, err = fo.Where(fo.Id.In(idsToDelete...)).Delete()
if err != nil {
log.Error().Err(err).Msg("Failed to batch-delete all successful retries")
}
err = dbgen.FailedOutboundRequest.KillServers(maxFailedRequestsBeforeDeath)
if err != nil {
log.Error().Err(err).Msg("Failed to kill all servers with too many failed requests")
}
}
func buildRetryRequests() (onTick func(time.Time), name string, tickSpeed time.Duration) {
return TickRetryRequests, "retry-requests", time.Hour
}