package cleaners

import (
	"net/http"
	"strings"
	"sync"
	"time"

	"github.com/rs/zerolog/log"
	"golang.org/x/sync/errgroup"
	"gorm.io/gen"

	"git.mstar.dev/mstar/linstrom/storage-new/dbgen"
	"git.mstar.dev/mstar/linstrom/storage-new/models"
	webshared "git.mstar.dev/mstar/linstrom/web/shared"
)

// maxFailedRequestsBeforeDeath is the attempt count at which the target
// server of a failed request gets flagged as dead.
const maxFailedRequestsBeforeDeath = 10

var (
	// reqErrStrUnknownHost is the substring searched for in request errors
	// to spot a failed host lookup.
	reqErrStrUnknownHost = "no such host"
)

// init registers the retry-requests cleaner with the package's builder list.
func init() {
	cleanerBuilders = append(cleanerBuilders, buildRetryRequests)
}

// TickRetryRequests re-sends every stored failed outbound request whose
// target server is not flagged as dead.
//
// Requests are loaded in batches of 50 and retried concurrently within a
// batch. Requests answered with a status code below 400 are deleted
// afterwards; all others get their attempt counter and failure reason
// updated. Finally KillServers is invoked with maxFailedRequestsBeforeDeath.
//
// All errors are logged, none are returned. The now parameter is unused.
func TickRetryRequests(now time.Time) {
	batchResults := []*models.FailedOutboundRequest{}
	fo := dbgen.FailedOutboundRequest
	var idsToDelete []uint64
	var idsDeleteLock sync.Mutex
	err := fo.Preload(fo.TargetServer, fo.ActingUser).
		// Join with server data to exclude dead servers
		LeftJoin(dbgen.RemoteServer, dbgen.RemoteServer.ID.EqCol(fo.TargetServerId)).
		Where(dbgen.RemoteServer.IsDead.Is(false)).
		Order(fo.Id.Asc()).
		FindInBatches(&batchResults, 50, func(tx gen.Dao, batch int) error {
			var g errgroup.Group
			for _, failedRequest := range batchResults {
				// Per-iteration copy: redundant since Go 1.22, but keeps the
				// closure correct on older toolchains too.
				failedRequest := failedRequest
				g.Go(func() error {
					delivered, err := retryFailedRequest(failedRequest)
					if delivered {
						idsDeleteLock.Lock()
						idsToDelete = append(idsToDelete, failedRequest.Id)
						idsDeleteLock.Unlock()
					}
					return err
				})
			}
			// The workers must be done before this callback returns:
			// batchResults is refilled for the next batch and idsToDelete is
			// read once FindInBatches is finished.
			if err := g.Wait(); err != nil {
				// Log and carry on so one failing request does not stop the
				// remaining batches from being retried.
				log.Error().
					Err(err).
					Int("batch", batch).
					Msg("Failed to persist the retry result of a failed outbound request")
			}
			return nil
		})
	if err != nil {
		log.Error().Err(err).Msg("Failed to batch-process all failed outbound requests")
	}
	// Nothing succeeded, nothing to delete: skip the query entirely.
	if len(idsToDelete) > 0 {
		_, err = fo.Where(fo.Id.In(idsToDelete...)).Delete()
		if err != nil {
			log.Error().Err(err).Msg("Failed to batch-delete all successful retries")
		}
	}
	err = dbgen.FailedOutboundRequest.KillServers(maxFailedRequestsBeforeDeath)
	if err != nil {
		log.Error().Err(err).Msg("Failed to kill all servers with too many failed requests")
	}
}

// retryFailedRequest re-sends one failed outbound request as a signed POST
// and persists the outcome on the request record.
//
// delivered is true when the target answered with a status code below 400.
// In every other case the attempt counter is increased by exactly one, and
// once it reaches maxFailedRequestsBeforeDeath the target server is flagged
// as dead. The returned error only reports failures of that persistence
// step; a failed HTTP request is recorded on the record, not returned.
func retryFailedRequest(
	failedRequest *models.FailedOutboundRequest,
) (delivered bool, err error) {
	defer func() {
		// Single place where attempts are counted, so a failure is never
		// counted twice and a success never counts at all.
		if !delivered {
			failedRequest.NrOfAttempts += 1
		}
		err = dbgen.FailedOutboundRequest.Save(failedRequest)
		if err != nil {
			return
		}
		if failedRequest.NrOfAttempts >= maxFailedRequestsBeforeDeath {
			_, err = dbgen.RemoteServer.Where(dbgen.RemoteServer.ID.Eq(failedRequest.TargetServerId)).
				UpdateColumn(dbgen.RemoteServer.IsDead, true)
		}
	}()

	var res *http.Response
	res, _, err = webshared.RequestSigned(
		"POST",
		failedRequest.Target,
		failedRequest.RawData,
		failedRequest.ActingUser,
	)
	if err != nil {
		errString := err.Error()
		// FIXME: Use the actual error types instead of error string
		// Using substring matching is awful and probably not reliable.
		// Using error type is likely more reliable
		// NOTE(review): both branches currently record the same reason;
		// presumably the unknown-host case should get its own value.
		if strings.Contains(errString, reqErrStrUnknownHost) {
			failedRequest.LastFailureReason = string(
				models.RequestFailureRequestError,
			)
		} else {
			failedRequest.LastFailureReason = string(models.RequestFailureRequestError)
		}
		// The deferred bookkeeping replaces err with the result of the save.
		return false, nil
	}
	// Release the connection. Runs before the deferred bookkeeping above.
	defer res.Body.Close()

	if res.StatusCode < 400 {
		return true, nil
	}
	// Any status code not listed here leaves LastFailureReason untouched.
	switch res.StatusCode {
	case http.StatusInternalServerError:
		failedRequest.LastFailureReason = string(models.RequestFailureInternalError)
	case http.StatusForbidden:
		failedRequest.LastFailureReason = string(models.RequestFailureRejected)
	case http.StatusTooManyRequests:
		// TODO: Check Timeout headers and write appropriate message
	}
	return false, nil
}

// buildRetryRequests describes the retry-requests cleaner: its tick
// function, its name and an interval of one hour.
func buildRetryRequests() (onTick func(time.Time), name string, tickSpeed time.Duration) {
	return TickRetryRequests, "retry-requests", time.Hour
}