Work on failed outbound requests and move type migrators
- DB type migrators are now in a separate file, in preparation for fully custom SQL migration statements - Start work on handling failed outbound requests stored in the DB
This commit is contained in:
parent
81a01fbf8b
commit
7ac4c628b8
13 changed files with 372 additions and 145 deletions
30
storage-new/cleaners/killDeadServers.go
Normal file
30
storage-new/cleaners/killDeadServers.go
Normal file
|
@ -0,0 +1,30 @@
|
|||
package cleaners
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/rs/zerolog/log"
|
||||
|
||||
"git.mstar.dev/mstar/linstrom/storage-new/dbgen"
|
||||
)
|
||||
|
||||
const maxServerAge = time.Hour * 24 * 30 // One month
|
||||
|
||||
func init() {
|
||||
cleanerBuilders = append(cleanerBuilders, buildKillDeadServers)
|
||||
}
|
||||
|
||||
// Marks all servers where the last interaction time is older than maxServerAge
|
||||
func tickKillDeadServers(now time.Time) {
|
||||
_, err := dbgen.RemoteServer.Where(dbgen.RemoteServer.LastInteraction.Lt(now.Add(-maxServerAge)), dbgen.RemoteServer.IsSelf.Is(false)).
|
||||
UpdateColumn(dbgen.RemoteServer.IsDead, true)
|
||||
if err != nil {
|
||||
log.Error().
|
||||
Err(err).
|
||||
Msg("Failed to mark servers without interaction for over a 30 days as dead")
|
||||
}
|
||||
}
|
||||
|
||||
func buildKillDeadServers() (onTick func(time.Time), name string, tickSpeed time.Duration) {
|
||||
return tickKillDeadServers, "kill-dead-servers", time.Hour
|
||||
}
|
113
storage-new/cleaners/retryFailedRequests.go
Normal file
113
storage-new/cleaners/retryFailedRequests.go
Normal file
|
@ -0,0 +1,113 @@
|
|||
package cleaners
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/rs/zerolog/log"
|
||||
"golang.org/x/sync/errgroup"
|
||||
"gorm.io/gen"
|
||||
|
||||
"git.mstar.dev/mstar/linstrom/storage-new/dbgen"
|
||||
"git.mstar.dev/mstar/linstrom/storage-new/models"
|
||||
webshared "git.mstar.dev/mstar/linstrom/web/shared"
|
||||
)
|
||||
|
||||
const maxFailedRequestsBeforeDeath = 10
|
||||
|
||||
var (
|
||||
reqErrStrUnknownHost = "no such host"
|
||||
)
|
||||
|
||||
func init() {
|
||||
cleanerBuilders = append(cleanerBuilders, buildRetryRequests)
|
||||
}
|
||||
|
||||
func TickRetryRequests(now time.Time) {
|
||||
batchResults := []*models.FailedOutboundRequest{}
|
||||
fo := dbgen.FailedOutboundRequest
|
||||
idsToDelete := []uint64{}
|
||||
var idsDeleteLock sync.Mutex
|
||||
err := fo.Preload(fo.TargetServer, fo.ActingUser).
|
||||
// Join with server data to exclude dead servers
|
||||
LeftJoin(dbgen.RemoteServer, dbgen.RemoteServer.ID.EqCol(fo.TargetServerId)).
|
||||
Where(dbgen.RemoteServer.IsDead.Is(false)).
|
||||
Order(fo.Id.Asc()).
|
||||
FindInBatches(&batchResults, 50, func(tx gen.Dao, batch int) error {
|
||||
var g errgroup.Group
|
||||
for _, failedRequest := range batchResults {
|
||||
g.Go(func() (err error) {
|
||||
defer func() {
|
||||
failedRequest.NrOfAttempts += 1
|
||||
err = dbgen.FailedOutboundRequest.Save(failedRequest)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if failedRequest.NrOfAttempts >= maxFailedRequestsBeforeDeath {
|
||||
_, err = dbgen.RemoteServer.Where(dbgen.RemoteServer.ID.Eq(failedRequest.TargetServerId)).
|
||||
UpdateColumn(dbgen.RemoteServer.IsDead, true)
|
||||
}
|
||||
}()
|
||||
var res *http.Response
|
||||
res, _, err = webshared.RequestSigned(
|
||||
"POST",
|
||||
failedRequest.Target,
|
||||
failedRequest.RawData,
|
||||
failedRequest.ActingUser,
|
||||
)
|
||||
if err != nil {
|
||||
failedRequest.NrOfAttempts += 1
|
||||
errString := err.Error()
|
||||
// FIXME: Use the actual error types instead of error string
|
||||
// Using substring matching is awful and probably not reliable. Using error type is likely more reliable
|
||||
if strings.Contains(errString, reqErrStrUnknownHost) {
|
||||
failedRequest.LastFailureReason = string(
|
||||
models.RequestFailureRequestError,
|
||||
)
|
||||
} else {
|
||||
failedRequest.LastFailureReason = string(models.RequestFailureRequestError)
|
||||
}
|
||||
return
|
||||
}
|
||||
if res.StatusCode < 400 {
|
||||
idsDeleteLock.Lock()
|
||||
idsToDelete = append(idsToDelete, failedRequest.Id)
|
||||
idsDeleteLock.Unlock()
|
||||
// Defer func will always add one (to make the expected failure case easier)
|
||||
// Sub one here to prevent a potential server kill if it was at maxFailedRequestsBeforeDeath-1 failed requests before
|
||||
failedRequest.NrOfAttempts -= 1
|
||||
return nil
|
||||
}
|
||||
|
||||
switch res.StatusCode {
|
||||
case http.StatusInternalServerError:
|
||||
failedRequest.LastFailureReason = string(models.RequestFailureInternalError)
|
||||
case http.StatusForbidden:
|
||||
failedRequest.LastFailureReason = string(models.RequestFailureRejected)
|
||||
case http.StatusTooManyRequests:
|
||||
// TODO: Check Timeout headers and write apropriate message
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("Failed to batch-process all failed outbound requests")
|
||||
}
|
||||
_, err = fo.Where(fo.Id.In(idsToDelete...)).Delete()
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("Failed to batch-delete all successful retries")
|
||||
}
|
||||
err = dbgen.FailedOutboundRequest.KillServers(maxFailedRequestsBeforeDeath)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("Failed to kill all servers with too many failed requests")
|
||||
}
|
||||
}
|
||||
|
||||
func buildRetryRequests() (onTick func(time.Time), name string, tickSpeed time.Duration) {
|
||||
return TickRetryRequests, "retry-requests", time.Hour
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue