@@ -24,12 +24,10 @@ import (
2424 "context"
2525 "time"
2626
27- "github.com/arangodb/kube-arangodb/pkg/util/globals"
28-
2927 "github.com/arangodb/kube-arangodb/pkg/util/errors"
3028
3129 api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
32- "github.com/arangodb/kube-arangodb/pkg/util/arangod "
30+ "github.com/arangodb/kube-arangodb/pkg/deployment/agency "
3331)
3432
3533const (
@@ -74,10 +72,8 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error {
7472 if m .IsNotReadySince (time .Now ().Add (- notReadySinceGracePeriod )) {
7573 // Member has not been ready for longer than the grace period.
7674
77- failureAcceptable , reason , err := r .isMemberFailureAcceptable (ctx , group , m )
78- if err != nil {
79- log .Err (err ).Warn ("Failed to check is member failure is acceptable" )
80- } else if failureAcceptable {
75+ failureAcceptable , reason := r .isMemberFailureAcceptable (group , m )
76+ if failureAcceptable {
8177 log .Info ("Member is not ready for long time, marking is failed" )
8278 m .Phase = api .MemberPhaseFailed
8379 status .Members .Update (m , group )
@@ -93,10 +89,8 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error {
9389 count := m .RecentTerminationsSince (time .Now ().Add (- recentTerminationsSinceGracePeriod ))
9490 if count >= recentTerminationThreshold {
9591 // Member has terminated too often in recent history.
96- failureAcceptable , reason , err := r .isMemberFailureAcceptable (ctx , group , m )
97- if err != nil {
98- log .Err (err ).Warn ("Failed to check is member failure is acceptable" )
99- } else if failureAcceptable {
92+ failureAcceptable , reason := r .isMemberFailureAcceptable (group , m )
93+ if failureAcceptable {
10094 log .Info ("Member has terminated too often in recent history, marking is failed" )
10195 m .Phase = api .MemberPhaseFailed
10296 status .Members .Update (m , group )
@@ -123,42 +117,46 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error {
123117
124118// isMemberFailureAcceptable checks if it is currently acceptable to switch the phase of the given member
125119// to failed, which means that it will be replaced.
126- // Return: failureAcceptable, notAcceptableReason, error
127- func (r * Resilience ) isMemberFailureAcceptable (ctx context. Context , group api.ServerGroup , m api.MemberStatus ) (bool , string , error ) {
120+ // Return: failureAcceptable, notAcceptableReason
121+ func (r * Resilience ) isMemberFailureAcceptable (group api.ServerGroup , m api.MemberStatus ) (bool , string ) {
128122
129123 switch group {
130124 case api .ServerGroupAgents :
131125 agencyHealth , ok := r .context .GetAgencyHealth ()
132126 if ! ok {
133- return false , "AgencyHealth is not present" , nil
127+ return false , "AgencyHealth is not present"
134128 }
135129
136130 if err := agencyHealth .Healthy (); err != nil {
137- return false , err .Error (), nil
131+ return false , err .Error ()
138132 }
139133
140- return true , "" , nil
134+ return true , ""
141135 case api .ServerGroupDBServers :
142- ctxChild , cancel := globals .GetGlobalTimeouts ().ArangoD ().WithTimeout (ctx )
143- defer cancel ()
144- client , err := r .context .GetDatabaseClient (ctxChild )
145- if err != nil {
146- return false , "" , errors .WithStack (err )
136+ agencyState , ok := r .context .GetAgencyCache ()
137+ if ! ok {
138+ return false , "AgencyHealth is not present"
139+ }
140+
141+ if agencyState .Plan .Collections .IsDBServerPresent (agency .Server (m .ID )) {
142+ return false , "DBServer still in Plan"
147143 }
148- if err := arangod .IsDBServerEmpty (ctx , m .ID , client ); err != nil {
149- return false , err .Error (), nil
144+
145+ if agencyState .Current .Collections .IsDBServerPresent (agency .Server (m .ID )) {
146+ return false , "DBServer still in Current"
150147 }
151- return true , "" , nil
148+
149+ return true , ""
152150 case api .ServerGroupCoordinators :
153151 // Coordinators can be replaced at will
154- return true , "" , nil
152+ return true , ""
155153 case api .ServerGroupSyncMasters , api .ServerGroupSyncWorkers :
156154 // Sync masters & workers can be replaced at will
157- return true , "" , nil
155+ return true , ""
158156 case api .ServerGroupSingle :
159- return false , "ServerGroupSingle can not marked as a failed" , nil
157+ return false , "ServerGroupSingle can not marked as a failed"
160158 default :
161159 // TODO
162- return false , "TODO" , nil
160+ return false , "TODO"
163161 }
164162}
0 commit comments