Commit 4cf6454

Merge pull request #280 from atlassian/aqasimi/fix-scale-down
fix scale down logic
2 parents 80740ea + 8d73b08 commit 4cf6454

File tree

2 files changed: +218 -28 lines changed

pkg/controller/controller.go

Lines changed: 12 additions & 12 deletions
@@ -253,8 +253,7 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 	metrics.NodeGroupNodesForceTainted.WithLabelValues(nodegroup).Set(float64(len(forceTaintedNodes)))
 	metrics.NodeGroupPods.WithLabelValues(nodegroup).Set(float64(len(pods)))
 
-	// We want to be really simple right now so we don't do anything if we are outside the range of allowed nodes
-	// We assume it is a config error or something bad has gone wrong in the cluster
+	// We dont need to handle the case where node count < minimum, but we do handle the case where node count > maximum
 
 	if len(allNodes) == 0 && len(pods) == 0 {
 		log.WithField("nodegroup", nodegroup).Info("no pods requests and remain 0 node for node group")
@@ -270,15 +269,6 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 		)
 		return 0, err
 	}
-	if len(allNodes) > nodeGroup.Opts.MaxNodes {
-		err = errors.New("node count larger than the maximum")
-		log.WithField("nodegroup", nodegroup).Warningf(
-			"Node count of %v larger than maximum of %v",
-			len(allNodes),
-			nodeGroup.Opts.MaxNodes,
-		)
-		return 0, err
-	}
 
 	// update the map of node to nodeinfo
 	// for working out which pods are on which nodes
@@ -343,7 +333,7 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 	}
 
 	// Metrics
-	log.WithField("nodegroup", nodegroup).Infof("cpu: %v, memory: %v", cpuPercent, memPercent)
+	log.WithField("nodegroup", nodegroup).Infof("cpu: %.2f%%, memory: %.2f%%", cpuPercent, memPercent)
 
 	// on the case that we're scaling up from 0, emit 0 as the metrics to keep metrics sane
 	if cpuPercent == math.MaxFloat64 || memPercent == math.MaxFloat64 {
@@ -407,6 +397,16 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 		nodesDelta = int(math.Max(float64(nodesDelta), 1))
 	}
 
+	// Check if desired was set to higher than max
+	if len(untaintedNodes) > nodeGroup.Opts.MaxNodes {
+		excessNodes := len(untaintedNodes) - nodeGroup.Opts.MaxNodes
+		log.WithField("nodegroup", nodegroup).
+			Infof("Node count %v exceeds maximum %v. Forcing scale down of %v nodes",
+				len(untaintedNodes), nodeGroup.Opts.MaxNodes, excessNodes)
+		// Scale down at least the excess amount
+		nodesDelta = int(math.Min(float64(nodesDelta), float64(-excessNodes)))
+	}
+
 	log.WithField("nodegroup", nodegroup).Debugf("Delta: %v", nodesDelta)
 
 	scaleOptions := scaleOpts{
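
The key behavioural change above is the clamp applied to nodesDelta after the regular scale-up/scale-down calculation has run: if the untainted node count exceeds MaxNodes (for example because the ASG desired capacity was raised above the maximum out of band), the delta is forced to be at least as negative as the excess, while an already larger scale down (such as the fast node removal path) is left alone. A minimal, self-contained sketch of that clamp, using an illustrative clampDelta helper and example values that are not part of the commit:

package main

import (
	"fmt"
	"math"
)

// clampDelta mirrors the new check in scaleNodeGroup: when untainted nodes
// exceed maxNodes, force the delta to be at least as negative as the excess.
func clampDelta(nodesDelta, untaintedNodes, maxNodes int) int {
	if untaintedNodes > maxNodes {
		excessNodes := untaintedNodes - maxNodes
		nodesDelta = int(math.Min(float64(nodesDelta), float64(-excessNodes)))
	}
	return nodesDelta
}

func main() {
	fmt.Println(clampDelta(0, 12, 10))  // -2: 2 nodes over the max, so force a scale down of 2
	fmt.Println(clampDelta(3, 11, 10))  // -1: a would-be scale up is overridden while over the max
	fmt.Println(clampDelta(-4, 12, 10)) // -4: an existing larger scale down (fast removal) is kept
	fmt.Println(clampDelta(2, 10, 10))  //  2: at or below the max, the delta is untouched
}

This also explains why the old pre-flight check that returned an error when the node count exceeded the maximum could be removed: the situation is now corrected by scaling down rather than by aborting the run.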

pkg/controller/controller_scale_node_group_test.go

Lines changed: 206 additions & 16 deletions
@@ -324,7 +324,7 @@ func TestScaleNodeGroup(t *testing.T) {
 			nil,
 		},
 		{
-			"node count less than the minimum",
+			"node count less than the minimum, should return error",
 			args{
 				nodeArgs{1, 0, 0},
 				buildTestPods(0, 0, 0),
@@ -338,21 +338,6 @@ func TestScaleNodeGroup(t *testing.T) {
 			0,
 			errors.New("node count less than the minimum"),
 		},
-		{
-			"node count larger than the maximum",
-			args{
-				nodeArgs{10, 0, 0},
-				buildTestPods(0, 0, 0),
-				NodeGroupOptions{
-					Name:                   "default",
-					CloudProviderGroupName: "default",
-					MaxNodes:               5,
-				},
-				ListerOptions{},
-			},
-			0,
-			errors.New("node count larger than the maximum"),
-		},
 		{
 			"node and pod usage/requests",
 			args{
@@ -1061,6 +1046,211 @@ func TestScaleNodeGroup_MultipleRuns(t *testing.T) {
 	}
 }
 
+// TestScaleDesiredNodesExceedsMaxNodes tests the scenario where ASG desired is set higher than MaxNodes.
+// The controller should trigger a scale-down to bring nodes back within the MaxNodes limit.
+func TestScaleDesiredNodesExceedsMaxNodes(t *testing.T) {
+	type args struct {
+		nodes            []*v1.Node
+		pods             []*v1.Pod
+		nodeGroupOptions NodeGroupOptions
+		listerOptions    ListerOptions
+	}
+
+	tests := []struct {
+		name              string
+		args              args
+		expectedNodeDelta int
+		err               error
+	}{
+		{
+			"12 nodes exceeds max of 10, should scale down by 2",
+			args{
+				buildTestNodes(12, 2000, 8000),
+				buildTestPods(30, 500, 1000), // 62% CPU utilization, normally no scaling
+				NodeGroupOptions{
+					Name:                               "default",
+					CloudProviderGroupName:             "default",
+					MinNodes:                           5,
+					MaxNodes:                           10,
+					TaintLowerCapacityThresholdPercent: 40,
+					TaintUpperCapacityThresholdPercent: 50,
+					ScaleUpThresholdPercent:            80,
+				},
+				ListerOptions{},
+			},
+			-2,
+			nil,
+		},
+		{
+			"15 nodes exceeds max of 10, should scale down by 5",
+			args{
+				buildTestNodes(15, 2000, 8000),
+				buildTestPods(30, 500, 1000), // 62% CPU utilization, normally no scaling
+				NodeGroupOptions{
+					Name:                               "default",
+					CloudProviderGroupName:             "default",
+					MinNodes:                           5,
+					MaxNodes:                           10,
+					TaintLowerCapacityThresholdPercent: 40,
+					TaintUpperCapacityThresholdPercent: 50,
+					ScaleUpThresholdPercent:            80,
+				},
+				ListerOptions{},
+			},
+			-5,
+			nil,
+		},
+		{
+			"11 nodes exceeds max of 10, should scale down by 1",
+			args{
+				buildTestNodes(11, 2000, 8000),
+				buildTestPods(40, 500, 1000), // 90% CPU utilization, would normally scale up
+				NodeGroupOptions{
+					Name:                               "default",
+					CloudProviderGroupName:             "default",
+					MinNodes:                           5,
+					MaxNodes:                           10,
+					TaintLowerCapacityThresholdPercent: 40,
+					TaintUpperCapacityThresholdPercent: 50,
+					ScaleUpThresholdPercent:            80,
+				},
+				ListerOptions{},
+			},
+			-1, // Even with high utilization, should force scale down 1 node
+			nil,
+		},
+		{
+			"10 nodes equals max, no scale down",
+			args{
+				buildTestNodes(10, 2000, 8000),
+				buildTestPods(26, 500, 1000), // 65% CPU utilization, normally no scaling
+				NodeGroupOptions{
+					Name:                               "default",
+					CloudProviderGroupName:             "default",
+					MinNodes:                           5,
+					MaxNodes:                           10,
+					TaintLowerCapacityThresholdPercent: 40,
+					TaintUpperCapacityThresholdPercent: 50,
+					ScaleUpThresholdPercent:            80,
+				},
+				ListerOptions{},
+			},
+			0,
+			nil,
+		},
+		{
+			"14 nodes, 12 untainted exceed max of 10, scale down by FastNodeRemovalRate below MaxNodes",
+			args{
+				func() []*v1.Node {
+					// 12 untainted + 2 tainted = 14 total nodes
+					untainted := test.BuildTestNodes(12, test.NodeOpts{
+						CPU: 2000,
+						Mem: 8000,
+					})
+					tainted := test.BuildTestNodes(2, test.NodeOpts{
+						CPU:     2000,
+						Mem:     8000,
+						Tainted: true,
+					})
+					return append(untainted, tainted...)
+				}(),
+				buildTestPods(10, 500, 1000), // 20% CPU utilization, should scale down by FastNodeRemovalRate
+				NodeGroupOptions{
+					Name:                               "default",
+					CloudProviderGroupName:             "default",
+					MinNodes:                           5,
+					MaxNodes:                           10,
+					TaintLowerCapacityThresholdPercent: 40,
+					TaintUpperCapacityThresholdPercent: 50,
+					FastNodeRemovalRate:                4,
+				},
+				ListerOptions{},
+			},
+			-4,
+			nil,
+		},
+		{
+			"12 nodes, 8 untainted dont exceed max of 10, no scale down",
+			args{
+				func() []*v1.Node {
+					// 8 untainted + 4 tainted = 12 total nodes
+					untainted := test.BuildTestNodes(8, test.NodeOpts{
+						CPU: 2000,
+						Mem: 8000,
+					})
+					tainted := test.BuildTestNodes(4, test.NodeOpts{
+						CPU:     2000,
+						Mem:     8000,
+						Tainted: true,
+					})
+					return append(untainted, tainted...)
+				}(),
+				buildTestPods(20, 500, 1000), // 62% CPU utilization, normally no scaling
+				NodeGroupOptions{
+					Name:                               "default",
+					CloudProviderGroupName:             "default",
+					MinNodes:                           5,
+					MaxNodes:                           10,
+					ScaleUpThresholdPercent:            70,
+					TaintLowerCapacityThresholdPercent: 40,
+					TaintUpperCapacityThresholdPercent: 50,
+				},
+				ListerOptions{},
+			},
+			0,
+			nil,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			nodeGroups := []NodeGroupOptions{tt.args.nodeGroupOptions}
+			ngName := tt.args.nodeGroupOptions.Name
+			client, opts, err := buildTestClient(tt.args.nodes, tt.args.pods, nodeGroups, tt.args.listerOptions)
+			require.NoError(t, err)
+
+			// For these test cases we only use 1 node group/cloud provider node group
+			nodeGroupSize := 1
+
+			// Create a test (mock) cloud provider
+			testCloudProvider := test.NewCloudProvider(nodeGroupSize)
+			testNodeGroup := test.NewNodeGroup(
+				tt.args.nodeGroupOptions.CloudProviderGroupName,
+				tt.args.nodeGroupOptions.Name,
+				int64(tt.args.nodeGroupOptions.MinNodes),
+				int64(tt.args.nodeGroupOptions.MaxNodes),
+				int64(len(tt.args.nodes)),
+			)
+			testCloudProvider.RegisterNodeGroup(testNodeGroup)
+
+			// Create a node group state with the mapping of node groups to the cloud providers node groups
+			nodeGroupsState := BuildNodeGroupsState(nodeGroupsStateOpts{
+				nodeGroups: nodeGroups,
+				client:     *client,
+			})
+
+			controller := &Controller{
+				Client:        client,
+				Opts:          opts,
+				stopChan:      nil,
+				nodeGroups:    nodeGroupsState,
+				cloudProvider: testCloudProvider,
+			}
+
+			nodesDelta, err := controller.scaleNodeGroup(ngName, nodeGroupsState[ngName])
+
+			// Ensure there were no errors
+			if tt.err == nil {
+				require.NoError(t, err)
+			} else {
+				require.EqualError(t, tt.err, err.Error())
+			}
+
+			assert.Equal(t, tt.expectedNodeDelta, nodesDelta)
+		})
+	}
+}
+
 func TestScaleNodeGroupNodeMaxAge(t *testing.T) {
 	buildNode := func(creation stdtime.Time, tainted bool) *v1.Node {
 		return test.BuildTestNode(test.NodeOpts{
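
Note how the FastNodeRemovalRate case above exercises both paths at once: 12 untainted nodes exceed the maximum of 10 by 2, but the low utilization already produces a scale down of 4 via the fast removal rate, so min(-4, -2) keeps the larger scale down of -4. The new table-driven test can be run on its own with the standard Go tooling from the repository root (package path assumed from the file location above):

go test ./pkg/controller/ -run TestScaleDesiredNodesExceedsMaxNodes -v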
