xds: Align PriorityLB child selection with A56 The PriorityLB predates A56. tryNextPriority() now matches ChoosePriority() from the gRFC. The biggest change is waiting on CONNECTING children instead of failing after the failOverTimer fires. The failOverTimer should be used to start lower priorities more eagerly, but shouldn't cause the overall connectivity state to become TRANSIENT_FAILURE on its own. The prior behavior of creating the "Connection timeout for priority" failing picker was particularly strange, because it didn't update child's connectivity state. This previous behavior was creating errors because of the failOverTimer with no way to diagnose what was going wrong. b/428517222
diff --git a/xds/src/main/java/io/grpc/xds/PriorityLoadBalancer.java b/xds/src/main/java/io/grpc/xds/PriorityLoadBalancer.java index 845c167..c72cef9 100644 --- a/xds/src/main/java/io/grpc/xds/PriorityLoadBalancer.java +++ b/xds/src/main/java/io/grpc/xds/PriorityLoadBalancer.java
@@ -154,7 +154,8 @@ ChildLbState child = new ChildLbState(priority, priorityConfigs.get(priority).ignoreReresolution); children.put(priority, child); - updateOverallState(priority, CONNECTING, new FixedResultPicker(PickResult.withNoResult())); + // Child is created in CONNECTING with pending failOverTimer + updateOverallState(priority, child.connectivityState, child.picker); // Calling the child's updateResolvedAddresses() can result in tryNextPriority() being // called recursively. We need to be sure to be done with processing here before it is // called. @@ -173,21 +174,23 @@ } return Status.OK; } - if (child.failOverTimer != null && child.failOverTimer.isPending()) { + if (child.failOverTimer.isPending()) { updateOverallState(priority, child.connectivityState, child.picker); return Status.OK; // Give priority i time to connect. } - if (priority.equals(currentPriority) && child.connectivityState != TRANSIENT_FAILURE) { - // If the current priority is not changed into TRANSIENT_FAILURE, keep using it. + } + for (int i = 0; i < priorityNames.size(); i++) { + String priority = priorityNames.get(i); + ChildLbState child = children.get(priority); + if (child.connectivityState.equals(CONNECTING)) { updateOverallState(priority, child.connectivityState, child.picker); return Status.OK; } } - // TODO(zdapeng): Include error details of each priority. logger.log(XdsLogLevel.DEBUG, "All priority failed"); String lastPriority = priorityNames.get(priorityNames.size() - 1); - SubchannelPicker errorPicker = children.get(lastPriority).picker; - updateOverallState(lastPriority, TRANSIENT_FAILURE, errorPicker); + ChildLbState child = children.get(lastPriority); + updateOverallState(lastPriority, child.connectivityState, child.picker); return Status.OK; } @@ -231,10 +234,7 @@ // The child is deactivated. return; } - picker = new FixedResultPicker(PickResult.withError( - Status.UNAVAILABLE.withDescription("Connection timeout for priority " + priority))); logger.log(XdsLogLevel.DEBUG, "Priority {0} failed over to next", priority); - currentPriority = null; // reset currentPriority to guarantee failover happen Status status = tryNextPriority(); if (!status.isOk()) { // A child had a problem with the addresses/config. Request it to be refreshed
diff --git a/xds/src/test/java/io/grpc/xds/PriorityLoadBalancerTest.java b/xds/src/test/java/io/grpc/xds/PriorityLoadBalancerTest.java index 08d4863..ab4c003 100644 --- a/xds/src/test/java/io/grpc/xds/PriorityLoadBalancerTest.java +++ b/xds/src/test/java/io/grpc/xds/PriorityLoadBalancerTest.java
@@ -376,6 +376,7 @@ assertThat(fooBalancers).hasSize(2); assertThat(fooHelpers).hasSize(2); LoadBalancer balancer1 = Iterables.getLast(fooBalancers); + Helper helper1 = Iterables.getLast(fooHelpers); // p1 timeout, and fails over to p2 fakeClock.forwardTime(10, TimeUnit.SECONDS); @@ -423,14 +424,20 @@ LoadBalancer balancer3 = Iterables.getLast(fooBalancers); Helper helper3 = Iterables.getLast(fooHelpers); - // p3 timeout then the channel should go to TRANSIENT_FAILURE + // p3 timeout then the channel should stay in CONNECTING fakeClock.forwardTime(10, TimeUnit.SECONDS); - assertCurrentPickerReturnsError(Status.Code.UNAVAILABLE, "timeout"); + assertCurrentPicker(CONNECTING, PickResult.withNoResult()); - // p3 fails then the picker should have error status updated + // p3 fails then the picker should still be waiting on p1 helper3.updateBalancingState( TRANSIENT_FAILURE, new FixedResultPicker(PickResult.withError(Status.DATA_LOSS.withDescription("foo")))); + assertCurrentPicker(CONNECTING, PickResult.withNoResult()); + + // p1 fails then the picker should have error status updated to p3 + helper1.updateBalancingState( + TRANSIENT_FAILURE, + new FixedResultPicker(PickResult.withError(Status.DATA_LOSS.withDescription("bar")))); assertCurrentPickerReturnsError(Status.Code.DATA_LOSS, "foo"); // p2 gets back to READY @@ -642,6 +649,7 @@ assertThat(fooBalancers).hasSize(2); assertThat(fooHelpers).hasSize(2); LoadBalancer balancer1 = Iterables.getLast(fooBalancers); + Helper helper1 = Iterables.getLast(fooHelpers); // p1 timeout, and fails over to p2 fakeClock.forwardTime(10, TimeUnit.SECONDS); @@ -677,14 +685,20 @@ LoadBalancer balancer3 = Iterables.getLast(fooBalancers); Helper helper3 = Iterables.getLast(fooHelpers); - // p3 timeout then the channel should go to TRANSIENT_FAILURE + // p3 timeout then the channel should stay in CONNECTING fakeClock.forwardTime(10, TimeUnit.SECONDS); - assertCurrentPickerReturnsError(Status.Code.UNAVAILABLE, "timeout"); + assertCurrentPicker(CONNECTING, PickResult.withNoResult()); - // p3 fails then the picker should have error status updated + // p3 fails then the picker should still be waiting on p1 helper3.updateBalancingState( TRANSIENT_FAILURE, new FixedResultPicker(PickResult.withError(Status.DATA_LOSS.withDescription("foo")))); + assertCurrentPicker(CONNECTING, PickResult.withNoResult()); + + // p1 fails then the picker should have error status updated to p3 + helper1.updateBalancingState( + TRANSIENT_FAILURE, + new FixedResultPicker(PickResult.withError(Status.DATA_LOSS.withDescription("bar")))); assertCurrentPickerReturnsError(Status.Code.DATA_LOSS, "foo"); // p2 gets back to IDLE @@ -863,15 +877,17 @@ } private void assertCurrentPickerPicksSubchannel(Subchannel expectedSubchannelToPick) { - assertLatestConnectivityState(READY); - PickResult pickResult = pickerCaptor.getValue().pickSubchannel(mock(PickSubchannelArgs.class)); - assertThat(pickResult.getSubchannel()).isEqualTo(expectedSubchannelToPick); + assertCurrentPicker(READY, PickResult.withSubchannel(expectedSubchannelToPick)); } private void assertCurrentPickerIsBufferPicker() { - assertLatestConnectivityState(IDLE); + assertCurrentPicker(IDLE, PickResult.withNoResult()); + } + + private void assertCurrentPicker(ConnectivityState state, PickResult result) { + assertLatestConnectivityState(state); PickResult pickResult = pickerCaptor.getValue().pickSubchannel(mock(PickSubchannelArgs.class)); - assertThat(pickResult).isEqualTo(PickResult.withNoResult()); + assertThat(pickResult).isEqualTo(result); } private Object newChildConfig(LoadBalancerProvider provider, Object config) {