PYTHON-1419 Connection failure to SNI endpoint when first host is unavailable #1243

Open · wants to merge 6 commits into master

Changes from 4 commits
23 changes: 17 additions & 6 deletions cassandra/connection.py
@@ -245,9 +245,9 @@ def create(self, row):
 class SniEndPoint(EndPoint):
     """SNI Proxy EndPoint implementation."""
 
-    def __init__(self, proxy_address, server_name, port=9042):
+    def __init__(self, proxy_address, server_name, port=9042, init_index=0):
         self._proxy_address = proxy_address
-        self._index = 0
+        self._index = init_index
         self._resolved_address = None  # resolved address
         self._port = port
         self._server_name = server_name
@@ -267,8 +267,7 @@ def ssl_options(self):
 
     def resolve(self):
         try:
-            resolved_addresses = socket.getaddrinfo(self._proxy_address, self._port,
-                                                    socket.AF_UNSPEC, socket.SOCK_STREAM)
+            resolved_addresses = self._resolve_proxy_addresses()
         except socket.gaierror:
             log.debug('Could not resolve sni proxy hostname "%s" '
                       'with port %d' % (self._proxy_address, self._port))
@@ -280,6 +279,10 @@
 
         return self._resolved_address, self._port
 
+    def _resolve_proxy_addresses(self):
+        return socket.getaddrinfo(self._proxy_address, self._port,
+                                  socket.AF_UNSPEC, socket.SOCK_STREAM)
+
     def __eq__(self, other):
         return (isinstance(other, SniEndPoint) and
                 self.address == other.address and self.port == other.port and
@@ -305,16 +308,24 @@ class SniEndPointFactory(EndPointFactory):
     def __init__(self, proxy_address, port):
         self._proxy_address = proxy_address
         self._port = port
+        # Initial lookup index, to prevent all SNI endpoints from resolving
+        # to the same starting IP address (which might not be available at
+        # the moment). If the SNI proxy resolves to 3 IPs, the first endpoint
+        # will connect to the first IP address, and subsequent endpoints to
+        # the next IPs in round-robin fashion.
+        self._init_index = -1
 
     def create(self, row):
         host_id = row.get("host_id")
         if host_id is None:
             raise ValueError("No host_id to create the SniEndPoint")
 
-        return SniEndPoint(self._proxy_address, str(host_id), self._port)
+        self._init_index += 1
+        return SniEndPoint(self._proxy_address, str(host_id), self._port, self._init_index)
 
     def create_from_sni(self, sni):
-        return SniEndPoint(self._proxy_address, sni, self._port)
+        self._init_index += 1
+        return SniEndPoint(self._proxy_address, sni, self._port, self._init_index)
Collaborator

It's worthwhile being clear on the consequence of this change. If our proxy hostname resolves to N IP addresses, we're basically exchanging a 1-in-N chance of complete failure when the first host is unavailable (because all our endpoints would return a common IP address) for an essentially guaranteed failure of the connection to one of our nodes (since, with the code in this PR, at least one of our nodes will return the failing IP address from resolve()).

I'm not saying it's a problem that we're making this exchange; it probably is better to have a failing connection to one of our nodes than to fail completely at connect time. But it is worth pointing out that this isn't a zero-cost change.



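To make the trade-off described above concrete, here is a minimal sketch (plain Python with hypothetical IPs, not driver code) of how the round-robin starting indices spread endpoints across the resolved addresses:

ips = ['10.0.0.1', '10.0.0.2', '10.0.0.3']  # hypothetical; '10.0.0.1' is down

# Before this PR: every endpoint starts at index 0, so all of them try the
# dead IP first and the initial connect fails completely.
before = [ips[0] for _ in range(3)]            # ['10.0.0.1', '10.0.0.1', '10.0.0.1']

# With this PR: endpoint i starts at index i, so exactly one endpoint
# resolves to the dead IP first while the others connect fine.
after = [ips[i % len(ips)] for i in range(3)]  # ['10.0.0.1', '10.0.0.2', '10.0.0.3']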
94 changes: 94 additions & 0 deletions tests/integration/long/test_sni_connection.py
@@ -0,0 +1,94 @@
# Copyright DataStax, Inc.
Collaborator

This test is actually non-deterministic; it will fail or succeed based on the order in which discovered nodes are added to the LBP. LBPs in the Python driver listen for node-up and node-down events; that's how they're populated with host information. Newly discovered hosts are added after any existing hosts, and since the delivery of these node up/down events is non-deterministic, the ordering of nodes in an LBP is also non-deterministic.

There's an additional complication: the control connection. It also needs to call resolve() at creation time, so that in turn throws the count off somewhat. If the control connection happens to get the endpoint that is configured to resolve to 127.0.0.1 first, then the control connection will be established, but subsequent connections to both nodes will try to use the mock IP address, and you'll get a test failure like the following:

DEBUG    cassandra.pool:pool.py:206 Host proxy.datastax.com:9042:8c4b6ed7-f505-4226-b7a4-41f322520c1f:0 is now marked up
DEBUG    cassandra.pool:pool.py:206 Host proxy.datastax.com:9042:2e25021d-8d72-41a7-a247-3da85c5d92d2:1 is now marked up
DEBUG    cassandra.cluster:cluster.py:3596 [control connection] Opening new connection to proxy.datastax.com:9042:2e25021d-8d72-41a7-a247-3da85c5d92d2:1
INFO     cassandra.connection:connection.py:278 Resolved address: 127.0.0.1
…
DEBUG    cassandra.cluster:cluster.py:1749 Control connection created
DEBUG    cassandra.pool:pool.py:405 Initializing connection for host proxy.datastax.com:9042:8c4b6ed7-f505-4226-b7a4-41f322520c1f:0
DEBUG    cassandra.pool:pool.py:405 Initializing connection for host proxy.datastax.com:9042:2e25021d-8d72-41a7-a247-3da85c5d92d2:2
INFO     cassandra.connection:connection.py:278 Resolved address: 100.101.102.103
INFO     cassandra.connection:connection.py:278 Resolved address: 100.101.102.103
WARNING  cassandra.cluster:cluster.py:3244 Failed to create connection pool for new host proxy.datastax.com:9042:2e25021d-8d72-41a7-a247-3da85c5d92d2:3:
…
WARNING  cassandra.cluster:cluster.py:3244 Failed to create connection pool for new host proxy.datastax.com:9042:8c4b6ed7-f505-4226-b7a4-41f322520c1f:1:
…
WARNING  cassandra.cluster:cluster.py:2001 Host proxy.datastax.com:9042:2e25021d-8d72-41a7-a247-3da85c5d92d2:3 has been marked down
2025-05-23 21:48:59,794 WARNING [cluster:2001]: Host proxy.datastax.com:9042:8c4b6ed7-f505-4226-b7a4-41f322520c1f:1 has been marked down

If the order is reversed and the endpoint that's configured to use the mock IP address comes first, the test passes. I don't clearly understand the order of events in that case, but I think the control connection calls resolve(), gets the mock address, tries again (which calls resolve() again) and is able to connect... but I still can't explain how that enables connections to both nodes. Regardless, it clearly does pass in that case.

I saw this when running the test locally; about half the time it would pass (mock-address endpoint first in the LBP), while the other half it failed with the stack trace above (localhost endpoint first in the LBP). I'm not sure this can be easily fixed.

Collaborator

Oh, I missed that nuance before. If localhost is used for the control connection, it does connect and then tries to open connection pools for both nodes (as referenced above). Both of those connection pools call resolve(), which will return the invalid IP address for both endpoints. That means both connection pools fail, which means both nodes are marked down. But the control connection is connected to one of those nodes, so it is also torn down... and a new control connection has to be made.

I'll readily admit I don't fully understand the sequence that causes this to lead to a NoHostAvailable... but it's doing so reliably in my testing. I think there are a couple of iterations of failing to establish connection pools, leading to down nodes, leading to new control connections, until we get to a point where both endpoints return the phantom IP address and therefore we can't connect to anybody in the query plan. But that's still just speculation on my part.

Contributor Author

If the order is reversed and the end point that's configured to use the mock IP address comes first the test passes.

Correct; that is why I chose the mock IP '100.101.102.103', which sorts before '127.0.0.1'.

I wanted to have as many integration tests as possible, but true, maybe a unit test would be better in this case.
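A minimal sketch of what such a unit test could look like, reusing this PR's _resolve_proxy_addresses hook and assuming resolve() sorts the resolved IPs and steps through them starting at the endpoint's init_index (the test name and mock data are illustrative, not part of this PR's diff):

import socket
from unittest.mock import patch

from cassandra.connection import SniEndPoint, SniEndPointFactory

# Fake getaddrinfo() result; '100.101.102.103' sorts before '127.0.0.1'.
MOCK_ADDRINFO = [
    (socket.AF_INET, socket.SOCK_STREAM, 0, None, ('100.101.102.103', 9042)),
    (socket.AF_INET, socket.SOCK_STREAM, 0, None, ('127.0.0.1', 9042)),
]


def test_endpoints_do_not_share_a_starting_ip():
    factory = SniEndPointFactory("proxy.datastax.com", 9042)
    with patch.object(SniEndPoint, '_resolve_proxy_addresses',
                      lambda self: MOCK_ADDRINFO):
        first, _ = factory.create_from_sni('sni1').resolve()
        second, _ = factory.create_from_sni('sni2').resolve()
    # With the round-robin init_index, the two endpoints start at different
    # IPs instead of both trying the unavailable one first.
    assert {first, second} == {'100.101.102.103', '127.0.0.1'}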

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import socket
import ssl
import unittest
from unittest.mock import patch

from cassandra import ProtocolVersion
from cassandra.cluster import Cluster, ControlConnection, ProfileManager
from cassandra.connection import SniEndPoint
from cassandra.datastax.cloud import CloudConfig
from cassandra.policies import ConstantReconnectionPolicy
from tests.integration import (
    get_cluster, remove_cluster, CASSANDRA_IP
)
from tests.integration.long.test_ssl import setup_cluster_ssl, CLIENT_CA_CERTS, DRIVER_CERTFILE, DRIVER_KEYFILE


class SniConnectionTests(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        setup_cluster_ssl(client_auth=True)

    @classmethod
    def tearDownClass(cls):
        ccm_cluster = get_cluster()
        ccm_cluster.stop()
        remove_cluster()

    def _mocked_cloud_config(self, cloud_config, create_pyopenssl_context):
        config = CloudConfig.from_dict({})
        config.sni_host = 'proxy.datastax.com'
        config.sni_port = 9042
        config.host_ids = ['8c4b6ed7-f505-4226-b7a4-41f322520c1f', '2e25021d-8d72-41a7-a247-3da85c5d92d2']

        ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS)
        ssl_context.load_verify_locations(CLIENT_CA_CERTS)
        ssl_context.verify_mode = ssl.CERT_REQUIRED
        ssl_context.load_cert_chain(certfile=DRIVER_CERTFILE, keyfile=DRIVER_KEYFILE)
        config.ssl_context = ssl_context
Collaborator

This test brings along a lot of machinery that it doesn't really need, because it's implemented as an integration test. Because we're using the Astra SNI integration via the config, we have to worry about establishing SSL connections, which means (a) we have to configure the C* cluster to support SSL and (b) we have to set up the SSL context here. This goes some way to obscuring the actual intent of the test. I kinda feel like a simpler test, maybe even a unit test, might be preferred... that way we can avoid all the SSL machinery.

Collaborator

There's also the problem that, with all of this SSL configuration, the test won't pass unless the SSL update process being discussed in PYTHON-1372 (the ticket to fix test_ssl.py) is brought in. Without that fix (or, more accurately, the update to the SSL material it entails) the test fails due to expired certificates.


        return config

    def _mocked_proxy_dns_resolution(self):
        # Mimics socket.getaddrinfo() output: (family, type, proto, canonname, sockaddr)
        # tuples. Return an unreachable IP in the first position, so that we make sure
        # the SNI endpoints do not all start with the first resolved IP.
        return [
            (socket.AF_INET, socket.SOCK_STREAM, 0, None, ('100.101.102.103', 9042)),
            (socket.AF_INET, socket.SOCK_STREAM, 0, None, (CASSANDRA_IP, 9042))
        ]

    def _mocked_refresh_node_list_and_token_map(self, connection, preloaded_results=None,
                                                force_token_rebuild=False):
        return

    def _mocked_refresh_schema(self, connection, preloaded_results=None, schema_agreement_wait=None,
                               force=False, **kwargs):
        return

    def _mocked_check_supported(self):
        return

    # Test verifies that the driver can connect to an SNI endpoint even when one IP
    # returned by the DNS resolution of the SNI proxy does not respond. The mocked
    # resolution method returns two IPs, where only one corresponds to the online
    # C* cluster started with CCM.
    def test_round_robin_dns_resolution(self):
        with patch('cassandra.datastax.cloud.get_cloud_config', self._mocked_cloud_config):
            with patch.object(SniEndPoint, '_resolve_proxy_addresses', self._mocked_proxy_dns_resolution):
                # Mock the three methods below, because the host IDs returned by the proxy
                # will not match the IDs present in C*. The network connection has already
                # been made by that point, so we can consider the test successful.
                with patch.object(ControlConnection, '_refresh_node_list_and_token_map',
                                  self._mocked_refresh_node_list_and_token_map):
                    with patch.object(ControlConnection, '_refresh_schema',
                                      self._mocked_refresh_schema):
                        with patch.object(ProfileManager, 'check_supported', self._mocked_check_supported):
                            cloud_config = {
                                'secure_connect_bundle': '/path/to/secure-connect-dbname.zip'
                            }
                            cluster = Cluster(cloud=cloud_config, protocol_version=ProtocolVersion.V4,
                                              reconnection_policy=ConstantReconnectionPolicy(10))
                            session = cluster.connect()
                            session.shutdown()
                            cluster.shutdown()
12 changes: 12 additions & 0 deletions tests/unit/test_endpoints.py
@@ -65,3 +65,15 @@ def test_endpoint_resolve(self):
         for i in range(10):
             (address, _) = endpoint.resolve()
             self.assertEqual(address, next(it))
+
+    def test_sni_resolution_start_index(self):
+        factory = SniEndPointFactory("proxy.datastax.com", 9999)
+        initial_index = factory._init_index
+
+        endpoint1 = factory.create_from_sni('sni1')
+        self.assertEqual(factory._init_index, initial_index + 1)
+        self.assertEqual(endpoint1._index, factory._init_index)
+
+        endpoint2 = factory.create_from_sni('sni2')
+        self.assertEqual(factory._init_index, initial_index + 2)
+        self.assertEqual(endpoint2._index, factory._init_index)