From a82c5724f58e34904da4b7437aa2baf22380700f Mon Sep 17 00:00:00 2001 From: nathan Date: Tue, 22 Sep 2020 19:42:46 +0200 Subject: [PATCH] Added IPC addPub/Sub retries when aeron is restarted to quickly --- .../network/aeron/MediaDriverConnection.kt | 53 +++++++++++++++++-- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/src/dorkbox/network/aeron/MediaDriverConnection.kt b/src/dorkbox/network/aeron/MediaDriverConnection.kt index 0c7b473c..0ae79aff 100644 --- a/src/dorkbox/network/aeron/MediaDriverConnection.kt +++ b/src/dorkbox/network/aeron/MediaDriverConnection.kt @@ -304,8 +304,29 @@ class IpcMediaDriverConnection(override val streamId: Int, // NOTE: Handlers are called on the client conductor thread. The client conductor thread expects handlers to do safe // publication of any state to other threads and not be long running or re-entrant with the client. - val publication = aeron.addPublication(publicationUri.build(), streamId) - val subscription = aeron.addSubscription(subscriptionUri.build(), streamIdSubscription) + + // If we start/stop too quickly, we might have the aeron connectivity issues! Retry a few times. + var count = 10 + while (count-- > 0) { + try { + publication = aeron.addPublication(publicationUri.build(), streamId) + break + } catch (e: Exception) { + logger.warn(e) { "Unable to add a publication to Aeron. Retrying $count more times..." } + delay(5_000) + } + } + + count = 10 + while (count-- > 0) { + try { + subscription = aeron.addSubscription(subscriptionUri.build(), streamIdSubscription) + break + } catch (e: Exception) { + logger.warn(e) { "Unable to add a publication to Aeron. Retrying $count more times..." } + delay(5000) + } + } var success = false @@ -368,9 +389,31 @@ class IpcMediaDriverConnection(override val streamId: Int, // NOTE: Handlers are called on the client conductor thread. The client conductor thread expects handlers to do safe // publication of any state to other threads and not be long running or re-entrant with the client. - // NOTE: IPC doesn't have address bind issues if we start/stop too quickly - publication = aeron.addPublication(publicationUri.build(), streamId) - subscription = aeron.addSubscription(subscriptionUri.build(), streamIdSubscription) + // on close, the publication CAN linger (in case a client goes away, and then comes back) + // AERON_PUBLICATION_LINGER_TIMEOUT, 5s by default (this can also be set as a URI param) + + // If we start/stop too quickly, we might have the aeron connectivity issues! Retry a few times. + var count = 10 + while (count-- > 0) { + try { + publication = aeron.addPublication(publicationUri.build(), streamId) + break + } catch (e: Exception) { + logger.warn(e) { "Unable to add a publication to Aeron. Retrying $count more times..." } + delay(5_000) + } + } + + count = 10 + while (count-- > 0) { + try { + subscription = aeron.addSubscription(subscriptionUri.build(), streamIdSubscription) + break + } catch (e: Exception) { + logger.warn(e) { "Unable to add a publication to Aeron. Retrying $count more times..." } + delay(5000) + } + } } override fun clientInfo() : String {