|
4 | 4 | * reserved. |
5 | 5 | * Copyright (c) 2018 Triad National Security, LLC. All rights |
6 | 6 | * reserved. |
7 | | - * Copyright (c) 2019 Google, LLC. All rights reserved. |
| 7 | + * Copyright (c) 2019-2025 Google, LLC. All rights reserved. |
8 | 8 | * $COPYRIGHT$ |
9 | 9 | * |
10 | 10 | * Additional copyrights may follow |
|
16 | 16 | #include "btl_uct.h" |
17 | 17 | #include "btl_uct_am.h" |
18 | 18 | #include "btl_uct_device_context.h" |
| 19 | +#include "opal/mca/timer/base/base.h" |
19 | 20 | #include "opal/util/proc.h" |
20 | 21 |
|
21 | 22 | static void mca_btl_uct_endpoint_construct(mca_btl_uct_endpoint_t *endpoint) |
@@ -257,21 +258,17 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl, |
257 | 258 | return OPAL_SUCCESS; |
258 | 259 | } |
259 | 260 |
|
260 | | -static int mca_btl_uct_endpoint_connect_endpoint( |
| 261 | +static int mca_btl_uct_endpoint_send_connection_data( |
261 | 262 | mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, |
262 | 263 | mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, |
263 | | - uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr) |
| 264 | + uint8_t *conn_tl_data, int request_type) |
264 | 265 | { |
265 | | - size_t request_length = sizeof(mca_btl_uct_conn_req_t) |
266 | | - + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; |
267 | | - mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep; |
268 | 266 | mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl; |
269 | 267 | mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; |
270 | | - mca_btl_uct_conn_req_t *request = alloca(request_length); |
| 268 | + mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep; |
271 | 269 | uct_device_addr_t *device_addr = NULL; |
272 | 270 | uct_iface_addr_t *iface_addr; |
273 | 271 | ucs_status_t ucs_status; |
274 | | - int rc; |
275 | 272 |
|
276 | 273 | assert(NULL != conn_tl); |
277 | 274 |
|
@@ -302,15 +299,50 @@ static int mca_btl_uct_endpoint_connect_endpoint( |
302 | 299 | ucs_status)); |
303 | 300 | return OPAL_ERROR; |
304 | 301 | } |
305 | | - } else { |
306 | | - OBJ_RETAIN(conn_ep); |
307 | 302 | } |
308 | 303 |
|
| 304 | + size_t request_length = sizeof(mca_btl_uct_conn_req_t) |
| 305 | + + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; |
| 306 | + mca_btl_uct_conn_req_t *request = alloca(request_length); |
| 307 | + |
309 | 308 | /* fill in common request parameters */ |
310 | 309 | request->proc_name = OPAL_PROC_MY_NAME; |
311 | 310 | request->context_id = tl_context->context_id; |
312 | 311 | request->tl_index = tl->tl_index; |
313 | | - request->type = !!(ep_addr); |
| 312 | + request->type = request_type; |
| 313 | + |
| 314 | + /* fill in connection request */ |
| 315 | + ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr); |
| 316 | + if (UCS_OK != ucs_status) { |
| 317 | + /* this is a fatal error */ |
| 318 | + OBJ_RELEASE(endpoint->conn_ep); |
| 319 | + uct_ep_destroy(tl_endpoint->uct_ep); |
| 320 | + tl_endpoint->uct_ep = NULL; |
| 321 | + return OPAL_ERROR; |
| 322 | + } |
| 323 | + |
| 324 | + /* let the remote side know that the connection has been established and |
| 325 | + * wait for the message to be sent */ |
| 326 | + int rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request, |
| 327 | + request_length); |
| 328 | + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { |
| 329 | + OBJ_RELEASE(endpoint->conn_ep); |
| 330 | + uct_ep_destroy(tl_endpoint->uct_ep); |
| 331 | + tl_endpoint->uct_ep = NULL; |
| 332 | + return OPAL_ERROR; |
| 333 | + } |
| 334 | + |
| 335 | + tl_endpoint->last_connection_req = opal_timer_base_get_usec(); |
| 336 | + |
| 337 | + return OPAL_SUCCESS; |
| 338 | +} |
| 339 | + |
| 340 | +static int mca_btl_uct_endpoint_connect_endpoint( |
| 341 | + mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, |
| 342 | + mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, |
| 343 | + uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr) |
| 344 | +{ |
| 345 | + ucs_status_t ucs_status; |
314 | 346 |
|
315 | 347 | if (NULL == tl_endpoint->uct_ep) { |
316 | 348 | BTL_VERBOSE(("allocating endpoint for peer %s and sending connection data", |
@@ -338,29 +370,15 @@ static int mca_btl_uct_endpoint_connect_endpoint( |
338 | 370 | } |
339 | 371 | } |
340 | 372 |
|
341 | | - /* fill in connection request */ |
342 | | - ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr); |
343 | | - if (UCS_OK != ucs_status) { |
344 | | - /* this is a fatal a fatal error */ |
345 | | - OBJ_RELEASE(endpoint->conn_ep); |
346 | | - uct_ep_destroy(tl_endpoint->uct_ep); |
347 | | - tl_endpoint->uct_ep = NULL; |
348 | | - return OPAL_ERROR; |
349 | | - } |
350 | | - |
351 | | - /* let the remote side know that the connection has been established and |
352 | | - * wait for the message to be sent */ |
353 | | - rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request, |
354 | | - request_length); |
355 | | - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { |
356 | | - OBJ_RELEASE(endpoint->conn_ep); |
357 | | - uct_ep_destroy(tl_endpoint->uct_ep); |
358 | | - tl_endpoint->uct_ep = NULL; |
359 | | - return OPAL_ERROR; |
| 373 | + opal_timer_t now = opal_timer_base_get_usec(); |
| 374 | + if ((now - tl_endpoint->last_connection_req) < mca_btl_uct_component.connection_retry_timeout && !ep_addr) { |
| 375 | + return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS |
| 376 | + : OPAL_ERR_OUT_OF_RESOURCE; |
360 | 377 | } |
361 | 378 |
|
362 | | - return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS |
363 | | - : OPAL_ERR_OUT_OF_RESOURCE; |
| 379 | + int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, endpoint, tl, tl_context, tl_endpoint, |
| 380 | + conn_tl_data, /*request_type=*/!!ep_addr); |
| 381 | + return (OPAL_SUCCESS == rc) ? OPAL_ERR_OUT_OF_RESOURCE : rc; |
364 | 382 | } |
365 | 383 |
|
366 | 384 | int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint, |
|
0 commit comments