osu_bibw.c

#define BENCHMARK "OSU MPI%s Bi-Directional Bandwidth Test"
/*
 * Copyright (C) 2002-2021 the Network-Based Computing Laboratory
 * (NBCL), The Ohio State University.
 *
 * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu)
 *
 * For detailed copyright and licensing information, please refer to the
 * copyright file COPYRIGHT in the top level OMB directory.
 */
#include <osu_util_mpi.h>

#ifdef _ENABLE_CUDA_KERNEL_
double measure_kernel_lo(char **, int, int);
void touch_managed_src(char **, int, int);
void touch_managed_dst(char **, int, int);
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */

double calculate_total(double, double, double, int);
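
/*
 * Each of the two ranks posts a window of non-blocking receives followed by
 * a window of non-blocking sends per iteration, then waits on both, so data
 * flows in both directions simultaneously; rank 0 times the loop and
 * reports the aggregate bandwidth.
 */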
int main(int argc, char *argv[])
{
    int myid, numprocs, i, j;
    int size;
    char **s_buf, **r_buf;
    double t_start = 0.0, t_end = 0.0, t_lo = 0.0, t_total = 0.0;
    int window_size = 64;
    int po_ret = 0;

    options.bench = PT2PT;
    options.subtype = BW;

    set_header(HEADER);
    set_benchmark_name("osu_bibw");

    po_ret = process_options(argc, argv);

    if (PO_OKAY == po_ret && NONE != options.accel) {
        if (init_accel()) {
            fprintf(stderr, "Error initializing device\n");
            exit(EXIT_FAILURE);
        }
    }

    window_size = options.window_size;
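
    /*
     * MULTIPLE mode allocates a distinct send/receive buffer pair for each
     * window slot (required for the managed-memory paths below); SINGLE
     * mode reuses one pair for every message in the window.
     */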
    if (options.buf_num == MULTIPLE) {
        s_buf = malloc(sizeof(char *) * window_size);
        r_buf = malloc(sizeof(char *) * window_size);
    } else {
        s_buf = malloc(sizeof(char *) * 1);
        r_buf = malloc(sizeof(char *) * 1);
    }

    MPI_CHECK(MPI_Init(&argc, &argv));
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &numprocs));
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myid));

    if (0 == myid) {
        switch (po_ret) {
            case PO_CUDA_NOT_AVAIL:
                fprintf(stderr, "CUDA support not enabled. Please recompile "
                                "benchmark with CUDA support.\n");
                break;
            case PO_OPENACC_NOT_AVAIL:
                fprintf(stderr, "OPENACC support not enabled. Please "
                                "recompile benchmark with OPENACC support.\n");
                break;
            case PO_BAD_USAGE:
                print_bad_usage_message(myid);
                break;
            case PO_HELP_MESSAGE:
                print_help_message(myid);
                break;
            case PO_VERSION_MESSAGE:
                print_version_message(myid);
                MPI_CHECK(MPI_Finalize());
                exit(EXIT_SUCCESS);
            case PO_OKAY:
                break;
        }
    }

    switch (po_ret) {
        case PO_CUDA_NOT_AVAIL:
        case PO_OPENACC_NOT_AVAIL:
        case PO_BAD_USAGE:
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_FAILURE);
        case PO_HELP_MESSAGE:
        case PO_VERSION_MESSAGE:
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_SUCCESS);
        case PO_OKAY:
            break;
    }

    if (numprocs != 2) {
        if (myid == 0) {
            fprintf(stderr, "This test requires exactly two processes\n");
        }

        MPI_CHECK(MPI_Finalize());
        exit(EXIT_FAILURE);
    }

#ifdef _ENABLE_CUDA_
    if (options.src == 'M' || options.dst == 'M') {
        if (options.buf_num == SINGLE) {
            fprintf(stderr, "Warning: Tests involving managed buffers will "
                            "use multiple buffers by default\n");
        }
        options.buf_num = MULTIPLE;
    }
#endif

    if (options.buf_num == SINGLE) {
        if (allocate_memory_pt2pt(&s_buf[0], &r_buf[0], myid)) {
            /* Error allocating memory */
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_FAILURE);
        }
    }

    print_header(myid, BW);

    /* Bi-Directional Bandwidth test */
    for (size = options.min_message_size; size <= options.max_message_size;
         size *= 2) {
        if (options.buf_num == MULTIPLE) {
            for (i = 0; i < window_size; i++) {
                if (allocate_memory_pt2pt_size(&s_buf[i], &r_buf[i], myid,
                                               size)) {
                    /* Error allocating memory */
                    MPI_CHECK(MPI_Finalize());
                    exit(EXIT_FAILURE);
                }
            }

            /* touch the data */
            for (i = 0; i < window_size; i++) {
                set_buffer_pt2pt(s_buf[i], myid, options.accel, 'a', size);
                set_buffer_pt2pt(r_buf[i], myid, options.accel, 'b', size);
            }
        } else {
            /* touch the data */
            set_buffer_pt2pt(s_buf[0], myid, options.accel, 'a', size);
            set_buffer_pt2pt(r_buf[0], myid, options.accel, 'b', size);
        }

        if (size > LARGE_MESSAGE_SIZE) {
            options.iterations = options.iterations_large;
            options.skip = options.skip_large;
        }
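
        /*
         * When a managed buffer is touched from the device, every window
         * includes kernel launches; measure that launch overhead up front
         * so calculate_total() can subtract it from the timed loop.
         */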
#ifdef _ENABLE_CUDA_KERNEL_
        if ((options.src == 'M' && options.MMsrc == 'D') ||
            (options.dst == 'M' && options.MMdst == 'D')) {
            t_lo = measure_kernel_lo(s_buf, size, window_size);
        }
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */

        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
        t_total = 0.0;

        for (i = 0; i < options.iterations + options.skip; i++) {
            if (myid == 0) {
                if (i >= options.skip) {
                    t_start = MPI_Wtime();
                }

#ifdef _ENABLE_CUDA_KERNEL_
                if (options.src == 'M') {
                    touch_managed_src(s_buf, size, window_size);
                }
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */

                for (j = 0; j < window_size; j++) {
                    if (options.buf_num == SINGLE) {
                        MPI_CHECK(MPI_Irecv(r_buf[0], size, MPI_CHAR, 1, 10,
                                            MPI_COMM_WORLD, recv_request + j));
                    } else {
                        MPI_CHECK(MPI_Irecv(r_buf[j], size, MPI_CHAR, 1, 10,
                                            MPI_COMM_WORLD, recv_request + j));
                    }
                }

                for (j = 0; j < window_size; j++) {
                    if (options.buf_num == SINGLE) {
                        MPI_CHECK(MPI_Isend(s_buf[0], size, MPI_CHAR, 1, 100,
                                            MPI_COMM_WORLD, send_request + j));
                    } else {
                        MPI_CHECK(MPI_Isend(s_buf[j], size, MPI_CHAR, 1, 100,
                                            MPI_COMM_WORLD, send_request + j));
                    }
                }

                MPI_CHECK(MPI_Waitall(window_size, send_request, reqstat));
                MPI_CHECK(MPI_Waitall(window_size, recv_request, reqstat));

#ifdef _ENABLE_CUDA_KERNEL_
                if (options.src == 'M') {
                    touch_managed_src(r_buf, size, window_size);
                }
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */

                if (i >= options.skip) {
                    t_end = MPI_Wtime();
                    t_total += calculate_total(t_start, t_end, t_lo,
                                               window_size);
                }
            } else {
#ifdef _ENABLE_CUDA_KERNEL_
                if (options.dst == 'M') {
                    touch_managed_dst(s_buf, size, window_size);
                }
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */

                for (j = 0; j < window_size; j++) {
                    if (options.buf_num == SINGLE) {
                        MPI_CHECK(MPI_Irecv(r_buf[0], size, MPI_CHAR, 0, 100,
                                            MPI_COMM_WORLD, recv_request + j));
                    } else {
                        MPI_CHECK(MPI_Irecv(r_buf[j], size, MPI_CHAR, 0, 100,
                                            MPI_COMM_WORLD, recv_request + j));
                    }
                }

                for (j = 0; j < window_size; j++) {
                    if (options.buf_num == SINGLE) {
                        MPI_CHECK(MPI_Isend(s_buf[0], size, MPI_CHAR, 0, 10,
                                            MPI_COMM_WORLD, send_request + j));
                    } else {
                        MPI_CHECK(MPI_Isend(s_buf[j], size, MPI_CHAR, 0, 10,
                                            MPI_COMM_WORLD, send_request + j));
                    }
                }

                MPI_CHECK(MPI_Waitall(window_size, recv_request, reqstat));

#ifdef _ENABLE_CUDA_KERNEL_
                if (options.dst == 'M') {
                    touch_managed_dst(r_buf, size, window_size);
                }
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */

                MPI_CHECK(MPI_Waitall(window_size, send_request, reqstat));
            }
        }
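
        /*
         * Bytes moved per measured iteration: size * window_size in each
         * direction, hence the factor of 2; dividing the MB total by the
         * accumulated time yields MB/s.
         */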
        if (myid == 0) {
            double tmp = size / 1e6 * options.iterations * window_size * 2;

            fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH,
                    FLOAT_PRECISION, tmp / t_total);
            fflush(stdout);
        }

        if (options.buf_num == MULTIPLE) {
            for (i = 0; i < window_size; i++) {
                free_memory(s_buf[i], r_buf[i], myid);
            }
        }
    }

    if (options.buf_num == SINGLE) {
        free_memory(s_buf[0], r_buf[0], myid);
    }
    free(s_buf);
    free(r_buf);

    MPI_CHECK(MPI_Finalize());

    if (NONE != options.accel) {
        if (cleanup_accel()) {
            fprintf(stderr, "Error cleaning up device\n");
            exit(EXIT_FAILURE);
        }
    }

    return EXIT_SUCCESS;
}

#ifdef _ENABLE_CUDA_KERNEL_
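/*
 * Estimate the cost of one empty kernel launch plus stream synchronization
 * by averaging over 1000 timed launches (after 10 warmup launches).
 */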
double
measure_kernel_lo(char **buf, int size, int window_size)
{
    int i;
    double t_lo = 0.0, t_start, t_end;

    for (i = 0; i < 10; i++) {
        launch_empty_kernel(buf[i % window_size], size); /* warmup */
    }

    for (i = 0; i < 1000; i++) {
        t_start = MPI_Wtime();
        launch_empty_kernel(buf[i % window_size], size);
        synchronize_stream();
        t_end = MPI_Wtime();
        t_lo = t_lo + (t_end - t_start);
    }

    t_lo = t_lo / 1000; /* average the kernel launch overhead */
    return t_lo;
}
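
/*
 * touch_managed_src()/touch_managed_dst() migrate managed buffers to where
 * the next access expects them, keyed on the source/destination placement:
 * 'D' touches the buffer from the device, 'H' prefetches large buffers back
 * to the host (device id -1), and otherwise a host-side memset suffices.
 */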
void
touch_managed_src(char **buf, int size, int window_size)
{
    int j;

    if (options.src == 'M') {
        if (options.MMsrc == 'D') {
            for (j = 0; j < window_size; j++) {
                touch_managed(buf[j], size);
                synchronize_stream();
            }
        } else if ((options.MMsrc == 'H') && size > PREFETCH_THRESHOLD) {
            for (j = 0; j < window_size; j++) {
                prefetch_data(buf[j], size, -1);
                synchronize_stream();
            }
        } else {
            for (j = 0; j < window_size; j++) {
                memset(buf[j], 'c', size);
            }
        }
    }
}

void
touch_managed_dst(char **buf, int size, int window_size)
{
    int j;

    if (options.dst == 'M') {
        if (options.MMdst == 'D') {
            for (j = 0; j < window_size; j++) {
                touch_managed(buf[j], size);
                synchronize_stream();
            }
        } else if ((options.MMdst == 'H') && size > PREFETCH_THRESHOLD) {
            for (j = 0; j < window_size; j++) {
                prefetch_data(buf[j], size, -1);
                synchronize_stream();
            }
        } else {
            for (j = 0; j < window_size; j++) {
                memset(buf[j], 'c', size);
            }
        }
    }
}
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */
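
/*
 * The timed window includes any managed-memory touch kernels, so subtract
 * the measured launch overhead once per window for each side that uses a
 * managed buffer touched from the device (twice when both sides do).
 */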
double calculate_total(double t_start, double t_end, double t_lo,
                       int window_size)
{
    double t_total;

    if ((options.src == 'M' && options.MMsrc == 'D') &&
        (options.dst == 'M' && options.MMdst == 'D')) {
        t_total = ((t_end - t_start) - (2 * t_lo * window_size));
    } else if ((options.src == 'M' && options.MMsrc == 'D') ||
               (options.dst == 'M' && options.MMdst == 'D')) {
        t_total = ((t_end - t_start) - (t_lo * window_size));
    } else {
        t_total = (t_end - t_start);
    }

    return t_total;
}
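
/*
 * Typical invocation (a sketch; the exact launcher and options depend on
 * the build and MPI installation):
 *
 *   mpirun -np 2 ./osu_bibw        # host-to-host
 *   mpirun -np 2 ./osu_bibw D D    # device buffers, accelerator-enabled build
 *
 * Message sizes are swept from options.min_message_size to
 * options.max_message_size, doubling each step.
 */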