123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363 |
- #define BENCHMARK "OSU MPI%s Bi-Directional Bandwidth Test"
- /*
- * Copyright (C) 2002-2021 the Network-Based Computing Laboratory
- * (NBCL), The Ohio State University.
- *
- * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu)
- *
- * For detailed copyright and licensing information, please refer to the
- * copyright file COPYRIGHT in the top level OMB directory.
- */
- #include <osu_util_mpi.h>
#ifdef _ENABLE_CUDA_KERNEL_
/* Helpers that are only compiled when _ENABLE_CUDA_KERNEL_ is defined. */

/* Returns the averaged per-launch time of an empty kernel on the buffers. */
double measure_kernel_lo(char **buf, int size, int window_size);

/* Touches/prefetches/fills a window of buffers according to options.src. */
void touch_managed_src(char **buf, int size, int window_size);

/* Touches/prefetches/fills a window of buffers according to options.dst. */
void touch_managed_dst(char **buf, int size, int window_size);
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */

/* Elapsed time of one window, corrected for kernel launch overhead. */
double calculate_total(double t_start, double t_end, double t_lo, int window_size);
- int main(int argc, char *argv[])
- {
- int myid, numprocs, i, j;
- int size;
- char **s_buf, **r_buf;
- double t_start = 0.0, t_end = 0.0, t_lo = 0.0, t_total = 0.0;
- int window_size = 64;
- int po_ret = 0;
- options.bench = PT2PT;
- options.subtype = BW;
- set_header(HEADER);
- set_benchmark_name("osu_bibw");
-
- po_ret = process_options(argc, argv);
- if (PO_OKAY == po_ret && NONE != options.accel) {
- if (init_accel()) {
- fprintf(stderr, "Error initializing device\n");
- exit(EXIT_FAILURE);
- }
- }
- window_size = options.window_size;
- if (options.buf_num == MULTIPLE) {
- s_buf = malloc(sizeof(char *) * window_size);
- r_buf = malloc(sizeof(char *) * window_size);
- } else {
- s_buf = malloc(sizeof(char *) * 1);
- r_buf = malloc(sizeof(char *) * 1);
- }
- MPI_CHECK(MPI_Init(&argc, &argv));
- MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &numprocs));
- MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myid));
- if (0 == myid) {
- switch (po_ret) {
- case PO_CUDA_NOT_AVAIL:
- fprintf(stderr, "CUDA support not enabled. Please recompile "
- "benchmark with CUDA support.\n");
- break;
- case PO_OPENACC_NOT_AVAIL:
- fprintf(stderr, "OPENACC support not enabled. Please "
- "recompile benchmark with OPENACC support.\n");
- break;
- case PO_BAD_USAGE:
- print_bad_usage_message(myid);
- break;
- case PO_HELP_MESSAGE:
- print_help_message(myid);
- break;
- case PO_VERSION_MESSAGE:
- print_version_message(myid);
- MPI_CHECK(MPI_Finalize());
- exit(EXIT_SUCCESS);
- case PO_OKAY:
- break;
- }
- }
- switch (po_ret) {
- case PO_CUDA_NOT_AVAIL:
- case PO_OPENACC_NOT_AVAIL:
- case PO_BAD_USAGE:
- MPI_CHECK(MPI_Finalize());
- exit(EXIT_FAILURE);
- case PO_HELP_MESSAGE:
- case PO_VERSION_MESSAGE:
- MPI_CHECK(MPI_Finalize());
- exit(EXIT_SUCCESS);
- case PO_OKAY:
- break;
- }
- if (numprocs != 2) {
- if (myid == 0) {
- fprintf(stderr, "This test requires exactly two processes\n");
- }
- MPI_CHECK(MPI_Finalize());
- exit(EXIT_FAILURE);
- }
- #ifdef _ENABLE_CUDA_
- if (options.src == 'M' || options.dst == 'M') {
- if (options.buf_num == SINGLE) {
- fprintf(stderr, "Warning: Tests involving managed buffers will use multiple buffers by default\n");
- }
- options.buf_num = MULTIPLE;
- }
- #endif
- if (options.buf_num == SINGLE) {
- if (allocate_memory_pt2pt(&s_buf[0], &r_buf[0], myid)) {
- /* Error allocating memory */
- MPI_CHECK(MPI_Finalize());
- exit(EXIT_FAILURE);
- }
- }
- print_header(myid, BW);
- /* Bi-Directional Bandwidth test */
- for (size = options.min_message_size; size <= options.max_message_size; size *= 2) {
- if (options.buf_num == MULTIPLE) {
- for (i = 0; i < window_size; i++) {
- if (allocate_memory_pt2pt_size(&s_buf[i], &r_buf[i], myid, size)) {
- /* Error allocating memory */
- MPI_CHECK(MPI_Finalize());
- exit(EXIT_FAILURE);
- }
- }
-
- /* touch the data */
- for (i = 0; i < window_size; i++) {
- set_buffer_pt2pt(s_buf[i], myid, options.accel, 'a', size);
- set_buffer_pt2pt(r_buf[i], myid, options.accel, 'b', size);
- }
- } else {
- /* touch the data */
- set_buffer_pt2pt(s_buf[0], myid, options.accel, 'a', size);
- set_buffer_pt2pt(r_buf[0], myid, options.accel, 'b', size);
- }
- if (size > LARGE_MESSAGE_SIZE) {
- options.iterations = options.iterations_large;
- options.skip = options.skip_large;
- }
- #ifdef _ENABLE_CUDA_KERNEL_
- if ((options.src == 'M' && options.MMsrc == 'D') || (options.dst == 'M' && options.MMdst == 'D')) {
- t_lo = measure_kernel_lo(s_buf, size, window_size);
- }
- #endif /* #ifdef _ENABLE_CUDA_KERNEL_ */
- MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
- t_total = 0.0;
- for (i = 0; i < options.iterations + options.skip; i++) {
- if (myid == 0) {
- if (i >= options.skip) {
- t_start = MPI_Wtime();
- }
- #ifdef _ENABLE_CUDA_KERNEL_
- if (options.src == 'M') {
- touch_managed_src(s_buf, size, window_size);
- }
- #endif /* #ifdef _ENABLE_CUDA_KERNEL_ */
- for (j = 0; j < window_size; j++) {
- if (options.buf_num == SINGLE) {
- MPI_CHECK(MPI_Irecv(r_buf[0], size, MPI_CHAR, 1, 10, MPI_COMM_WORLD,
- recv_request + j));
- } else {
- MPI_CHECK(MPI_Irecv(r_buf[j], size, MPI_CHAR, 1, 10, MPI_COMM_WORLD,
- recv_request + j));
- }
- }
- for (j = 0; j < window_size; j++) {
- if (options.buf_num == SINGLE) {
- MPI_CHECK(MPI_Isend(s_buf[0], size, MPI_CHAR, 1, 100, MPI_COMM_WORLD,
- send_request + j));
- } else {
- MPI_CHECK(MPI_Isend(s_buf[j], size, MPI_CHAR, 1, 100, MPI_COMM_WORLD,
- send_request + j));
- }
- }
- MPI_CHECK(MPI_Waitall(window_size, send_request, reqstat));
- MPI_CHECK(MPI_Waitall(window_size, recv_request, reqstat));
- #ifdef _ENABLE_CUDA_KERNEL_
- if (options.src == 'M') {
- touch_managed_src(r_buf, size, window_size);
- }
- #endif /* #ifdef _ENABLE_CUDA_KERNEL_ */
- if (i >= options.skip) {
- t_end = MPI_Wtime();
- t_total += calculate_total(t_start, t_end, t_lo, window_size);
- }
- } else {
- #ifdef _ENABLE_CUDA_KERNEL_
- if (options.dst == 'M') {
- touch_managed_dst(s_buf, size, window_size);
- }
- #endif /* #ifdef _ENABLE_CUDA_KERNEL_ */
- for (j = 0; j < window_size; j++) {
- if (options.buf_num == SINGLE) {
- MPI_CHECK(MPI_Irecv(r_buf[0], size, MPI_CHAR, 0, 100, MPI_COMM_WORLD,
- recv_request + j));
- } else {
- MPI_CHECK(MPI_Irecv(r_buf[j], size, MPI_CHAR, 0, 100, MPI_COMM_WORLD,
- recv_request + j));
- }
- }
- for (j = 0; j < window_size; j++) {
- if (options.buf_num == SINGLE) {
- MPI_CHECK(MPI_Isend(s_buf[0], size, MPI_CHAR, 0, 10, MPI_COMM_WORLD,
- send_request + j));
- } else {
- MPI_CHECK(MPI_Isend(s_buf[j], size, MPI_CHAR, 0, 10, MPI_COMM_WORLD,
- send_request + j));
- }
- }
- MPI_CHECK(MPI_Waitall(window_size, recv_request, reqstat));
- #ifdef _ENABLE_CUDA_KERNEL_
- if (options.dst == 'M') {
- touch_managed_dst(r_buf, size, window_size);
- }
- #endif /* #ifdef _ENABLE_CUDA_KERNEL_ */
- MPI_CHECK(MPI_Waitall(window_size, send_request, reqstat));
- }
- }
- if (myid == 0) {
- double tmp = size / 1e6 * options.iterations * window_size * 2;
- fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH,
- FLOAT_PRECISION, tmp / t_total);
- fflush(stdout);
- }
- if (options.buf_num == MULTIPLE) {
- for (i = 0; i < window_size; i++) {
- free_memory(s_buf[i], r_buf[i], myid);
- }
- }
- }
- if (options.buf_num == SINGLE) {
- free_memory(s_buf[0], r_buf[0], myid);
- }
- free(s_buf);
- free(r_buf);
- MPI_CHECK(MPI_Finalize());
- if (NONE != options.accel) {
- if (cleanup_accel()) {
- fprintf(stderr, "Error cleaning up device\n");
- exit(EXIT_FAILURE);
- }
- }
- return EXIT_SUCCESS;
- }
- #ifdef _ENABLE_CUDA_KERNEL_
/*
 * Estimates the per-launch cost of an empty kernel.
 *
 * Performs 10 untimed warm-up launches, then 1000 launches that are each
 * timed together with the following synchronize_stream() call, cycling
 * through the first window_size entries of buf.
 *
 * buf:         table of buffers handed to launch_empty_kernel
 * size:        size argument forwarded to launch_empty_kernel
 * window_size: number of usable entries in buf (must be non-zero, it is
 *              used as a modulus)
 *
 * Returns the mean of the 1000 timed measurements, in MPI_Wtime() units.
 */
double
measure_kernel_lo(char **buf, int size, int window_size)
{
    enum { WARMUP_LAUNCHES = 10, TIMED_LAUNCHES = 1000 };
    double accumulated = 0.0;
    int iter;

    /* Warmup */
    for (iter = 0; iter < WARMUP_LAUNCHES; iter++) {
        launch_empty_kernel(buf[iter % window_size], size);
    }

    for (iter = 0; iter < TIMED_LAUNCHES; iter++) {
        double begin, finish;

        begin = MPI_Wtime();
        launch_empty_kernel(buf[iter % window_size], size);
        synchronize_stream();
        finish = MPI_Wtime();

        accumulated = accumulated + (finish - begin);
    }

    /* Averaging the kernel launch overhead */
    return accumulated / TIMED_LAUNCHES;
}
- void
- touch_managed_src(char **buf, int size, int window_size)
- {
- int j;
- if (options.src == 'M') {
- if (options.MMsrc == 'D') {
- for (j = 0; j < window_size; j++) {
- touch_managed(buf[j], size);
- synchronize_stream();
- }
- } else if ((options.MMsrc == 'H') && size > PREFETCH_THRESHOLD) {
- for (j = 0; j < window_size; j++) {
- prefetch_data(buf[j], size, -1);
- synchronize_stream();
- }
- } else {
- for (j = 0; j < window_size; j++) {
- memset(buf[j], 'c', size);
- }
- }
- }
- }
- void
- touch_managed_dst(char **buf, int size, int window_size)
- {
- int j;
- if (options.dst == 'M') {
- if (options.MMdst == 'D') {
- for (j = 0; j < window_size; j++) {
- touch_managed(buf[j], size);
- synchronize_stream();
- }
- } else if ((options.MMdst == 'H') && size > PREFETCH_THRESHOLD) {
- for (j = 0; j < window_size; j++) {
- prefetch_data(buf[j], size, -1);
- synchronize_stream();
- }
- } else {
- for (j = 0; j < window_size; j++) {
- memset(buf[j], 'c', size);
- }
- }
- }
- }
- #endif /* #ifdef _ENABLE_CUDA_KERNEL_ */
- double calculate_total(double t_start, double t_end, double t_lo, int window_size)
- {
- double t_total;
- if ((options.src == 'M' && options.MMsrc == 'D') &&
- (options.dst == 'M' && options.MMdst == 'D')) {
- t_total = ((t_end - t_start) - (2 * t_lo * window_size));
- } else if ((options.src == 'M' && options.MMsrc == 'D') ||
- (options.dst == 'M' && options.MMdst == 'D')) {
- t_total = ((t_end - t_start) - (t_lo * window_size));
- } else {
- t_total = (t_end - t_start);
- }
- return t_total;
- }
|