osu_bibw.c

#define BENCHMARK "OSU MPI%s Bi-Directional Bandwidth Test"
/*
 * Copyright (C) 2002-2021 the Network-Based Computing Laboratory
 * (NBCL), The Ohio State University.
 *
 * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu)
 *
 * For detailed copyright and licensing information, please refer to the
 * copyright file COPYRIGHT in the top level OMB directory.
 */
#include <osu_util_mpi.h>

#ifdef _ENABLE_CUDA_KERNEL_
double measure_kernel_lo(char **, int, int);
void touch_managed_src(char **, int, int);
void touch_managed_dst(char **, int, int);
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */

double calculate_total(double, double, double, int);
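
/*
 * Each of the two ranks posts a window of non-blocking receives followed by
 * a window of non-blocking sends per iteration, then waits on both, so data
 * flows in both directions simultaneously; rank 0 times the loop and
 * reports the aggregate bandwidth.
 */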
int main(int argc, char *argv[])
{
    int myid, numprocs, i, j;
    int size;
    char **s_buf, **r_buf;
    double t_start = 0.0, t_end = 0.0, t_lo = 0.0, t_total = 0.0;
    int window_size = 64;
    int po_ret = 0;

    options.bench = PT2PT;
    options.subtype = BW;

    set_header(HEADER);
    set_benchmark_name("osu_bibw");

    po_ret = process_options(argc, argv);

    if (PO_OKAY == po_ret && NONE != options.accel) {
        if (init_accel()) {
            fprintf(stderr, "Error initializing device\n");
            exit(EXIT_FAILURE);
        }
    }

    window_size = options.window_size;
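
    /*
     * MULTIPLE mode allocates a distinct send/receive buffer pair for each
     * window slot (required for the managed-memory paths below); SINGLE
     * mode reuses one pair for every message in the window.
     */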
    if (options.buf_num == MULTIPLE) {
        s_buf = malloc(sizeof(char *) * window_size);
        r_buf = malloc(sizeof(char *) * window_size);
    } else {
        s_buf = malloc(sizeof(char *) * 1);
        r_buf = malloc(sizeof(char *) * 1);
    }

    MPI_CHECK(MPI_Init(&argc, &argv));
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &numprocs));
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myid));

    if (0 == myid) {
        switch (po_ret) {
            case PO_CUDA_NOT_AVAIL:
                fprintf(stderr, "CUDA support not enabled. Please recompile "
                                "benchmark with CUDA support.\n");
                break;
            case PO_OPENACC_NOT_AVAIL:
                fprintf(stderr, "OPENACC support not enabled. Please "
                                "recompile benchmark with OPENACC support.\n");
                break;
            case PO_BAD_USAGE:
                print_bad_usage_message(myid);
                break;
            case PO_HELP_MESSAGE:
                print_help_message(myid);
                break;
            case PO_VERSION_MESSAGE:
                print_version_message(myid);
                MPI_CHECK(MPI_Finalize());
                exit(EXIT_SUCCESS);
            case PO_OKAY:
                break;
        }
    }

    switch (po_ret) {
        case PO_CUDA_NOT_AVAIL:
        case PO_OPENACC_NOT_AVAIL:
        case PO_BAD_USAGE:
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_FAILURE);
        case PO_HELP_MESSAGE:
        case PO_VERSION_MESSAGE:
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_SUCCESS);
        case PO_OKAY:
            break;
    }

    if (numprocs != 2) {
        if (myid == 0) {
            fprintf(stderr, "This test requires exactly two processes\n");
        }

        MPI_CHECK(MPI_Finalize());
        exit(EXIT_FAILURE);
    }

#ifdef _ENABLE_CUDA_
    if (options.src == 'M' || options.dst == 'M') {
        if (options.buf_num == SINGLE) {
            fprintf(stderr, "Warning: Tests involving managed buffers will "
                            "use multiple buffers by default\n");
        }
        options.buf_num = MULTIPLE;
    }
#endif

    if (options.buf_num == SINGLE) {
        if (allocate_memory_pt2pt(&s_buf[0], &r_buf[0], myid)) {
            /* Error allocating memory */
            MPI_CHECK(MPI_Finalize());
            exit(EXIT_FAILURE);
        }
    }

    print_header(myid, BW);

    /* Bi-Directional Bandwidth test */
    for (size = options.min_message_size; size <= options.max_message_size;
         size *= 2) {
        if (options.buf_num == MULTIPLE) {
            for (i = 0; i < window_size; i++) {
                if (allocate_memory_pt2pt_size(&s_buf[i], &r_buf[i], myid,
                                               size)) {
                    /* Error allocating memory */
                    MPI_CHECK(MPI_Finalize());
                    exit(EXIT_FAILURE);
                }
            }

            /* touch the data */
            for (i = 0; i < window_size; i++) {
                set_buffer_pt2pt(s_buf[i], myid, options.accel, 'a', size);
                set_buffer_pt2pt(r_buf[i], myid, options.accel, 'b', size);
            }
        } else {
            /* touch the data */
            set_buffer_pt2pt(s_buf[0], myid, options.accel, 'a', size);
            set_buffer_pt2pt(r_buf[0], myid, options.accel, 'b', size);
        }

        if (size > LARGE_MESSAGE_SIZE) {
            options.iterations = options.iterations_large;
            options.skip = options.skip_large;
        }
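
        /*
         * When a managed buffer is touched from the device, every window
         * includes kernel launches; measure that launch overhead up front
         * so calculate_total() can subtract it from the timed loop.
         */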
#ifdef _ENABLE_CUDA_KERNEL_
        if ((options.src == 'M' && options.MMsrc == 'D') ||
            (options.dst == 'M' && options.MMdst == 'D')) {
            t_lo = measure_kernel_lo(s_buf, size, window_size);
        }
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */

        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
        t_total = 0.0;

        for (i = 0; i < options.iterations + options.skip; i++) {
            if (myid == 0) {
                if (i >= options.skip) {
                    t_start = MPI_Wtime();
                }

#ifdef _ENABLE_CUDA_KERNEL_
                if (options.src == 'M') {
                    touch_managed_src(s_buf, size, window_size);
                }
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */

                for (j = 0; j < window_size; j++) {
                    if (options.buf_num == SINGLE) {
                        MPI_CHECK(MPI_Irecv(r_buf[0], size, MPI_CHAR, 1, 10,
                                            MPI_COMM_WORLD, recv_request + j));
                    } else {
                        MPI_CHECK(MPI_Irecv(r_buf[j], size, MPI_CHAR, 1, 10,
                                            MPI_COMM_WORLD, recv_request + j));
                    }
                }

                for (j = 0; j < window_size; j++) {
                    if (options.buf_num == SINGLE) {
                        MPI_CHECK(MPI_Isend(s_buf[0], size, MPI_CHAR, 1, 100,
                                            MPI_COMM_WORLD, send_request + j));
                    } else {
                        MPI_CHECK(MPI_Isend(s_buf[j], size, MPI_CHAR, 1, 100,
                                            MPI_COMM_WORLD, send_request + j));
                    }
                }

                MPI_CHECK(MPI_Waitall(window_size, send_request, reqstat));
                MPI_CHECK(MPI_Waitall(window_size, recv_request, reqstat));

#ifdef _ENABLE_CUDA_KERNEL_
                if (options.src == 'M') {
                    touch_managed_src(r_buf, size, window_size);
                }
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */

                if (i >= options.skip) {
                    t_end = MPI_Wtime();
                    t_total += calculate_total(t_start, t_end, t_lo,
                                               window_size);
                }
            } else {
#ifdef _ENABLE_CUDA_KERNEL_
                if (options.dst == 'M') {
                    touch_managed_dst(s_buf, size, window_size);
                }
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */

                for (j = 0; j < window_size; j++) {
                    if (options.buf_num == SINGLE) {
                        MPI_CHECK(MPI_Irecv(r_buf[0], size, MPI_CHAR, 0, 100,
                                            MPI_COMM_WORLD, recv_request + j));
                    } else {
                        MPI_CHECK(MPI_Irecv(r_buf[j], size, MPI_CHAR, 0, 100,
                                            MPI_COMM_WORLD, recv_request + j));
                    }
                }

                for (j = 0; j < window_size; j++) {
                    if (options.buf_num == SINGLE) {
                        MPI_CHECK(MPI_Isend(s_buf[0], size, MPI_CHAR, 0, 10,
                                            MPI_COMM_WORLD, send_request + j));
                    } else {
                        MPI_CHECK(MPI_Isend(s_buf[j], size, MPI_CHAR, 0, 10,
                                            MPI_COMM_WORLD, send_request + j));
                    }
                }

                MPI_CHECK(MPI_Waitall(window_size, recv_request, reqstat));

#ifdef _ENABLE_CUDA_KERNEL_
                if (options.dst == 'M') {
                    touch_managed_dst(r_buf, size, window_size);
                }
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */

                MPI_CHECK(MPI_Waitall(window_size, send_request, reqstat));
            }
        }
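
        /*
         * Bytes moved per measured iteration: size * window_size in each
         * direction, hence the factor of 2; dividing the MB total by the
         * accumulated time yields MB/s.
         */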
        if (myid == 0) {
            double tmp = size / 1e6 * options.iterations * window_size * 2;

            fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH,
                    FLOAT_PRECISION, tmp / t_total);
            fflush(stdout);
        }

        if (options.buf_num == MULTIPLE) {
            for (i = 0; i < window_size; i++) {
                free_memory(s_buf[i], r_buf[i], myid);
            }
        }
    }

    if (options.buf_num == SINGLE) {
        free_memory(s_buf[0], r_buf[0], myid);
    }
    free(s_buf);
    free(r_buf);

    MPI_CHECK(MPI_Finalize());

    if (NONE != options.accel) {
        if (cleanup_accel()) {
            fprintf(stderr, "Error cleaning up device\n");
            exit(EXIT_FAILURE);
        }
    }

    return EXIT_SUCCESS;
}

#ifdef _ENABLE_CUDA_KERNEL_
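/*
 * Estimate the cost of one empty kernel launch plus stream synchronization
 * by averaging over 1000 timed launches (after 10 warmup launches).
 */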
double
measure_kernel_lo(char **buf, int size, int window_size)
{
    int i;
    double t_lo = 0.0, t_start, t_end;

    for (i = 0; i < 10; i++) {
        launch_empty_kernel(buf[i % window_size], size); /* warmup */
    }

    for (i = 0; i < 1000; i++) {
        t_start = MPI_Wtime();
        launch_empty_kernel(buf[i % window_size], size);
        synchronize_stream();
        t_end = MPI_Wtime();
        t_lo = t_lo + (t_end - t_start);
    }

    t_lo = t_lo / 1000; /* average the kernel launch overhead */
    return t_lo;
}
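
/*
 * touch_managed_src()/touch_managed_dst() migrate managed buffers to where
 * the next access expects them, keyed on the source/destination placement:
 * 'D' touches the buffer from the device, 'H' prefetches large buffers back
 * to the host (device id -1), and otherwise a host-side memset suffices.
 */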
void
touch_managed_src(char **buf, int size, int window_size)
{
    int j;

    if (options.src == 'M') {
        if (options.MMsrc == 'D') {
            for (j = 0; j < window_size; j++) {
                touch_managed(buf[j], size);
                synchronize_stream();
            }
        } else if ((options.MMsrc == 'H') && size > PREFETCH_THRESHOLD) {
            for (j = 0; j < window_size; j++) {
                prefetch_data(buf[j], size, -1);
                synchronize_stream();
            }
        } else {
            for (j = 0; j < window_size; j++) {
                memset(buf[j], 'c', size);
            }
        }
    }
}

void
touch_managed_dst(char **buf, int size, int window_size)
{
    int j;

    if (options.dst == 'M') {
        if (options.MMdst == 'D') {
            for (j = 0; j < window_size; j++) {
                touch_managed(buf[j], size);
                synchronize_stream();
            }
        } else if ((options.MMdst == 'H') && size > PREFETCH_THRESHOLD) {
            for (j = 0; j < window_size; j++) {
                prefetch_data(buf[j], size, -1);
                synchronize_stream();
            }
        } else {
            for (j = 0; j < window_size; j++) {
                memset(buf[j], 'c', size);
            }
        }
    }
}
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */
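
/*
 * The timed window includes any managed-memory touch kernels, so subtract
 * the measured launch overhead once per window for each side that uses a
 * managed buffer touched from the device (twice when both sides do).
 */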
double calculate_total(double t_start, double t_end, double t_lo,
                       int window_size)
{
    double t_total;

    if ((options.src == 'M' && options.MMsrc == 'D') &&
        (options.dst == 'M' && options.MMdst == 'D')) {
        t_total = ((t_end - t_start) - (2 * t_lo * window_size));
    } else if ((options.src == 'M' && options.MMsrc == 'D') ||
               (options.dst == 'M' && options.MMdst == 'D')) {
        t_total = ((t_end - t_start) - (t_lo * window_size));
    } else {
        t_total = (t_end - t_start);
    }

    return t_total;
}
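
/*
 * Typical invocation (a sketch; the exact launcher and options depend on
 * the build and MPI installation):
 *
 *   mpirun -np 2 ./osu_bibw        # host-to-host
 *   mpirun -np 2 ./osu_bibw D D    # device buffers, accelerator-enabled build
 *
 * Message sizes are swept from options.min_message_size to
 * options.max_message_size, doubling each step.
 */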