#pragma once

#include <string.h>
#include <omp.h>    // omp_in_parallel() is used below; possibly already provided via "nse-sys.h"

#include "nse-sys.h"
#include "nse-alloc.h"

#ifndef EXCLUDE_GPU_BRANCH
#include "grid-common2d.cuh"
#include "cuda-stx.cuh"
#endif

namespace nse {

// * null halo cells * //
// ------------------- //
template< memType mem = memCPU, typename T >
void null_ghost_halo(T* _RESTRICT x,
    const int nx, const int ny, const int gcx, const int gcy);

template< memType mem = memCPU, typename T >
void null_halo(T* _RESTRICT x, const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je);

// * apply -x, -y periodicity * //
// ---------------------------- //
template< memType mem = memCPU, typename T >
void apply_periodic_x(T* _RESTRICT x,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy);

template< memType mem = memCPU, typename T >
void apply_periodic_y(T* _RESTRICT x,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy);

// * apply -x, -y periodicity - colored * //
// -------------------------------------- //
template< memType mem = memCPU, typename T >
void apply_periodic_x(T* _RESTRICT x, const int color,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy);

template< memType mem = memCPU, typename T >
void apply_periodic_y(T* _RESTRICT x, const int color,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy);

// * get 2d sub array //
// ------------------ //
template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
void get_sub_array(const T* _RESTRICT const in,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    T* _RESTRICT out);

// * put 2d sub array //
// ------------------ //
template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
void put_sub_array(T* _RESTRICT out,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    const T* _RESTRICT const in);

// * get 2d sub array - colored //
// ---------------------------- //
template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
void get_sub_array(const T* _RESTRICT const in, const int color,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    T* _RESTRICT out);

// * put 2d sub array - colored //
// ---------------------------- //
template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
void put_sub_array(T* _RESTRICT out, const int color,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    const T* _RESTRICT const in);

// * copy 2d sub array //
// ------------------- //
template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
void copy_sub_array(T* _RESTRICT out,
    const int nx, const int ny, const int posx, const int posy,
    const T* _RESTRICT const in,
    const int subnx, const int subny,
    const int ib, const int ie, const int jb, const int je);

// * get number of colored elements //
// -------------------------------- //
int get_num_colored(const int color,
    const int ib, const int ie, const int jb, const int je);
}
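// ------------------------------------------------------------------------ //
// Illustrative sketch only (not part of the interface): typical host-side
// use of the halo routines declared above. The field size, ghost-cell widths
// and halo width below are assumptions made up for the example; the array
// uses the row-major layout x[i * ny + j] assumed throughout this header.
// ------------------------------------------------------------------------ //
inline void example_halo_usage()
{
    const int nx = 8, ny = 6;      // full array size, ghost cells included
    const int gcx = 2, gcy = 1;    // ghost-cell widths in -x and -y

    double x[nx * ny];
    for (int k = 0; k < nx * ny; k++) x[k] = 1.0;

    // zero all ghost cells, keep the interior untouched //
    nse::null_ghost_halo(x, nx, ny, gcx, gcy);

    // fill one ghost column on each -x side from the periodic image //
    nse::apply_periodic_x(x, nx, ny, gcx, gcy, 1, 0);
}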
namespace nse {

// * null halo cells * //
// ------------------- //
template< typename T >
void null_ghost_halo_omp(T* _RESTRICT x,
    const int nx, const int ny, const int gcx, const int gcy);

template< typename T >
void null_halo_omp(T* _RESTRICT x, const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je);

// * apply -x, -y periodicity * //
// ---------------------------- //
template< typename T >
void apply_periodic_x_omp(T* _RESTRICT x,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy);

template< typename T >
void apply_periodic_y_omp(T* _RESTRICT x,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy);

// * apply -x, -y periodicity - colored * //
// -------------------------------------- //
template< typename T >
void apply_periodic_x_omp(T* _RESTRICT x, const int color,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy);

template< typename T >
void apply_periodic_y_omp(T* _RESTRICT x, const int color,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy);

// * get 2d sub array //
// ------------------ //
template< typename T >
void get_sub_array_omp(const T* _RESTRICT const in,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    T* _RESTRICT out);

// * put 2d sub array //
// ------------------ //
template< typename T >
void put_sub_array_omp(T* _RESTRICT out,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    const T* _RESTRICT const in);

// * get 2d sub array - colored //
// ---------------------------- //
template< typename T >
void get_sub_array_omp(const T* _RESTRICT const in, const int color,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    T* _RESTRICT out);

// * put 2d sub array - colored //
// ---------------------------- //
template< typename T >
void put_sub_array_omp(T* _RESTRICT out, const int color,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    const T* _RESTRICT const in);

// * copy 2d sub array //
// ------------------- //
template< typename T >
void copy_sub_array_omp(T* _RESTRICT out,
    const int nx, const int ny, const int posx, const int posy,
    const T* _RESTRICT const in,
    const int subnx, const int subny,
    const int ib, const int ie, const int jb, const int je);
}

// * implementation: null halo cells * //
// ----------------------------------- //
template< nse::memType mem, typename T >
inline void nse::null_ghost_halo(
    T* _RESTRICT x, const int nx, const int ny, const int gcx, const int gcy)
{
#ifndef EXCLUDE_GPU_BRANCH
    if (mem == memGPU)
        nse_gpu::null_ghost_halo(x, nx, ny, gcx, gcy);
    else
#endif
    {   // memCPU //
        if (omp_in_parallel()) {
            null_ghost_halo_omp(x, nx, ny, gcx, gcy);
        }
        else
        {
#pragma omp parallel shared( x )
            {
                null_ghost_halo_omp(x, nx, ny, gcx, gcy);
            }
        }
    }
}

template< typename T >
void nse::null_ghost_halo_omp(
    T* _RESTRICT x, const int nx, const int ny, const int gcx, const int gcy)
{
    int i, j, idx;

    // null column: west //
#pragma omp for nowait
    for (i = 0; i < gcx; i++) {
        idx = i * ny;
        for (j = 0; j < ny; j++)
            x[idx + j] = (T)0;
    }

    // null column: east //
#pragma omp for nowait
    for (i = 0; i < gcx; i++) {
        idx = (nx - gcx + i) * ny;
        for (j = 0; j < ny; j++) {
            x[idx + j] = (T)0;
        }
    }

    // null rows: south //
#pragma omp for nowait
    for (i = gcx; i < nx - gcx; i++) {
        idx = i * ny;
        for (j = 0; j < gcy; j++)
            x[idx + j] = (T)0;
    }

    // null rows: north //
#pragma omp for nowait
    for (i = gcx; i < nx - gcx; i++) {
        idx = i * ny;
        for (j = ny - gcy; j < ny; j++)
            x[idx + j] = (T)0;
    }
}
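// ------------------------------------------------------------------------ //
// Illustrative sketch only: which cells null_ghost_halo() clears. The sizes
// below are assumptions made up for the example. A cell (i, j) belongs to
// the ghost halo when i < gcx, i >= nx - gcx, j < gcy or j >= ny - gcy;
// exactly those cells are set to zero, the interior block is left untouched.
// ------------------------------------------------------------------------ //
inline bool example_null_ghost_halo_check()
{
    const int nx = 7, ny = 5, gcx = 2, gcy = 1;

    double x[nx * ny];
    for (int k = 0; k < nx * ny; k++) x[k] = 1.0;

    nse::null_ghost_halo(x, nx, ny, gcx, gcy);

    bool ok = true;
    for (int i = 0; i < nx; i++)
    for (int j = 0; j < ny; j++) {
        const bool ghost =
            (i < gcx) || (i >= nx - gcx) || (j < gcy) || (j >= ny - gcy);
        ok = ok && (x[i * ny + j] == (ghost ? 0.0 : 1.0));
    }
    return ok;    // expected: true
}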
template< nse::memType mem, typename T >
inline void nse::null_halo(
    T* _RESTRICT x, const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je)
{
#ifndef EXCLUDE_GPU_BRANCH
    if (mem == memGPU)
        nse_gpu::null_halo(x, nx, ny, ib, ie, jb, je);
    else
#endif
    {   // memCPU //
        if (omp_in_parallel()) {
            null_halo_omp(x, nx, ny, ib, ie, jb, je);
        }
        else
        {
#pragma omp parallel shared( x )
            {
                null_halo_omp(x, nx, ny, ib, ie, jb, je);
            }
        }
    }
}

template< typename T >
void nse::null_halo_omp(
    T* _RESTRICT x, const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je)
{
    int i, j, idx;

    // null column: west //
#pragma omp for nowait
    for (i = 0; i < ib; i++) {
        idx = i * ny;
        for (j = 0; j < ny; j++)
            x[idx + j] = (T)0;
    }

    // null column: east //
#pragma omp for nowait
    for (i = ie + 1; i < nx; i++) {
        idx = i * ny;
        for (j = 0; j < ny; j++) {
            x[idx + j] = (T)0;
        }
    }

    // null rows: south //
#pragma omp for nowait
    for (i = ib; i <= ie; i++) {
        idx = i * ny;
        for (j = 0; j < jb; j++)
            x[idx + j] = (T)0;
    }

    // null rows: north //
#pragma omp for nowait
    for (i = ib; i <= ie; i++) {
        idx = i * ny;
        for (j = je + 1; j < ny; j++)
            x[idx + j] = (T)0;
    }
}

// * apply -x, -y periodicity * //
// ---------------------------- //
template< nse::memType mem, typename T >
inline void nse::apply_periodic_x(T* _RESTRICT x,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy)
{
#ifndef EXCLUDE_GPU_BRANCH
    if (mem == memGPU)
        nse_gpu::apply_periodic_x(x, nx, ny, gcx, gcy, hx, hy);
    else
#endif
    {   // memCPU //
        if (omp_in_parallel()) {
            apply_periodic_x_omp(x, nx, ny, gcx, gcy, hx, hy);
        }
        else
        {
#pragma omp parallel shared( x )
            {
                apply_periodic_x_omp(x, nx, ny, gcx, gcy, hx, hy);
            }
        }
    }
}

template< nse::memType mem, typename T >
inline void nse::apply_periodic_y(T* _RESTRICT x,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy)
{
#ifndef EXCLUDE_GPU_BRANCH
    if (mem == memGPU)
        nse_gpu::apply_periodic_y(x, nx, ny, gcx, gcy, hx, hy);
    else
#endif
    {   // memCPU //
        if (omp_in_parallel()) {
            apply_periodic_y_omp(x, nx, ny, gcx, gcy, hx, hy);
        }
        else
        {
#pragma omp parallel shared( x )
            {
                apply_periodic_y_omp(x, nx, ny, gcx, gcy, hx, hy);
            }
        }
    }
}

template< typename T >
void nse::apply_periodic_x_omp(T* _RESTRICT x,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy)
{
    const int stride = (nx - 2 * gcx) * ny;
    const int shx = hx * ny;
    const int jb = gcy - hy, je = ny - gcy + hy - 1;
    const int block_size = (je - jb + 1) * sizeof(T);

    int i, j, idx;

    if (block_size < MIN_MEMCPY_BLOCK) {    // block too small for memcpy() to pay off //

#pragma omp for nowait
        for (i = gcx - hx; i < gcx; i++) {    // west periodicity //
            idx = i * ny;
            for (j = jb; j <= je; j++)
                x[idx + j] = x[idx + stride + j];
        }

#pragma omp for nowait
        for (i = gcx - hx; i < gcx; i++) {    // east periodicity //
            idx = i * ny + stride + shx;
            for (j = jb; j <= je; j++)
                x[idx + j] = x[idx - stride + j];
        }
    }
    else
    {
#pragma omp for nowait
        for (i = gcx - hx; i < gcx; i++) {    // west periodicity //
            idx = i * ny + jb;
            memcpy(&x[idx], &x[idx + stride], block_size);
        }

#pragma omp for nowait
        for (i = gcx - hx; i < gcx; i++) {    // east periodicity //
            idx = i * ny + jb + shx;
            memcpy(&x[idx + stride], &x[idx], block_size);
        }
    }
}

template< typename T >
void nse::apply_periodic_y_omp(T* _RESTRICT x,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy)
{
    const int stride = (ny - 2 * gcy);
    const int ib = gcx - hx, ie = nx - gcx + hx - 1;

    int i, j, idx;

#pragma omp for nowait
    for (i = ib; i <= ie; i++) {    // south periodicity //
        idx = i * ny;
        for (j = gcy - hy; j < gcy; j++)
            x[idx + j] = x[idx + stride + j];
    }

#pragma omp for nowait
    for (i = ib; i <= ie; i++) {    // north periodicity //
        idx = i * ny;
        for (j = ny - gcy; j < ny - gcy + hy; j++)
            x[idx + j] = x[idx - stride + j];
    }
}
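// ------------------------------------------------------------------------ //
// Illustrative sketch only: the column mapping realized by apply_periodic_x().
// Sizes are assumptions made up for the example. With hx ghost columns
// filled, the west ghost columns [gcx - hx, gcx) receive the last interior
// columns and the east ghost columns [nx - gcx, nx - gcx + hx) the first ones.
// ------------------------------------------------------------------------ //
inline bool example_periodic_x_check()
{
    const int nx = 9, ny = 4, gcx = 2, gcy = 1, hx = 2, hy = 0;

    double x[nx * ny];
    for (int i = 0; i < nx; i++)
    for (int j = 0; j < ny; j++)
        x[i * ny + j] = (double)i;    // mark each column by its index

    nse::apply_periodic_x(x, nx, ny, gcx, gcy, hx, hy);

    // west ghost column gcx - 1 now mirrors interior column nx - gcx - 1,
    // east ghost column nx - gcx now mirrors interior column gcx
    const int jc = gcy;    // any row inside [gcy - hy, ny - gcy + hy)
    return (x[(gcx - 1) * ny + jc] == (double)(nx - gcx - 1)) &&
           (x[(nx - gcx) * ny + jc] == (double)gcx);    // expected: true
}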
// * apply -x, -y periodicity - colored * //
// -------------------------------------- //
template< nse::memType mem, typename T >
inline void nse::apply_periodic_x(T* _RESTRICT x, const int color,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy)
{
#ifndef EXCLUDE_GPU_BRANCH
    if (mem == memGPU)
        nse_gpu::apply_periodic_x(x, color, nx, ny, gcx, gcy, hx, hy);
    else
#endif
    {   // memCPU //
        if (omp_in_parallel()) {
            apply_periodic_x_omp(x, color, nx, ny, gcx, gcy, hx, hy);
        }
        else
        {
#pragma omp parallel shared( x )
            {
                apply_periodic_x_omp(x, color, nx, ny, gcx, gcy, hx, hy);
            }
        }
    }
}

template< nse::memType mem, typename T >
inline void nse::apply_periodic_y(T* _RESTRICT x, const int color,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy)
{
#ifndef EXCLUDE_GPU_BRANCH
    if (mem == memGPU)
        nse_gpu::apply_periodic_y(x, color, nx, ny, gcx, gcy, hx, hy);
    else
#endif
    {   // memCPU //
        if (omp_in_parallel()) {
            apply_periodic_y_omp(x, color, nx, ny, gcx, gcy, hx, hy);
        }
        else
        {
#pragma omp parallel shared( x )
            {
                apply_periodic_y_omp(x, color, nx, ny, gcx, gcy, hx, hy);
            }
        }
    }
}

template< typename T >
void nse::apply_periodic_x_omp(T* _RESTRICT x, const int color,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy)
{
    const int stride = (nx - 2 * gcx) * ny;
    const int ish = hx + nx - 2 * gcx;
    const int shx = hx * ny;
    const int jb = gcy - hy, je = ny - gcy + hy - 1;

    int i, j, idx, csh;

#pragma omp for nowait
    for (i = gcx - hx; i < gcx; i++) {    // west periodicity //
        csh = (i + jb + color) & 1;
        idx = i * ny;
        for (j = jb + csh; j <= je; j += 2)
            x[idx + j] = x[idx + stride + j];
    }

#pragma omp for nowait
    for (i = gcx - hx; i < gcx; i++) {    // east periodicity //
        csh = (i + ish + jb + color) & 1;
        idx = i * ny + stride + shx;
        for (j = jb + csh; j <= je; j += 2)
            x[idx + j] = x[idx - stride + j];
    }
}

template< typename T >
void nse::apply_periodic_y_omp(T* _RESTRICT x, const int color,
    const int nx, const int ny, const int gcx, const int gcy,
    const int hx, const int hy)
{
    const int stride = (ny - 2 * gcy);
    const int ib = gcx - hx, ie = nx - gcx + hx - 1;

    int i, j, idx, csh;

#pragma omp for nowait
    for (i = ib; i <= ie; i++) {    // south periodicity //
        idx = i * ny;
        csh = (i + gcy - hy + color) & 1;
        for (j = gcy - hy + csh; j < gcy; j += 2)
            x[idx + j] = x[idx + stride + j];
    }

#pragma omp for nowait
    for (i = ib; i <= ie; i++) {    // north periodicity //
        idx = i * ny;
        csh = (i + ny - gcy + color) & 1;
        for (j = ny - gcy + csh; j < ny - gcy + hy; j += 2)
            x[idx + j] = x[idx - stride + j];
    }
}
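// ------------------------------------------------------------------------ //
// Illustrative note: as the csh shifts above suggest, a cell (i, j) is
// treated as belonging to a given color when (i + j + color) is even; the
// colored routines touch only those cells. A minimal sketch of that
// predicate (not part of the interface, added for illustration):
// ------------------------------------------------------------------------ //
inline bool example_is_colored_cell(const int i, const int j, const int color)
{
    return ((i + j + color) & 1) == 0;    // red-black (checkerboard) split
}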
// * implementation: get 2d sub array * //
// ------------------------------------ //
template< nse::memType memIN, nse::memType memOUT, typename T >
inline void nse::get_sub_array(const T* _RESTRICT const in,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    T* _RESTRICT out)
{
#ifndef EXCLUDE_GPU_BRANCH
    if (((memIN == memCPU) && (memOUT == memGPU)) ||
        ((memIN == memGPU) && (memOUT == memCPU)))
    {
        const int c_size = (ie - ib + 1) * (je - jb + 1);

        T* buf;
        int buf_id;
        if (memIN == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
        if (memIN == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);

        get_sub_array<memIN, memIN>(in, nx, ny, ib, ie, jb, je, buf);
        mcopy<memOUT, memIN>(out, buf, c_size);

        if (memIN == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
        if (memIN == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
    }
    else
    if ((memIN == memGPU) && (memOUT == memGPU)) {
        nse_gpu::get_sub_array(in, nx, ny, ib, ie, jb, je, out);
    }
    else
#endif
    {   // memCPU -> memCPU //
        if (omp_in_parallel()) {
            get_sub_array_omp(in, nx, ny, ib, ie, jb, je, out);
        }
        else
        {
#pragma omp parallel shared( out )
            {
                get_sub_array_omp(in, nx, ny, ib, ie, jb, je, out);
            }
        }
    }
}

template< typename T >
void nse::get_sub_array_omp(const T* _RESTRICT const in,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    T* _RESTRICT out)
{
    const int cy = je - jb + 1;
    const int block_size = cy * sizeof(T);

    int i, j, idx, odx;

    if (block_size < MIN_MEMCPY_BLOCK) {    // block too small for memcpy() to pay off //

#pragma omp for nowait
        for (i = ib; i <= ie; i++) {
            idx = i * ny;
            odx = (i - ib) * cy - jb;
            for (j = jb; j <= je; j++)
                out[odx + j] = in[idx + j];
        }
    }
    else
    {
#pragma omp for nowait
        for (i = ib; i <= ie; i++) {
            idx = i * ny + jb;
            odx = (i - ib) * cy;
            memcpy(&out[odx], &in[idx], block_size);
        }
    }
}

// * implementation: put 2d sub array * //
// ------------------------------------ //
template< nse::memType memOUT, nse::memType memIN, typename T >
inline void nse::put_sub_array(T* _RESTRICT out,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    const T* _RESTRICT const in)
{
#ifndef EXCLUDE_GPU_BRANCH
    if (((memIN == memCPU) && (memOUT == memGPU)) ||
        ((memIN == memGPU) && (memOUT == memCPU)))
    {
        const int c_size = (ie - ib + 1) * (je - jb + 1);

        T* buf;
        int buf_id;
        if (memIN == memCPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
        if (memIN == memGPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);

        mcopy<memOUT, memIN>(buf, in, c_size);
        put_sub_array<memOUT, memOUT>(out, nx, ny, ib, ie, jb, je, buf);

        if (memIN == memCPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
        if (memIN == memGPU) nse_gpu::cudaStx::free_host_buf(buf_id);
    }
    else
    if ((memIN == memGPU) && (memOUT == memGPU)) {
        nse_gpu::put_sub_array(out, nx, ny, ib, ie, jb, je, in);
    }
    else
#endif
    {   // memCPU <- memCPU //
        if (omp_in_parallel()) {
            put_sub_array_omp(out, nx, ny, ib, ie, jb, je, in);
        }
        else
        {
#pragma omp parallel shared( out )
            {
                put_sub_array_omp(out, nx, ny, ib, ie, jb, je, in);
            }
        }
    }
}

template< typename T >
void nse::put_sub_array_omp(T* _RESTRICT out,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    const T* _RESTRICT const in)
{
    const int cy = je - jb + 1;
    const int block_size = cy * sizeof(T);

    int i, j, idx, odx;

    if (block_size < MIN_MEMCPY_BLOCK) {    // block too small for memcpy() to pay off //

#pragma omp for nowait
        for (i = ib; i <= ie; i++) {
            idx = (i - ib) * cy - jb;
            odx = i * ny;
            for (j = jb; j <= je; j++)
                out[odx + j] = in[idx + j];
        }
    }
    else
    {
#pragma omp for nowait
        for (i = ib; i <= ie; i++) {
            idx = (i - ib) * cy;
            odx = i * ny + jb;
            memcpy(&out[odx], &in[idx], block_size);
        }
    }
}
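// ------------------------------------------------------------------------ //
// Illustrative sketch only: a get_sub_array() / put_sub_array() round trip
// on the host. The sizes and index range below are assumptions made up for
// the example; the packed buffer stores the block [ib, ie] x [jb, je] row by
// row, (ie - ib + 1) * (je - jb + 1) elements in total.
// ------------------------------------------------------------------------ //
inline void example_sub_array_round_trip()
{
    const int nx = 10, ny = 8;
    const int ib = 2, ie = 7, jb = 1, je = 6;

    double field[nx * ny];
    for (int k = 0; k < nx * ny; k++) field[k] = (double)k;

    // pack the block into a contiguous 6 x 6 buffer //
    double block[6 * 6];
    nse::get_sub_array(field, nx, ny, ib, ie, jb, je, block);

    // ... e.g. hand the buffer to MPI or a device copy, then scatter it back //
    nse::put_sub_array(field, nx, ny, ib, ie, jb, je, block);
}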
// * implementation: get 2d sub array - colored * //
// ---------------------------------------------- //
template< nse::memType memIN, nse::memType memOUT, typename T >
void nse::get_sub_array(const T* _RESTRICT const in, const int color,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    T* _RESTRICT out)
{
#ifndef EXCLUDE_GPU_BRANCH
    if (((memIN == memCPU) && (memOUT == memGPU)) ||
        ((memIN == memGPU) && (memOUT == memCPU)))
    {
        const int c_size = get_num_colored(color, ib, ie, jb, je);

        T* buf;
        int buf_id;
        if (memIN == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
        if (memIN == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);

        get_sub_array<memIN, memIN>(in, color, nx, ny, ib, ie, jb, je, buf);
        mcopy<memOUT, memIN>(out, buf, c_size);

        if (memIN == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
        if (memIN == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
    }
    else
    if ((memIN == memGPU) && (memOUT == memGPU)) {
        nse_gpu::get_sub_array(in, color, nx, ny, ib, ie, jb, je, out);
    }
    else
#endif
    {   // memCPU -> memCPU //
#ifdef USE_DEPRECATED_COLOR_CP
        int i, j;
        int idx, odx = 0;
        int sh = (ib + jb + color) & 1;

        for (i = ib; i <= ie; i++) {
            idx = i * ny + jb + sh;
            for (j = jb + sh; j <= je; j += 2, odx++, idx += 2) {
                out[odx] = in[idx];
            }
            sh = !sh;    // change shift for next column //
        }
#else
        if (omp_in_parallel()) {
            get_sub_array_omp(in, color, nx, ny, ib, ie, jb, je, out);
        }
        else
        {
#pragma omp parallel shared( out )
            {
                get_sub_array_omp(in, color, nx, ny, ib, ie, jb, je, out);
            }
        }
#endif
    }
}

template< typename T >
void nse::get_sub_array_omp(const T* _RESTRICT const in, const int color,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    T* _RESTRICT out)
{
    int i, j, idx, odx, sh;

    // colored column size //
    const int cy = (je - jb + 1);
    const int cyh = get_num_colored(color,    // half-colored-column size
        ib, ib, jb, je);

#pragma omp for nowait
    for (i = ib; i <= ie; i++) {
        sh = (i + jb + color) & 1;
        idx = i * ny;
        odx = ((i - ib) >> 1) * cy + ((i - ib) & 1) * cyh;

        for (j = jb + sh; j <= je; j += 2, odx++) {
            out[odx] = in[idx + j];
        }
    }
}

// * implementation: put 2d sub array - colored * //
// ---------------------------------------------- //
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::put_sub_array(T* _RESTRICT out, const int color,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    const T* _RESTRICT const in)
{
#ifndef EXCLUDE_GPU_BRANCH
    if (((memIN == memCPU) && (memOUT == memGPU)) ||
        ((memIN == memGPU) && (memOUT == memCPU)))
    {
        const int c_size = get_num_colored(color, ib, ie, jb, je);

        T* buf;
        int buf_id;
        if (memIN == memCPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
        if (memIN == memGPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);

        mcopy<memOUT, memIN>(buf, in, c_size);
        put_sub_array<memOUT, memOUT>(out, color, nx, ny, ib, ie, jb, je, buf);

        if (memIN == memCPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
        if (memIN == memGPU) nse_gpu::cudaStx::free_host_buf(buf_id);
    }
    else
    if ((memIN == memGPU) && (memOUT == memGPU)) {
        nse_gpu::put_sub_array(out, color, nx, ny, ib, ie, jb, je, in);
    }
    else
#endif
    {   // memCPU <- memCPU //
#ifdef USE_DEPRECATED_COLOR_CP
        int i, j;
        int idx, odx = 0;
        int sh = (ib + jb + color) & 1;

        for (i = ib; i <= ie; i++) {
            idx = i * ny + jb + sh;
            for (j = jb + sh; j <= je; j += 2, odx++, idx += 2) {
                out[idx] = in[odx];
            }
            sh = !sh;    // change shift for next column //
        }
#else
        if (omp_in_parallel()) {
            put_sub_array_omp(out, color, nx, ny, ib, ie, jb, je, in);
        }
        else
        {
#pragma omp parallel shared( out )
            {
                put_sub_array_omp(out, color, nx, ny, ib, ie, jb, je, in);
            }
        }
#endif
    }
}

template< typename T >
void nse::put_sub_array_omp(T* _RESTRICT out, const int color,
    const int nx, const int ny,
    const int ib, const int ie, const int jb, const int je,
    const T* _RESTRICT const in)
{
    int i, j, idx, odx, sh;

    // colored column size //
    const int cy = (je - jb + 1);
    const int cyh = get_num_colored(color,    // half-colored-column size
        ib, ib, jb, je);

#pragma omp for nowait
    for (i = ib; i <= ie; i++) {
        sh = (i + jb + color) & 1;
        idx = ((i - ib) >> 1) * cy + ((i - ib) & 1) * cyh;
        odx = i * ny;

        for (j = jb + sh; j <= je; j += 2, idx++) {
            out[odx + j] = in[idx];
        }
    }
}
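// ------------------------------------------------------------------------ //
// Illustrative sketch only: a colored get/put round trip on the host. Sizes
// and the index range are assumptions made up for the example; the packed
// buffer holds only the cells of the requested color, get_num_colored()
// gives its length, and column pairs are packed cy = (je - jb + 1) elements
// at a time, matching the odx arithmetic in the colored kernels above.
// ------------------------------------------------------------------------ //
inline void example_colored_sub_array_round_trip()
{
    const int nx = 10, ny = 8;
    const int ib = 2, ie = 7, jb = 1, je = 6;
    const int color = 0;

    double field[nx * ny];
    for (int k = 0; k < nx * ny; k++) field[k] = (double)k;

    const int c_size = nse::get_num_colored(color, ib, ie, jb, je);    // = 18 here

    double packed[6 * 6];    // large enough: c_size <= 36
    nse::get_sub_array(field, color, nx, ny, ib, ie, jb, je, packed);
    nse::put_sub_array(field, color, nx, ny, ib, ie, jb, je, packed);

    (void)c_size;
}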
// * implementation: copy 2d sub array * //
// ------------------------------------ //
template< nse::memType memOUT, nse::memType memIN, typename T >
inline void nse::copy_sub_array(T* _RESTRICT out,
    const int nx, const int ny, const int posx, const int posy,
    const T* _RESTRICT const in,
    const int subnx, const int subny,
    const int ib, const int ie, const int jb, const int je)
{
#ifndef EXCLUDE_GPU_BRANCH
    if (((memIN == memCPU) && (memOUT == memGPU)) ||
        ((memIN == memGPU) && (memOUT == memCPU)))
    {
        const int c_size = (ie - ib + 1) * (je - jb + 1);

        T* buf;
        int buf_id;
        if (memOUT == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
        if (memOUT == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);

        get_sub_array<memIN, memOUT>(in, subnx, subny, ib, ie, jb, je, buf);
        put_sub_array<memOUT, memOUT>(out, nx, ny,
            posx, posx + (ie - ib), posy, posy + (je - jb), buf);

        if (memOUT == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
        if (memOUT == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
    }
    else
    if ((memIN == memGPU) && (memOUT == memGPU)) {
        nse_gpu::copy_sub_array(out, nx, ny, posx, posy,
            in, subnx, subny, ib, ie, jb, je);
    }
    else
#endif
    {   // memCPU <- memCPU //
        if (omp_in_parallel()) {
            copy_sub_array_omp(out, nx, ny, posx, posy,
                in, subnx, subny, ib, ie, jb, je);
        }
        else
        {
#pragma omp parallel shared( out )
            {
                copy_sub_array_omp(out, nx, ny, posx, posy,
                    in, subnx, subny, ib, ie, jb, je);
            }
        }
    }
}

template< typename T >
void nse::copy_sub_array_omp(T* _RESTRICT out,
    const int nx, const int ny, const int posx, const int posy,
    const T* _RESTRICT const in,
    const int subnx, const int subny,
    const int ib, const int ie, const int jb, const int je)
{
    const int block_size = (je - jb + 1) * sizeof(T);
    const int shx = posx - ib;

    int i, idx, odx;

#pragma omp for nowait
    for (i = ib; i <= ie; i++) {
        odx = (i + shx) * ny + posy;
        idx = i * subny + jb;

        memcpy(&out[odx], &in[idx], block_size);
    }
}

// * implementation: get number of colored elements //
// ------------------------------------------------ //
inline int nse::get_num_colored(const int color,
    const int ib, const int ie, const int jb, const int je)
{
    const int length = ie - ib + 1;
    const int width = je - jb + 1;

    if ((length & 1) && (width & 1)) {
        const int sh = !((ib + jb + color) & 1);
        return ((length - 1) >> 1) * width + ((width + sh) >> 1);
    }
    else
        return ((length * width) >> 1);
}
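// ------------------------------------------------------------------------ //
// Illustrative worked example: for a 3 x 3 block with ib = jb = 0 and
// color = 0, both extents are odd, sh = !((0 + 0 + 0) & 1) = 1 and the count
// is ((3 - 1) >> 1) * 3 + ((3 + 1) >> 1) = 3 + 2 = 5, i.e. the five cells
// with (i + j) even; for any block with an even extent exactly half of the
// cells carry each color. The index ranges below are assumptions made up
// for the example.
// ------------------------------------------------------------------------ //
inline bool example_get_num_colored()
{
    return (nse::get_num_colored(0, 0, 2, 0, 2) == 5) &&
           (nse::get_num_colored(1, 0, 2, 0, 2) == 4) &&
           (nse::get_num_colored(0, 0, 3, 0, 2) == 6);    // expected: true
}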