#pragma once #include <string.h> #include "nse-sys.h" #include "nse-alloc.h" #ifndef EXCLUDE_GPU_BRANCH #include "grid-common3d.cuh" #include "cuda-stx.cuh" #endif namespace nse { // * null halo cells * // // ------------------- // template< memType mem = memCPU, typename T > void null_ghost_halo(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz); template< memType mem = memCPU, typename T > void null_halo(T* _RESTRICT x, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke); // * apply -x, -y, -z periodicity * // // -------------------------------- // template< memType mem = memCPU, typename T > void apply_periodic_x(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz); template< memType mem = memCPU, typename T > void apply_periodic_y(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz); template< memType mem = memCPU, typename T > void apply_periodic_z(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz); // * apply -x, -y, -z periodicity - colored * // // ------------------------------------------ // template< memType mem = memCPU, typename T > void apply_periodic_x(T* _RESTRICT x, const int color, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz); template< memType mem = memCPU, typename T > void apply_periodic_y(T* _RESTRICT x, const int color, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz); template< memType mem = memCPU, typename T > void apply_periodic_z(T* _RESTRICT x, const int color, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz); // * get 3d sub array // // ------------------ // template< memType memIN = memCPU, memType memOUT = memCPU, typename T > void get_sub_array(const T* _RESTRICT const in, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, T* _RESTRICT out); // * put 3d sub array // // ------------------ // template< memType memOUT = memCPU, memType memIN = memCPU, typename T > void put_sub_array(T* _RESTRICT out, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, const T* _RESTRICT const in); // * get 3d sub array - colored // // ---------------------------- // template< memType memIN = memCPU, memType memOUT = memCPU, typename T > void get_sub_array(const T* _RESTRICT const in, const int color, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, T* _RESTRICT out); // * put 3d sub array - colored // // ---------------------------- // template< memType memOUT = memCPU, memType memIN = memCPU, typename T > void put_sub_array(T* _RESTRICT out, const int color, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, const T* _RESTRICT const in); // * copy 3d sub array // // ------------------- // template< memType memOUT = memCPU, memType memIN = memCPU, typename T > void copy_sub_array(T* _RESTRICT out, const int nx, const int ny, const int nz, const int posx, const int posy, const int posz, const T* _RESTRICT const in, const int subnx, const int subny, const int subnz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke); // * get number of colored elements // // -------------------------------- // int get_num_colored(const int color, const int ib, const int ie, const int jb, const int je, const int kb, const int ke); } namespace nse { // * null halo cells * // // ------------------- // template< typename T > void null_ghost_halo_omp(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz); template< typename T > void null_halo_omp(T* _RESTRICT x, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke); // * apply -x, -y, -z periodicity * // // -------------------------------- // template< typename T > void apply_periodic_x_omp(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz); template< typename T > void apply_periodic_y_omp(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz); template< typename T > void apply_periodic_z_omp(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz); // * apply -x, -y, -z periodicity - colored * // // ------------------------------------------ // template< typename T > void apply_periodic_x_omp(T* _RESTRICT x, const int color, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz); template< typename T > void apply_periodic_y_omp(T* _RESTRICT x, const int color, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz); template< typename T > void apply_periodic_z_omp(T* _RESTRICT x, const int color, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz); // * get 3d sub array // // ------------------ // template< typename T > void get_sub_array_omp(const T* _RESTRICT const in, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, T* _RESTRICT out); // * put 3d sub array // // ------------------ // template< typename T > void put_sub_array_omp(T* _RESTRICT out, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, const T* _RESTRICT const in); // * get 3d sub array - colored // // ---------------------------- // template< typename T > void get_sub_array_omp(const T* _RESTRICT const in, const int color, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, T* _RESTRICT out); // * put 3d sub array - colored // // ---------------------------- // template< typename T > void put_sub_array_omp(T* _RESTRICT out, const int color, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, const T* _RESTRICT const in); // * copy 3d sub array // // ------------------- // template< typename T > void copy_sub_array_omp(T* _RESTRICT out, const int nx, const int ny, const int nz, const int posx, const int posy, const int posz, const T* _RESTRICT const in, const int subnx, const int subny, const int subnz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke); } // * implementation: null halo cells * // // ----------------------------------- // template< nse::memType mem, typename T > inline void nse::null_ghost_halo( T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz) { #ifndef EXCLUDE_GPU_BRANCH if (mem == memGPU) nse_gpu::null_ghost_halo(x, nx, ny, nz, gcx, gcy, gcz); else #endif { // memCPU // if (omp_in_parallel()) { null_ghost_halo_omp(x, nx, ny, nz, gcx, gcy, gcz); } else { #pragma omp parallel shared( x ) { null_ghost_halo_omp(x, nx, ny, nz, gcx, gcy, gcz); } } } } template< typename T > void nse::null_ghost_halo_omp( T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz) { const int nyz = ny * nz; int i, j, k, idx; // -x null halo: west #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = 0; i < gcx; i++) { for (j = 0; j < ny; j++) { idx = i * nyz + j * nz; for (k = 0; k < nz; k++) x[idx + k] = (T)0; } } // -x null halo: east #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = 0; i < gcx; i++) { for (j = 0; j < ny; j++) { idx = (nx - gcx + i) * nyz + j * nz; for (k = 0; k < nz; k++) x[idx + k] = (T)0; } } // -y null halo: south // #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = gcx; i < nx - gcx; i++) { for (j = 0; j < gcy; j++) { idx = i * nyz + j * nz; for (k = 0; k < nz; k++) x[idx + k] = (T)0; } } // -y null halo: north // #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = gcx; i < nx - gcx; i++) { for (j = ny - gcy; j < ny; j++) { idx = i * nyz + j * nz; for (k = 0; k < nz; k++) x[idx + k] = (T)0; } } // -z null halo: bottom // #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = gcx; i < nx - gcx; i++) { for (j = gcy; j < ny - gcy; j++) { idx = i * nyz + j * nz; for (k = 0; k < gcz; k++) x[idx + k] = (T)0; } } // -z null halo: top // #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = gcx; i < nx - gcx; i++) { for (j = gcy; j < ny - gcy; j++) { idx = i * nyz + j * nz; for (k = nz - gcz; k < nz; k++) x[idx + k] = (T)0; } } } // * implementation: null halo cells * // // ----------------------------------- // template< nse::memType mem, typename T > inline void nse::null_halo( T* _RESTRICT x, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke) { #ifndef EXCLUDE_GPU_BRANCH if (mem == memGPU) nse_gpu::null_halo(x, nx, ny, nz, ib, ie, jb, je, kb, ke); else #endif { // memCPU // if (omp_in_parallel()) { null_halo_omp(x, nx, ny, nz, ib, ie, jb, je, kb, ke); } else { #pragma omp parallel shared( x ) { null_halo_omp(x, nx, ny, nz, ib, ie, jb, je, kb, ke); } } } } template< typename T > void nse::null_halo_omp( T* _RESTRICT x, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke) { const int nyz = ny * nz; int i, j, k, idx; // -x null halo: west #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = 0; i < ib; i++) { for (j = 0; j < ny; j++) { idx = i * nyz + j * nz; for (k = 0; k < nz; k++) x[idx + k] = (T)0; } } // -x null halo: east #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ie + 1; i < nx; i++) { for (j = 0; j < ny; j++) { idx = i * nyz + j * nz; for (k = 0; k < nz; k++) x[idx + k] = (T)0; } } // -y null halo: south // #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { for (j = 0; j < jb; j++) { idx = i * nyz + j * nz; for (k = 0; k < nz; k++) x[idx + k] = (T)0; } } // -y null halo: north // #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { for (j = je + 1; j < ny; j++) { idx = i * nyz + j * nz; for (k = 0; k < nz; k++) x[idx + k] = (T)0; } } // -z null halo: bottom // #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { for (j = jb; j <= je; j++) { idx = i * nyz + j * nz; for (k = 0; k < kb; k++) x[idx + k] = (T)0; } } // -z null halo: top // #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { for (j = jb; j <= je; j++) { idx = i * nyz + j * nz; for (k = ke + 1; k < nz; k++) x[idx + k] = (T)0; } } } // * apply -x, -y, -z periodicity * // // -------------------------------- // template< nse::memType mem, typename T > inline void nse::apply_periodic_x(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz) { #ifndef EXCLUDE_GPU_BRANCH if (mem == memGPU) nse_gpu::apply_periodic_x(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); else #endif { // memCPU // if (omp_in_parallel()) { apply_periodic_x_omp(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } else { #pragma omp parallel shared( x ) { apply_periodic_x_omp(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } } } } template< nse::memType mem, typename T > inline void nse::apply_periodic_y(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz) { #ifndef EXCLUDE_GPU_BRANCH if (mem == memGPU) nse_gpu::apply_periodic_y(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); else #endif { // memCPU // if (omp_in_parallel()) { apply_periodic_y_omp(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } else { #pragma omp parallel shared( x ) { apply_periodic_y_omp(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } } } } template< nse::memType mem, typename T > inline void nse::apply_periodic_z(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz) { #ifndef EXCLUDE_GPU_BRANCH if (mem == memGPU) nse_gpu::apply_periodic_z(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); else #endif { // memCPU // if (omp_in_parallel()) { apply_periodic_z_omp(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } else { #pragma omp parallel shared( x ) { apply_periodic_z_omp(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } } } } template< typename T > void nse::apply_periodic_x_omp(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz) { const int nyz = ny * nz; const int stride = (nx - 2 * gcx) * nyz; const int shx = hx * nyz; const int jb = gcy - hy, je = ny - gcy + hy - 1; const int kb = gcz - hz, ke = nz - gcz + hz - 1; const int block_size = (ke - kb + 1) * sizeof(T); int i, j, k, idx; if (block_size < MIN_MEMCPY_BLOCK) { // magic number of inefficient memcpy() #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = gcx - hx; i < gcx; i++) { // west peridocity // for (j = jb; j <= je; j++) { idx = i * nyz + j * nz; for (k = kb; k <= ke; k++) x[idx + k] = x[idx + stride + k]; } } #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = gcx - hx; i < gcx; i++) { // east periodicity // for (j = jb; j <= je; j++) { idx = i * nyz + j * nz + stride + shx; for (k = kb; k <= ke; k++) x[idx + k] = x[idx - stride + k]; } } } else { #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = gcx - hx; i < gcx; i++) { // west peridocity // for (j = jb; j <= je; j++) { idx = i * nyz + j * nz + kb; memcpy(&x[idx], &x[idx + stride], block_size); } } #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = gcx - hx; i < gcx; i++) { // east peridocity // for (j = jb; j <= je; j++) { idx = i * nyz + j * nz + kb + shx; memcpy(&x[idx + stride], &x[idx], block_size); } } } } template< typename T > void nse::apply_periodic_y_omp(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz) { const int nyz = ny * nz; const int stride = (ny - 2 * gcy) * nz; const int shy = hy * nz; const int ib = gcx - hx, ie = nx - gcx + hx - 1; const int kb = gcz - hz, ke = nz - gcz + hz - 1; const int block_size = (ke - kb + 1) * sizeof(T); int i, j, k, idx; if (block_size < MIN_MEMCPY_BLOCK) { // magic number of inefficient memcpy() #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { // south periodicity // for (j = gcy - hy; j < gcy; j++) { idx = i * nyz + j * nz; for (k = kb; k <= ke; k++) x[idx + k] = x[idx + stride + k]; } } #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { // north periodicity // for (j = gcy - hy; j < gcy; j++) { idx = i * nyz + j * nz + stride + shy; for (k = kb; k <= ke; k++) x[idx + k] = x[idx - stride + k]; } } } else { #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { // south periodicity // for (j = gcy - hy; j < gcy; j++) { idx = i * nyz + j * nz + kb; memcpy(&x[idx], &x[idx + stride], block_size); } } #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { // north periodicity // for (j = gcy - hy; j < gcy; j++) { idx = i * nyz + j * nz + kb + shy; memcpy(&x[idx + stride], &x[idx], block_size); } } } } template< typename T > void nse::apply_periodic_z_omp(T* _RESTRICT x, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz) { const int nyz = ny * nz; const int stride = (nz - 2 * gcz); const int ib = gcx - hx, ie = nx - gcx + hx - 1; const int jb = gcy - hy, je = ny - gcy + hy - 1; int i, j, k, idx; #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { // bottom periodicity // for (j = jb; j <= je; j++) { idx = i * nyz + j * nz; for (k = gcz - hz; k < gcz; k++) x[idx + k] = x[idx + stride + k]; } } #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { // top periodicity // for (j = jb; j <= je; j++) { idx = i * nyz + j * nz; for (k = nz - gcz; k < nz - gcz + hz; k++) x[idx + k] = x[idx - stride + k]; } } } // * apply -x, -y, -z periodicity - colored * // // ------------------------------------------ // template< nse::memType mem, typename T > inline void nse::apply_periodic_x(T* _RESTRICT x, const int color, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz) { #ifndef EXCLUDE_GPU_BRANCH if (mem == memGPU) { nse_gpu::apply_periodic_x(x, color, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } else #endif { // memCPU // if (omp_in_parallel()) { apply_periodic_x_omp(x, color, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } else { #pragma omp parallel shared( x ) { apply_periodic_x_omp(x, color, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } } } } template< nse::memType mem, typename T > inline void nse::apply_periodic_y(T* _RESTRICT x, const int color, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz) { #ifndef EXCLUDE_GPU_BRANCH if (mem == memGPU) { nse_gpu::apply_periodic_y(x, color, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } else #endif { // memCPU // if (omp_in_parallel()) { apply_periodic_y_omp(x, color, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } else { #pragma omp parallel shared( x ) { apply_periodic_y_omp(x, color, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } } } } template< nse::memType mem, typename T > inline void nse::apply_periodic_z(T* _RESTRICT x, const int color, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz) { #ifndef EXCLUDE_GPU_BRANCH if (mem == memGPU) { nse_gpu::apply_periodic_z(x, color, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } else #endif { // memCPU // if (omp_in_parallel()) { apply_periodic_z_omp(x, color, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } else { #pragma omp parallel shared( x ) { apply_periodic_z_omp(x, color, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz); } } } } template< typename T > void nse::apply_periodic_x_omp(T* _RESTRICT x, const int color, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz) { const int nyz = ny * nz; const int stride = (nx - 2 * gcx) * nyz; const int ish = hx + nx - 2 * gcx; const int shx = hx * nyz; const int jb = gcy - hy, je = ny - gcy + hy - 1; const int kb = gcz - hz, ke = nz - gcz + hz - 1; int i, j, k, idx, csh; #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = gcx - hx; i < gcx; i++) { // west periodicity // for (j = jb; j <= je; j++) { csh = (i + j + kb + color) & 1; idx = i * nyz + j * nz; for (k = kb + csh; k <= ke; k += 2) x[idx + k] = x[idx + stride + k]; } } #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = gcx - hx; i < gcx; i++) { // east periodicity // for (j = jb; j <= je; j++) { csh = (i + ish + j + kb + color) & 1; idx = i * nyz + j * nz + stride + shx; for (k = kb + csh; k <= ke; k += 2) x[idx + k] = x[idx - stride + k]; } } } template< typename T > void nse::apply_periodic_y_omp(T* _RESTRICT x, const int color, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz) { const int nyz = ny * nz; const int stride = (ny - 2 * gcy) * nz; const int jsh = hy + ny - 2 * gcy; const int shy = hy * nz; const int ib = gcx - hx, ie = nx - gcx + hx - 1; const int kb = gcz - hz, ke = nz - gcz + hz - 1; int i, j, k, idx, csh; #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { // south periodicity // for (j = gcy - hy; j < gcy; j++) { csh = (i + j + kb + color) & 1; idx = i * nyz + j * nz; for (k = kb + csh; k <= ke; k += 2) x[idx + k] = x[idx + stride + k]; } } #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { // north periodicity // for (j = gcy - hy; j < gcy; j++) { csh = (i + j + jsh + kb + color) & 1; idx = i * nyz + j * nz + stride + shy; for (k = kb + csh; k <= ke; k += 2) x[idx + k] = x[idx - stride + k]; } } } template< typename T > void nse::apply_periodic_z_omp(T* _RESTRICT x, const int color, const int nx, const int ny, const int nz, const int gcx, const int gcy, const int gcz, const int hx, const int hy, const int hz) { const int nyz = ny * nz; const int stride = (nz - 2 * gcz); const int ib = gcx - hx, ie = nx - gcx + hx - 1; const int jb = gcy - hy, je = ny - gcy + hy - 1; int i, j, k, idx, csh; #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { // bottom periodicity // for (j = jb; j <= je; j++) { idx = i * nyz + j * nz; csh = (i + j + gcz - hz + color) & 1; for (k = gcz - hz + csh; k < gcz; k += 2) x[idx + k] = x[idx + stride + k]; } } #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { // top periodicity // for (j = jb; j <= je; j++) { idx = i * nyz + j * nz; csh = (i + j + nz - gcz + color) & 1; for (k = nz - gcz + csh; k < nz - gcz + hz; k += 2) x[idx + k] = x[idx - stride + k]; } } } // * implementation: get 3d sub array * // // ------------------------------------ // template< nse::memType memIN, nse::memType memOUT, typename T > inline void nse::get_sub_array(const T* _RESTRICT const in, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, T* _RESTRICT out) { #ifndef EXCLUDE_GPU_BRANCH if (((memIN == memCPU) && (memOUT == memGPU)) || ((memIN == memGPU) && (memOUT == memCPU))) { const int c_size = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1); T* buf; int buf_id; if (memIN == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size); if (memIN == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size); get_sub_array<memIN, memIN>(in, nx, ny, nz, ib, ie, jb, je, kb, ke, buf); mcopy<memOUT, memIN>(out, buf, c_size); if (memIN == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id); if (memIN == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id); } else if ((memIN == memGPU) && (memOUT == memGPU)) { nse_gpu::get_sub_array(in, nx, ny, nz, ib, ie, jb, je, kb, ke, out); } else #endif { // memCPU -> memCPU // if (omp_in_parallel()) { get_sub_array_omp(in, nx, ny, nz, ib, ie, jb, je, kb, ke, out); } else { #pragma omp parallel shared( out ) { get_sub_array_omp(in, nx, ny, nz, ib, ie, jb, je, kb, ke, out); } } } } template< typename T > void nse::get_sub_array_omp(const T* _RESTRICT const in, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, T* _RESTRICT out) { const int nyz = ny * nz; const int cy = je - jb + 1, cz = ke - kb + 1; const int cyz = cy * cz; const int block_size = cz * sizeof(T); int i, j, k, idx, odx; if (block_size < MIN_MEMCPY_BLOCK) { // magic number of inefficient memcpy() #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { for (j = jb; j <= je; j++) { idx = i * nyz + j * nz; odx = (i - ib) * cyz + (j - jb) * cz - kb; for (k = kb; k <= ke; k++) out[odx + k] = in[idx + k]; } } } else { #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { for (j = jb; j <= je; j++) { idx = i * nyz + j * nz + kb; odx = (i - ib) * cyz + (j - jb) * cz; memcpy(&out[odx], &in[idx], block_size); } } } } // * implementation: put 3d sub array * // // ------------------------------------ // template< nse::memType memOUT, nse::memType memIN, typename T > inline void nse::put_sub_array(T* _RESTRICT out, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, const T* _RESTRICT const in) { #ifndef EXCLUDE_GPU_BRANCH if (((memIN == memCPU) && (memOUT == memGPU)) || ((memIN == memGPU) && (memOUT == memCPU))) { const int c_size = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1); T* buf; int buf_id; if (memIN == memCPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size); if (memIN == memGPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size); mcopy<memOUT, memIN>(buf, in, c_size); put_sub_array<memOUT, memOUT>(out, nx, ny, nz, ib, ie, jb, je, kb, ke, buf); if (memIN == memCPU) nse_gpu::cudaStx::free_dev_buf(buf_id); if (memIN == memGPU) nse_gpu::cudaStx::free_host_buf(buf_id); } else if ((memIN == memGPU) && (memOUT == memGPU)) { nse_gpu::put_sub_array(out, nx, ny, nz, ib, ie, jb, je, kb, ke, in); } else #endif { // memCPU <- memCPU if (omp_in_parallel()) { put_sub_array_omp(out, nx, ny, nz, ib, ie, jb, je, kb, ke, in); } else { #pragma omp parallel shared( out ) { put_sub_array_omp(out, nx, ny, nz, ib, ie, jb, je, kb, ke, in); } } } } template< typename T > void nse::put_sub_array_omp(T* _RESTRICT out, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, const T* _RESTRICT const in) { const int nyz = ny * nz; const int cy = je - jb + 1, cz = ke - kb + 1; const int cyz = cy * cz; const int block_size = cz * sizeof(T); int i, j, k, idx, odx; if (block_size < MIN_MEMCPY_BLOCK) { // magic number of inefficient memcpy() #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { for (j = jb; j <= je; j++) { odx = i * nyz + j * nz; idx = (i - ib) * cyz + (j - jb) * cz - kb; for (k = kb; k <= ke; k++) out[odx + k] = in[idx + k]; } } } else { #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse(2) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { for (j = jb; j <= je; j++) { odx = i * nyz + j * nz + kb; idx = (i - ib) * cyz + (j - jb) * cz; memcpy(&out[odx], &in[idx], block_size); } } } } // * implementation: get 3d sub array - colored * // // ---------------------------------------------- // template< nse::memType memIN, nse::memType memOUT, typename T > void nse::get_sub_array(const T* _RESTRICT const in, const int color, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, T* _RESTRICT out) { #ifndef EXCLUDE_GPU_BRANCH if (((memIN == memCPU) && (memOUT == memGPU)) || ((memIN == memGPU) && (memOUT == memCPU))) { const int c_size = get_num_colored(color, ib, ie, jb, je, kb, ke); T* buf; int buf_id; if (memIN == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size); if (memIN == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size); get_sub_array<memIN, memIN>(in, color, nx, ny, nz, ib, ie, jb, je, kb, ke, buf); mcopy<memOUT, memIN>(out, buf, c_size); if (memIN == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id); if (memIN == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id); } else if ((memIN == memGPU) && (memOUT == memGPU)) { nse_gpu::get_sub_array(in, color, nx, ny, nz, ib, ie, jb, je, kb, ke, out); } else #endif { // memCPU -> memCPU // #ifdef USE_DEPRECATED_COLOR_CP const int nyz = ny * nz; int i, j, k; int idx, odx = 0; int sh; for (i = ib; i <= ie; i++) { idx = i * nyz + jb * nz; sh = (i + jb + kb + color) & 1; for (j = jb; j <= je; j++, idx += nz) { for (k = kb + sh; k <= ke; k += 2, odx++) { out[odx] = in[idx + k]; } sh = !sh; // change shift for next column // } } #else if (omp_in_parallel()) { get_sub_array_omp(in, color, nx, ny, nz, ib, ie, jb, je, kb, ke, out); } else { #pragma omp parallel shared( out ) { get_sub_array_omp(in, color, nx, ny, nz, ib, ie, jb, je, kb, ke, out); } } #endif } } template< typename T > void nse::get_sub_array_omp(const T* _RESTRICT const in, const int color, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, T* _RESTRICT out) { const int nyz = ny * nz; int i, j, k, idx, odx, sh; // colored plane and column sizes // const int cz = (ke - kb + 1); const int cyz = (je - jb + 1) * cz; const int cyzh = get_num_colored(color, // half-colored-plane size ib, ib, jb, je, kb, ke); #ifdef USE_OPENMP_2D_CYCLE const int czh = get_num_colored(color, // half-colored-column size ib, ib, jb, jb, kb, ke); #endif #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse( 2 ) nowait for (i = ib; i <= ie; i++) { for (j = jb; j <= je; j++) { sh = (i + j + kb + color) & 1; idx = i * nyz + j * nz; odx = ((i - ib) >> 1) * cyz + ((i - ib) & 1) * cyzh + ((j - jb) >> 1) * cz + ((j - jb) & 1) * czh; #else #pragma omp for nowait for (i = ib; i <= ie; i++) { sh = (i + jb + kb + color) & 1; idx = i * nyz + jb * nz; odx = ((i - ib) >> 1) * cyz + ((i - ib) & 1) * cyzh; for (j = jb; j <= je; j++, idx += nz) { #endif for (k = kb + sh; k <= ke; k += 2, odx++) { out[odx] = in[idx + k]; } #ifndef USE_OPENMP_2D_CYCLE sh = !sh; // change shift for next column // #endif } } } // * implementation: put 3d sub array - colored * // // ---------------------------------------------- // template< nse::memType memOUT, nse::memType memIN, typename T > void nse::put_sub_array(T* _RESTRICT out, const int color, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, const T* _RESTRICT const in) { #ifndef EXCLUDE_GPU_BRANCH if (((memIN == memCPU) && (memOUT == memGPU)) || ((memIN == memGPU) && (memOUT == memCPU))) { const int c_size = get_num_colored(color, ib, ie, jb, je, kb, ke); T* buf; int buf_id; if (memIN == memCPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size); if (memIN == memGPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size); mcopy<memOUT, memIN>(buf, in, c_size); put_sub_array<memOUT, memOUT>(out, color, nx, ny, nz, ib, ie, jb, je, kb, ke, buf); if (memIN == memCPU) nse_gpu::cudaStx::free_dev_buf(buf_id); if (memIN == memGPU) nse_gpu::cudaStx::free_host_buf(buf_id); } else if ((memIN == memGPU) && (memOUT == memGPU)) { nse_gpu::put_sub_array(out, color, nx, ny, nz, ib, ie, jb, je, kb, ke, in); } else #endif { // memCPU <- memCPU // #ifdef USE_DEPRECATED_COLOR_CP const int nyz = ny * nz; int i, j, k; int odx, idx = 0; int sh; for (i = ib; i <= ie; i++) { odx = i * nyz + jb * nz; sh = (i + jb + kb + color) & 1; for (j = jb; j <= je; j++, odx += nz) { for (k = kb + sh; k <= ke; k += 2, idx++) { out[odx + k] = in[idx]; } sh = !sh; // change shift for next column // } } #else if (omp_in_parallel()) { put_sub_array_omp(out, color, nx, ny, nz, ib, ie, jb, je, kb, ke, in); } else { #pragma omp parallel shared( out ) { put_sub_array_omp(out, color, nx, ny, nz, ib, ie, jb, je, kb, ke, in); } } #endif } } template< typename T > void nse::put_sub_array_omp(T* _RESTRICT out, const int color, const int nx, const int ny, const int nz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke, const T* _RESTRICT const in) { const int nyz = ny * nz; int i, j, k, idx, odx, sh; // colored plane and column sizes // const int cz = (ke - kb + 1); const int cyz = (je - jb + 1) * cz; const int cyzh = get_num_colored(color, // half-colored-plane size ib, ib, jb, je, kb, ke); #ifdef USE_OPENMP_2D_CYCLE const int czh = get_num_colored(color, // half-colored-column size ib, ib, jb, jb, kb, ke); #endif #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse( 2 ) nowait for (i = ib; i <= ie; i++) { for (j = jb; j <= je; j++) { sh = (i + j + kb + color) & 1; odx = i * nyz + j * nz; idx = ((i - ib) >> 1) * cyz + ((i - ib) & 1) * cyzh + ((j - jb) >> 1) * cz + ((j - jb) & 1) * czh; #else #pragma omp for nowait for (i = ib; i <= ie; i++) { sh = (i + jb + kb + color) & 1; odx = i * nyz + jb * nz; idx = ((i - ib) >> 1) * cyz + ((i - ib) & 1) * cyzh; for (j = jb; j <= je; j++, odx += nz) { #endif for (k = kb + sh; k <= ke; k += 2, idx++) { out[odx + k] = in[idx]; } #ifndef USE_OPENMP_2D_CYCLE sh = !sh; // change shift for next column // #endif } } } // * implementation: copy 3d sub array * // // ------------------------------------ // template< nse::memType memOUT, nse::memType memIN, typename T > inline void nse::copy_sub_array(T* _RESTRICT out, const int nx, const int ny, const int nz, const int posx, const int posy, const int posz, const T* _RESTRICT const in, const int subnx, const int subny, const int subnz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke) { #ifndef EXCLUDE_GPU_BRANCH if (((memIN == memCPU) && (memOUT == memGPU)) || ((memIN == memGPU) && (memOUT == memCPU))) { const int c_size = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1); T* buf; int buf_id; if (memOUT == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size); if (memOUT == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size); get_sub_array<memIN, memOUT>(in, subnx, subny, subnz, ib, ie, jb, je, kb, ke, buf); put_sub_array<memOUT, memOUT>(out, nx, ny, nz, posx, posx + (ie - ib), posy, posy + (je - jb), posz, posz + (ke - kb), buf); if (memOUT == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id); if (memOUT == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id); } else if ((memIN == memGPU) && (memOUT == memGPU)) { nse_gpu::copy_sub_array(out, nx, ny, nz, posx, posy, posz, in, subnx, subny, subnz, ib, ie, jb, je, kb, ke); } else #endif { // memCPU <- memCPU // if (omp_in_parallel()) { copy_sub_array_omp(out, nx, ny, nz, posx, posy, posz, in, subnx, subny, subnz, ib, ie, jb, je, kb, ke); } else { #pragma omp parallel shared( out ) { copy_sub_array_omp(out, nx, ny, nz, posx, posy, posz, in, subnx, subny, subnz, ib, ie, jb, je, kb, ke); } } } } template< typename T > void nse::copy_sub_array_omp(T* _RESTRICT out, const int nx, const int ny, const int nz, const int posx, const int posy, const int posz, const T* _RESTRICT const in, const int subnx, const int subny, const int subnz, const int ib, const int ie, const int jb, const int je, const int kb, const int ke) { const int nyz = ny * nz; const int subnyz = subny * subnz; const int block_size = (ke - kb + 1) * sizeof(T); const int shx = posx - ib, shy = posy - jb; int i, j, idx, odx; #ifdef USE_OPENMP_2D_CYCLE #pragma omp for collapse( 2 ) nowait #else #pragma omp for nowait #endif for (i = ib; i <= ie; i++) { for (j = jb; j <= je; j++) { odx = (i + shx) * nyz + (j + shy) * nz + posz; idx = i * subnyz + j * subnz + kb; memcpy(&out[odx], &in[idx], block_size); } } } // * implementation: get number of colored elements // // ------------------------------------------------ // inline int nse::get_num_colored(const int color, const int ib, const int ie, const int jb, const int je, const int kb, const int ke) { const int length = ie - ib + 1; const int width = je - jb + 1; const int height = ke - kb + 1; if ((length & 1) && (width & 1) && (height & 1)) { const int sh = !((ib + jb + kb + color) & 1); return ((length - 1) >> 1) * width * height + ((width - 1) >> 1) * height + ((height + sh) >> 1); } else return ((length * width * height) >> 1); }