#pragma once

#include <string.h>
#include "nse-sys.h"
#include "nse-alloc.h"

#ifndef EXCLUDE_GPU_BRANCH
#include "grid-common2d.cuh"
#include "cuda-stx.cuh"
#endif

namespace nse
{
	// * null halo cells * //
	// ------------------- //
	template< memType mem = memCPU, typename T >
	void null_ghost_halo(T* _RESTRICT x,
		const int nx, const int ny,
		const int gcx, const int gcy);

	template< memType mem = memCPU, typename T >
	void null_halo(T* _RESTRICT x,
		const int nx, const int ny,
		const int ib, const int ie, const int jb, const int je);

	// * apply -x, -y periodicity * //
	// ---------------------------- //
	template< memType mem = memCPU, typename T >
	void apply_periodic_x(T* _RESTRICT x, const int nx, const int ny,
		const int gcx, const int gcy, const int hx, const int hy);
	template< memType mem = memCPU, typename T >
	void apply_periodic_y(T* _RESTRICT x, const int nx, const int ny,
		const int gcx, const int gcy, const int hx, const int hy);

	// * apply -x, -y periodicity - colored * //
	// -------------------------------------- //
	template< memType mem = memCPU, typename T >
	void apply_periodic_x(T* _RESTRICT x, const int color,
		const int nx, const int ny,
		const int gcx, const int gcy, const int hx, const int hy);
	template< memType mem = memCPU, typename T >
	void apply_periodic_y(T* _RESTRICT x, const int color,
		const int nx, const int ny,
		const int gcx, const int gcy, const int hx, const int hy);

	// * get 2d sub array //
	// ------------------ //
	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
	void get_sub_array(const T* _RESTRICT const in,
		const int nx, const int ny,
		const int ib, const int ie, const int jb, const int je,
		T* _RESTRICT out);

	// * put 2d sub array //
	// ------------------ //
	template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
	void put_sub_array(T* _RESTRICT out,
		const int nx, const int ny,
		const int ib, const int ie, const int jb, const int je,
		const T* _RESTRICT const in);

	// * get 2d sub array - colored //
	// ---------------------------- //
	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
	void get_sub_array(const T* _RESTRICT const in, const int color,
		const int nx, const int ny,
		const int ib, const int ie, const int jb, const int je,
		T* _RESTRICT out);

	// * put 2d sub array - colored //
	// ---------------------------- //
	template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
	void put_sub_array(T* _RESTRICT out, const int color,
		const int nx, const int ny,
		const int ib, const int ie, const int jb, const int je,
		const T* _RESTRICT const in);

	// * copy 2d sub array //
	// ------------------- //
	template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
	void copy_sub_array(T* _RESTRICT out,
		const int nx, const int ny,
		const int posx, const int posy,
		const T* _RESTRICT const in,
		const int subnx, const int subny,
		const int ib, const int ie,
		const int jb, const int je);


	// * get number of colored elements //
	// -------------------------------- //
	int get_num_colored(const int color,
		const int ib, const int ie, const int jb, const int je);
}

namespace nse
{
	// * null halo cells * //
	// ------------------- //
	template< typename T >
	void null_ghost_halo_omp(T* _RESTRICT x,
		const int nx, const int ny,
		const int gcx, const int gcy);

	template< typename T >
	void null_halo_omp(T* _RESTRICT x,
		const int nx, const int ny,
		const int ib, const int ie, const int jb, const int je);

	// * apply -x, -y periodicity * //
	// ---------------------------- //
	template< typename T >
	void apply_periodic_x_omp(T* _RESTRICT x, const int nx, const int ny,
		const int gcx, const int gcy, const int hx, const int hy);
	template< typename T >
	void apply_periodic_y_omp(T* _RESTRICT x, const int nx, const int ny,
		const int gcx, const int gcy, const int hx, const int hy);

	// * apply -x, -y periodicity - colored * //
	// -------------------------------------- //
	template< typename T >
	void apply_periodic_x_omp(T* _RESTRICT x, const int color,
		const int nx, const int ny,
		const int gcx, const int gcy, const int hx, const int hy);
	template< typename T >
	void apply_periodic_y_omp(T* _RESTRICT x, const int color,
		const int nx, const int ny,
		const int gcx, const int gcy, const int hx, const int hy);


	// * get 2d sub array //
	// ------------------ //
	template< typename T >
	void get_sub_array_omp(const T* _RESTRICT const in,
		const int nx, const int ny,
		const int ib, const int ie, const int jb, const int je,
		T* _RESTRICT out);

	// * put 2d sub array //
	// ------------------ //
	template< typename T >
	void put_sub_array_omp(T* _RESTRICT out,
		const int nx, const int ny,
		const int ib, const int ie, const int jb, const int je,
		const T* _RESTRICT const in);

	// * get 2d sub array - colored //
	// ---------------------------- //
	template< typename T >
	void get_sub_array_omp(const T* _RESTRICT const in, const int color,
		const int nx, const int ny,
		const int ib, const int ie, const int jb, const int je,
		T* _RESTRICT out);

	// * put 2d sub array - colored //
	// ---------------------------- //
	template< typename T >
	void put_sub_array_omp(T* _RESTRICT out, const int color,
		const int nx, const int ny,
		const int ib, const int ie, const int jb, const int je,
		const T* _RESTRICT const in);

	// * copy 2d sub array //
	// ------------------- //
	template< typename T >
	void copy_sub_array_omp(T* _RESTRICT out,
		const int nx, const int ny,
		const int posx, const int posy,
		const T* _RESTRICT const in,
		const int subnx, const int subny,
		const int ib, const int ie,
		const int jb, const int je);
}


// * implementation: null halo cells * //
// ----------------------------------- //
template< nse::memType mem, typename T >
inline void nse::null_ghost_halo(
	T* _RESTRICT x,
	const int nx, const int ny,
	const int gcx, const int gcy)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU)
		nse_gpu::null_ghost_halo(x, nx, ny, gcx, gcy);
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			null_ghost_halo_omp(x, nx, ny, gcx, gcy);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				null_ghost_halo_omp(x, nx, ny, gcx, gcy);
			}
		}
	}
}

template< typename T >
void nse::null_ghost_halo_omp(
	T* _RESTRICT x,
	const int nx, const int ny,
	const int gcx, const int gcy)
{
	int i, j, idx;

	// null column: west //
#pragma omp for nowait
	for (i = 0; i < gcx; i++) {
		idx = i * ny;
		for (j = 0; j < ny; j++)
			x[idx + j] = (T)0;
	}

	// null column: east //
#pragma omp for nowait
	for (i = 0; i < gcx; i++) {
		idx = (nx - gcx + i) * ny;
		for (j = 0; j < ny; j++) {
			x[idx + j] = (T)0;
		}
	}

	// null rows: south //
#pragma omp for nowait
	for (i = gcx; i < nx - gcx; i++) {
		idx = i * ny;
		for (j = 0; j < gcy; j++)
			x[idx + j] = (T)0;
	}

	// null rows: north //
#pragma omp for nowait
	for (i = gcx; i < nx - gcx; i++) {
		idx = i * ny;
		for (j = ny - gcy; j < ny; j++)
			x[idx + j] = (T)0;
	}
}

template< nse::memType mem, typename T >
inline void nse::null_halo(
	T* _RESTRICT x,
	const int nx, const int ny,
	const int ib, const int ie, const int jb, const int je)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU) nse_gpu::null_halo(x, nx, ny, ib, ie, jb, je);
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			null_halo_omp(x, nx, ny, ib, ie, jb, je);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				null_halo_omp(x, nx, ny, ib, ie, jb, je);
			}
		}
	}
}

template< typename T >
void nse::null_halo_omp(
	T* _RESTRICT x,
	const int nx, const int ny,
	const int ib, const int ie, const int jb, const int je)
{
	int i, j, idx;

	// null column: west //
#pragma omp for nowait
	for (i = 0; i < ib; i++) {
		idx = i * ny;
		for (j = 0; j < ny; j++)
			x[idx + j] = (T)0;
	}

	// null column: east //
#pragma omp for nowait
	for (i = ie + 1; i < nx; i++) {
		idx = i * ny;
		for (j = 0; j < ny; j++) {
			x[idx + j] = (T)0;
		}
	}

	// null rows: south //
#pragma omp for nowait
	for (i = ib; i <= ie; i++) {
		idx = i * ny;
		for (j = 0; j < jb; j++)
			x[idx + j] = (T)0;
	}

	// null rows: north //
#pragma omp for nowait
	for (i = ib; i <= ie; i++) {
		idx = i * ny;
		for (j = je + 1; j < ny; j++)
			x[idx + j] = (T)0;
	}
}

// * apply -x, -y periodicity * //
// ---------------------------- //
template< nse::memType mem, typename T >
inline void nse::apply_periodic_x(T* _RESTRICT x, const int nx, const int ny,
	const int gcx, const int gcy,
	const int hx, const int hy)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU)
		nse_gpu::apply_periodic_x(x, nx, ny, gcx, gcy, hx, hy);
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			apply_periodic_x_omp(x, nx, ny, gcx, gcy, hx, hy);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				apply_periodic_x_omp(x, nx, ny, gcx, gcy, hx, hy);
			}
		}
	}
}

template< nse::memType mem, typename T >
inline void nse::apply_periodic_y(T* _RESTRICT x, const int nx, const int ny,
	const int gcx, const int gcy,
	const int hx, const int hy)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU)
		nse_gpu::apply_periodic_y(x, nx, ny, gcx, gcy, hx, hy);
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			apply_periodic_y_omp(x, nx, ny, gcx, gcy, hx, hy);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				apply_periodic_y_omp(x, nx, ny, gcx, gcy, hx, hy);
			}
		}
	}
}

template< typename T >
void nse::apply_periodic_x_omp(T* _RESTRICT x, const int nx, const int ny,
	const int gcx, const int gcy,
	const int hx, const int hy)
{
	const int stride = (nx - 2 * gcx) * ny;
	const int shx = hx * ny;
	const int jb = gcy - hy, je = ny - gcy + hy - 1;
	const int block_size = (je - jb + 1) * sizeof(T);

	int i, j, idx;

	if (block_size < MIN_MEMCPY_BLOCK) {     // magic number of inefficient memcpy()
#pragma omp for nowait
		for (i = gcx - hx; i < gcx; i++) {	// west periodicity //
			idx = i * ny;
			for (j = jb; j <= je; j++)
				x[idx + j] = x[idx + stride + j];
		}


#pragma omp for nowait
		for (i = gcx - hx; i < gcx; i++) {	// east periodicity //
			idx = i * ny + stride + shx;
			for (j = jb; j <= je; j++)
				x[idx + j] = x[idx - stride + j];
		}
	}
	else
	{

#pragma omp for nowait
		for (i = gcx - hx; i < gcx; i++) {	// west periodicity //
			idx = i * ny + jb;
			memcpy(&x[idx], &x[idx + stride], block_size);
		}

#pragma omp for nowait
		for (i = gcx - hx; i < gcx; i++) {	// east periodicity //
			idx = i * ny + jb + shx;
			memcpy(&x[idx + stride], &x[idx], block_size);
		}
	}
}
template< typename T >
void nse::apply_periodic_y_omp(T* _RESTRICT x, const int nx, const int ny,
	const int gcx, const int gcy,
	const int hx, const int hy)
{
	const int stride = (ny - 2 * gcy);
	const int ib = gcx - hx, ie = nx - gcx + hx - 1;

	int i, j, idx;

#pragma omp for nowait
	for (i = ib; i <= ie; i++) {	// south periodicity //
		idx = i * ny;
		for (j = gcy - hy; j < gcy; j++)
			x[idx + j] = x[idx + stride + j];
	}

#pragma omp for nowait
	for (i = ib; i <= ie; i++) {	// north periodicity //
		idx = i * ny;
		for (j = ny - gcy; j < ny - gcy + hy; j++)
			x[idx + j] = x[idx - stride + j];
	}
}

// * apply -x, -y periodicity - colored * //
// -------------------------------------- //
template< nse::memType mem, typename T >
inline void nse::apply_periodic_x(T* _RESTRICT x, const int color,
	const int nx, const int ny,
	const int gcx, const int gcy,
	const int hx, const int hy)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU)
		nse_gpu::apply_periodic_x(x, color, nx, ny, gcx, gcy, hx, hy);
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			apply_periodic_x_omp(x, color, nx, ny, gcx, gcy, hx, hy);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				apply_periodic_x_omp(x, color, nx, ny, gcx, gcy, hx, hy);
			}
		}
	}
}
template< nse::memType mem, typename T >
inline void nse::apply_periodic_y(T* _RESTRICT x, const int color,
	const int nx, const int ny,
	const int gcx, const int gcy,
	const int hx, const int hy)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU)
		nse_gpu::apply_periodic_y(x, color, nx, ny, gcx, gcy, hx, hy);
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			apply_periodic_y_omp(x, color, nx, ny, gcx, gcy, hx, hy);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				apply_periodic_y_omp(x, color, nx, ny, gcx, gcy, hx, hy);
			}
		}
	}
}

template< typename T >
void nse::apply_periodic_x_omp(T* _RESTRICT x, const int color,
	const int nx, const int ny,
	const int gcx, const int gcy,
	const int hx, const int hy)
{
	const int stride = (nx - 2 * gcx) * ny;
	const int ish = hx + nx - 2 * gcx;
	const int shx = hx * ny;
	const int jb = gcy - hy, je = ny - gcy + hy - 1;

	int i, j, idx, csh;

#pragma omp for nowait
	for (i = gcx - hx; i < gcx; i++) {	// west periodicity //
		csh = (i + jb + color) & 1;
		idx = i * ny;
		for (j = jb + csh; j <= je; j += 2)
			x[idx + j] = x[idx + stride + j];
	}

#pragma omp for nowait
	for (i = gcx - hx; i < gcx; i++) {	// east periodicity //
		csh = (i + ish + jb + color) & 1;
		idx = i * ny + stride + shx;
		for (j = jb + csh; j <= je; j += 2)
			x[idx + j] = x[idx - stride + j];
	}
}
template< typename T >
void nse::apply_periodic_y_omp(T* _RESTRICT x, const int color,
	const int nx, const int ny,
	const int gcx, const int gcy,
	const int hx, const int hy)
{
	const int stride = (ny - 2 * gcy);
	const int ib = gcx - hx, ie = nx - gcx + hx - 1;

	int i, j, idx, csh;

#pragma omp for nowait
	for (i = ib; i <= ie; i++) {	// south periodicity //
		idx = i * ny;
		csh = (i + gcy - hy + color) & 1;
		for (j = gcy - hy + csh; j < gcy; j += 2)
			x[idx + j] = x[idx + stride + j];

	}

#pragma omp for nowait
	for (i = ib; i <= ie; i++) {	// north periodicity //
		idx = i * ny;
		csh = (i + ny - gcy + color) & 1;
		for (j = ny - gcy + csh; j < ny - gcy + hy; j += 2)
			x[idx + j] = x[idx - stride + j];

	}
}

// * implementation: get 2d sub array * //
// ------------------------------------ //
template< nse::memType memIN, nse::memType memOUT, typename T >
inline void nse::get_sub_array(const T* _RESTRICT const in,
	const int nx, const int ny,
	const int ib, const int ie, const int jb, const int je,
	T* _RESTRICT out)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (((memIN == memCPU) && (memOUT == memGPU)) ||
		((memIN == memGPU) && (memOUT == memCPU)))
	{
		const int c_size = (ie - ib + 1) * (je - jb + 1);

		T* buf;
		int buf_id;
		if (memIN == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
		if (memIN == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);

		get_sub_array<memIN, memIN>(in, nx, ny, ib, ie, jb, je, buf);
		mcopy<memOUT, memIN>(out, buf, c_size);

		if (memIN == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
		if (memIN == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
	}
	else
		if ((memIN == memGPU) && (memOUT == memGPU)) {
			nse_gpu::get_sub_array(in, nx, ny, ib, ie, jb, je, out);
		}
		else
#endif
		{	// memCPU -> memCPU //
			if (omp_in_parallel()) {
				get_sub_array_omp(in, nx, ny, ib, ie, jb, je, out);
			}
			else
			{
#pragma omp parallel shared( out ) 
				{
					get_sub_array_omp(in, nx, ny, ib, ie, jb, je, out);
				}
			}
		}
}

template< typename T >
void nse::get_sub_array_omp(const T* _RESTRICT const in,
	const int nx, const int ny,
	const int ib, const int ie, const int jb, const int je,
	T* _RESTRICT out)
{
	const int cy = je - jb + 1;
	const int block_size = cy * sizeof(T);

	int i, j, idx, odx;

	if (block_size < MIN_MEMCPY_BLOCK) {     // magic number of inefficient memcpy()

#pragma omp for nowait
		for (i = ib; i <= ie; i++)
		{
			idx = i * ny;
			odx = (i - ib) * cy - jb;
			for (j = jb; j <= je; j++)
				out[odx + j] = in[idx + j];
		}
	}
	else
	{
#pragma omp for nowait
		for (i = ib; i <= ie; i++) {
			idx = i * ny + jb;
			odx = (i - ib) * cy;
			memcpy(&out[odx], &in[idx], block_size);
		}
	}
}

// * implementation: put 2d sub array * //
// ------------------------------------ //
template< nse::memType memOUT, nse::memType memIN, typename T >
inline void nse::put_sub_array(T* _RESTRICT out,
	const int nx, const int ny,
	const int ib, const int ie, const int jb, const int je,
	const T* _RESTRICT const in)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (((memIN == memCPU) && (memOUT == memGPU)) ||
		((memIN == memGPU) && (memOUT == memCPU)))
	{
		const int c_size = (ie - ib + 1) * (je - jb + 1);

		T* buf;
		int buf_id;
		if (memIN == memCPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
		if (memIN == memGPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);

		mcopy<memOUT, memIN>(buf, in, c_size);
		put_sub_array<memOUT, memOUT>(out, nx, ny, ib, ie, jb, je, buf);

		if (memIN == memCPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
		if (memIN == memGPU) nse_gpu::cudaStx::free_host_buf(buf_id);
	}
	else
		if ((memIN == memGPU) && (memOUT == memGPU)) {
			nse_gpu::put_sub_array(out, nx, ny, ib, ie, jb, je, in);
		}
		else
#endif
		{	// memCPU <- memCPU //
			if (omp_in_parallel()) {
				put_sub_array_omp(out, nx, ny, ib, ie, jb, je, in);
			}
			else
			{
#pragma omp parallel shared( out ) 
				{
					put_sub_array_omp(out, nx, ny, ib, ie, jb, je, in);
				}
			}
		}
}

template< typename T >
void nse::put_sub_array_omp(T* _RESTRICT out,
	const int nx, const int ny,
	const int ib, const int ie, const int jb, const int je,
	const T* _RESTRICT const in)
{
	const int cy = je - jb + 1;
	const int block_size = cy * sizeof(T);

	int i, j, idx, odx;

	if (block_size < MIN_MEMCPY_BLOCK) {     // magic number of inefficient memcpy()

#pragma omp for nowait
		for (i = ib; i <= ie; i++)
		{
			idx = (i - ib) * cy - jb;
			odx = i * ny;
			for (j = jb; j <= je; j++)
				out[odx + j] = in[idx + j];
		}
	}
	else
	{
#pragma omp for nowait
		for (i = ib; i <= ie; i++) {
			idx = (i - ib) * cy;
			odx = i * ny + jb;
			memcpy(&out[odx], &in[idx], block_size);
		}
	}
}

// * implementation: get 2d sub array - colored * //
// ---------------------------------------------- //
template< nse::memType memIN, nse::memType memOUT, typename T >
void nse::get_sub_array(const T* _RESTRICT const in, const int color,
	const int nx, const int ny,
	const int ib, const int ie, const int jb, const int je,
	T* _RESTRICT out)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (((memIN == memCPU) && (memOUT == memGPU)) ||
		((memIN == memGPU) && (memOUT == memCPU)))
	{
		const int c_size = get_num_colored(color, ib, ie, jb, je);

		T* buf;
		int buf_id;
		if (memIN == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
		if (memIN == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);

		get_sub_array<memIN, memIN>(in, color, nx, ny, ib, ie, jb, je, buf);
		mcopy<memOUT, memIN>(out, buf, c_size);

		if (memIN == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
		if (memIN == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
	}
	else
		if ((memIN == memGPU) && (memOUT == memGPU)) {
			nse_gpu::get_sub_array(in, color, nx, ny, ib, ie, jb, je, out);
		}
		else
#endif
		{	// memCPU -> memCPU //
#ifdef USE_DEPRECATED_COLOR_CP
			int i, j;
			int idx, odx = 0;
			int sh = (ib + jb + color) & 1;

			for (i = ib; i <= ie; i++) {

				idx = i * ny + jb + sh;
				for (j = jb + sh; j <= je; j += 2, odx++, idx += 2) {
					out[odx] = in[idx];
				}
				sh = !sh;       // change shift for next column //
			}
#else

			if (omp_in_parallel()) {
				get_sub_array_omp(in, color, nx, ny, ib, ie, jb, je, out);
			}
			else
			{
#pragma omp parallel shared( out ) 
				{
					get_sub_array_omp(in, color, nx, ny, ib, ie, jb, je, out);
				}
			}
#endif
		}
}

template< typename T >
void nse::get_sub_array_omp(const T* _RESTRICT const in, const int color,
	const int nx, const int ny,
	const int ib, const int ie, const int jb, const int je,
	T* _RESTRICT out)
{
	int i, j, idx, odx, sh;

	// colored column size //
	const int cy = (je - jb + 1);
	const int cyh = get_num_colored(color, // half-colored-column size
		ib, ib, jb, je);

#pragma omp for nowait
	for (i = ib; i <= ie; i++) {
		sh = (i + jb + color) & 1;
		idx = i * ny;
		odx = ((i - ib) >> 1) * cy + ((i - ib) & 1) * cyh;
		for (j = jb + sh; j <= je; j += 2, odx++) {
			out[odx] = in[idx + j];
		}
	}
}

// * implementation: put 2d sub array - colored * //
// ---------------------------------------------- //
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::put_sub_array(T* _RESTRICT out, const int color,
	const int nx, const int ny,
	const int ib, const int ie, const int jb, const int je,
	const T* _RESTRICT const in)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (((memIN == memCPU) && (memOUT == memGPU)) ||
		((memIN == memGPU) && (memOUT == memCPU)))
	{
		const int c_size = get_num_colored(color, ib, ie, jb, je);

		T* buf;
		int buf_id;
		if (memIN == memCPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
		if (memIN == memGPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);

		mcopy<memOUT, memIN>(buf, in, c_size);
		put_sub_array<memOUT, memOUT>(out, color, nx, ny, ib, ie, jb, je, buf);

		if (memIN == memCPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
		if (memIN == memGPU) nse_gpu::cudaStx::free_host_buf(buf_id);
	}
	else
		if ((memIN == memGPU) && (memOUT == memGPU)) {
			nse_gpu::put_sub_array(out, color, nx, ny, ib, ie, jb, je, in);
		}
		else
#endif
		{	// memCPU <- memCPU //
#ifdef USE_DEPRECATED_COLOR_CP
			int i, j;
			int idx, odx = 0;
			int sh = (ib + jb + color) & 1;

			for (i = ib; i <= ie; i++) {

				idx = i * ny + jb + sh;
				for (j = jb + sh; j <= je; j += 2, odx++, idx += 2) {
					out[idx] = in[odx];
				}
				sh = !sh;       // change shift for next column //
			}
#else

			if (omp_in_parallel()) {
				put_sub_array_omp(out, color, nx, ny, ib, ie, jb, je, in);
			}
			else
			{
#pragma omp parallel shared( out ) 
				{
					put_sub_array_omp(out, color, nx, ny, ib, ie, jb, je, in);
				}
			}
#endif
		}
}

template< typename T >
void nse::put_sub_array_omp(T* _RESTRICT out, const int color,
	const int nx, const int ny,
	const int ib, const int ie, const int jb, const int je,
	const T* _RESTRICT const in)
{
	int i, j, idx, odx, sh;

	// colored column size //
	const int cy = (je - jb + 1);
	const int cyh = get_num_colored(color, // half-colored-column size
		ib, ib, jb, je);

#pragma omp for nowait
	for (i = ib; i <= ie; i++) {
		sh = (i + jb + color) & 1;
		idx = ((i - ib) >> 1) * cy + ((i - ib) & 1) * cyh;
		odx = i * ny;
		for (j = jb + sh; j <= je; j += 2, idx++) {
			out[odx + j] = in[idx];
		}
	}
}

// * implementation: copy 2d sub array * //
// ------------------------------------ //
template< nse::memType memOUT, nse::memType memIN, typename T >
inline void nse::copy_sub_array(T* _RESTRICT out,
	const int nx, const int ny,
	const int posx, const int posy,
	const T* _RESTRICT const in,
	const int subnx, const int subny,
	const int ib, const int ie,
	const int jb, const int je)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (((memIN == memCPU) && (memOUT == memGPU)) ||
		((memIN == memGPU) && (memOUT == memCPU)))
	{
		const int c_size = (ie - ib + 1) * (je - jb + 1);

		T* buf;
		int buf_id;
		if (memOUT == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
		if (memOUT == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);

		get_sub_array<memIN, memOUT>(in, subnx, subny, ib, ie, jb, je, buf);
		put_sub_array<memOUT, memOUT>(out, nx, ny,
			posx, posx + (ie - ib), posy, posy + (je - jb), buf);

		if (memOUT == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
		if (memOUT == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
	}
	else
		if ((memIN == memGPU) && (memOUT == memGPU)) {
			nse_gpu::copy_sub_array(out, nx, ny, posx, posy,
				in, subnx, subny, ib, ie, jb, je);
		}
		else
#endif
		{	// memCPU <- memCPU //
			if (omp_in_parallel()) {
				copy_sub_array_omp(out, nx, ny, posx, posy,
					in, subnx, subny, ib, ie, jb, je);
			}
			else
			{
#pragma omp parallel shared( out ) 
				{
					copy_sub_array_omp(out, nx, ny, posx, posy,
						in, subnx, subny, ib, ie, jb, je);
				}
			}
		}
}

template< typename T >
void nse::copy_sub_array_omp(T* _RESTRICT out,
	const int nx, const int ny,
	const int posx, const int posy,
	const T* _RESTRICT const in,
	const int subnx, const int subny,
	const int ib, const int ie,
	const int jb, const int je)
{
	const int block_size = (je - jb + 1) * sizeof(T);
	const int shx = posx - ib;

	int i, idx, odx;

#pragma omp for nowait
	for (i = ib; i <= ie; i++) {
		odx = (i + shx) * ny + posy;
		idx = i * subny + jb;

		memcpy(&out[odx], &in[idx], block_size);
	}
}

// * implementation: get number of colored elements //
// ------------------------------------------------ //
inline int nse::get_num_colored(const int color,
	const int ib, const int ie, const int jb, const int je)
{
	const int length = ie - ib + 1;
	const int width = je - jb + 1;

	if ((length & 1) && (width & 1)) {

		const int sh = !((ib + jb + color) & 1);
		return ((length - 1) >> 1) * width + ((width + sh) >> 1);
	}
	else
		return ((length * width) >> 1);
}