#pragma once

#include <string.h>
#include "nse-sys.h"
#include "nse-alloc.h"

#ifndef EXCLUDE_GPU_BRANCH
#include "grid-common3d.cuh"
#include "cuda-stx.cuh"
#endif

namespace nse
{
	// * null halo cells * //
	// ------------------- //
	template< memType mem = memCPU, typename T >
	void null_ghost_halo(T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz);

	template< memType mem = memCPU, typename T >
	void null_halo(T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke);

	// * apply -x, -y, -z periodicity * //
	// -------------------------------- //
	template< memType mem = memCPU, typename T >
	void apply_periodic_x(T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const int hx, const int hy, const int hz);
	template< memType mem = memCPU, typename T >
	void apply_periodic_y(T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const int hx, const int hy, const int hz);
	template< memType mem = memCPU, typename T >
	void apply_periodic_z(T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const int hx, const int hy, const int hz);

	// * apply -x, -y, -z periodicity - colored * //
	// ------------------------------------------ //
	template< memType mem = memCPU, typename T >
	void apply_periodic_x(T* _RESTRICT x, const int color,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const int hx, const int hy, const int hz);
	template< memType mem = memCPU, typename T >
	void apply_periodic_y(T* _RESTRICT x, const int color,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const int hx, const int hy, const int hz);
	template< memType mem = memCPU, typename T >
	void apply_periodic_z(T* _RESTRICT x, const int color,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const int hx, const int hy, const int hz);

	// * get 3d sub array //
	// ------------------ //
	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
	void get_sub_array(const T* _RESTRICT const in,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,
		T* _RESTRICT out);

	// * put 3d sub array //
	// ------------------ //
	template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
	void put_sub_array(T* _RESTRICT out,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,
		const T* _RESTRICT const in);

	// * get 3d sub array - colored //
	// ---------------------------- //
	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
	void get_sub_array(const T* _RESTRICT const in, const int color,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,
		T* _RESTRICT out);

	// * put 3d sub array - colored //
	// ---------------------------- //
	template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
	void put_sub_array(T* _RESTRICT out, const int color,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,
		const T* _RESTRICT const in);

	// * copy 3d sub array //
	// ------------------- //
	template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
	void copy_sub_array(T* _RESTRICT out,
		const int nx, const int ny, const int nz,
		const int posx, const int posy, const int posz,
		const T* _RESTRICT const in,
		const int subnx, const int subny, const int subnz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke);


	// * get number of colored elements //
	// -------------------------------- //
	int get_num_colored(const int color,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke);
}


namespace nse
{
	// * null halo cells * //
	// ------------------- //
	template< typename T >
	void null_ghost_halo_omp(T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz);

	template< typename T >
	void null_halo_omp(T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke);

	// * apply -x, -y, -z periodicity * //
	// -------------------------------- //
	template< typename T >
	void apply_periodic_x_omp(T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const int hx, const int hy, const int hz);
	template< typename T >
	void apply_periodic_y_omp(T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const int hx, const int hy, const int hz);
	template< typename T >
	void apply_periodic_z_omp(T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const int hx, const int hy, const int hz);

	// * apply -x, -y, -z periodicity - colored * //
	// ------------------------------------------ //
	template< typename T >
	void apply_periodic_x_omp(T* _RESTRICT x, const int color,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const int hx, const int hy, const int hz);
	template< typename T >
	void apply_periodic_y_omp(T* _RESTRICT x, const int color,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const int hx, const int hy, const int hz);
	template< typename T >
	void apply_periodic_z_omp(T* _RESTRICT x, const int color,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const int hx, const int hy, const int hz);

	// * get 3d sub array //
	// ------------------ //
	template< typename T >
	void get_sub_array_omp(const T* _RESTRICT const in,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,
		T* _RESTRICT out);

	// * put 3d sub array //
	// ------------------ //
	template< typename T >
	void put_sub_array_omp(T* _RESTRICT out,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,
		const T* _RESTRICT const in);

	// * get 3d sub array - colored //
	// ---------------------------- //
	template< typename T >
	void get_sub_array_omp(const T* _RESTRICT const in, const int color,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,
		T* _RESTRICT out);

	// * put 3d sub array - colored //
	// ---------------------------- //
	template< typename T >
	void put_sub_array_omp(T* _RESTRICT out, const int color,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,
		const T* _RESTRICT const in);

	// * copy 3d sub array //
	// ------------------- //
	template< typename T >
	void copy_sub_array_omp(T* _RESTRICT out,
		const int nx, const int ny, const int nz,
		const int posx, const int posy, const int posz,
		const T* _RESTRICT const in,
		const int subnx, const int subny, const int subnz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke);
}



// * implementation: null halo cells * //
// ----------------------------------- //
template< nse::memType mem, typename T >
inline void nse::null_ghost_halo(
	T* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU)
		nse_gpu::null_ghost_halo(x, nx, ny, nz, gcx, gcy, gcz);
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			null_ghost_halo_omp(x, nx, ny, nz, gcx, gcy, gcz);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				null_ghost_halo_omp(x, nx, ny, nz, gcx, gcy, gcz);
			}
		}
	}
}

template< typename T >
void nse::null_ghost_halo_omp(
	T* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz)
{
	const int nyz = ny * nz;
	int i, j, k, idx;

	// -x null halo: west
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = 0; i < gcx; i++) {
		for (j = 0; j < ny; j++) {
			idx = i * nyz + j * nz;
			for (k = 0; k < nz; k++)
				x[idx + k] = (T)0;
		}
	}

	// -x null halo: east
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = 0; i < gcx; i++) {
		for (j = 0; j < ny; j++) {
			idx = (nx - gcx + i) * nyz + j * nz;
			for (k = 0; k < nz; k++)
				x[idx + k] = (T)0;
		}
	}

	// -y null halo: south //
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = gcx; i < nx - gcx; i++) {
		for (j = 0; j < gcy; j++) {
			idx = i * nyz + j * nz;
			for (k = 0; k < nz; k++)
				x[idx + k] = (T)0;
		}
	}

	// -y null halo: north //
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = gcx; i < nx - gcx; i++) {
		for (j = ny - gcy; j < ny; j++) {
			idx = i * nyz + j * nz;
			for (k = 0; k < nz; k++)
				x[idx + k] = (T)0;
		}

	}

	// -z null halo: bottom //
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = gcx; i < nx - gcx; i++) {
		for (j = gcy; j < ny - gcy; j++) {
			idx = i * nyz + j * nz;
			for (k = 0; k < gcz; k++)
				x[idx + k] = (T)0;
		}
	}

	// -z null halo: top //
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = gcx; i < nx - gcx; i++) {
		for (j = gcy; j < ny - gcy; j++) {
			idx = i * nyz + j * nz;
			for (k = nz - gcz; k < nz; k++)
				x[idx + k] = (T)0;
		}
	}
}

// * implementation: null halo cells * //
// ----------------------------------- //
template< nse::memType mem, typename T >
inline void nse::null_halo(
	T* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU)
		nse_gpu::null_halo(x, nx, ny, nz, ib, ie, jb, je, kb, ke);
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			null_halo_omp(x, nx, ny, nz,
				ib, ie, jb, je, kb, ke);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				null_halo_omp(x, nx, ny, nz,
					ib, ie, jb, je, kb, ke);
			}
		}
	}
}

template< typename T >
void nse::null_halo_omp(
	T* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke)
{
	const int nyz = ny * nz;
	int i, j, k, idx;

	// -x null halo: west
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = 0; i < ib; i++) {
		for (j = 0; j < ny; j++) {
			idx = i * nyz + j * nz;
			for (k = 0; k < nz; k++)
				x[idx + k] = (T)0;
		}
	}

	// -x null halo: east
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = ie + 1; i < nx; i++) {
		for (j = 0; j < ny; j++) {
			idx = i * nyz + j * nz;
			for (k = 0; k < nz; k++)
				x[idx + k] = (T)0;
		}
	}

	// -y null halo: south //
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = ib; i <= ie; i++) {
		for (j = 0; j < jb; j++) {
			idx = i * nyz + j * nz;
			for (k = 0; k < nz; k++)
				x[idx + k] = (T)0;
		}
	}

	// -y null halo: north //
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = ib; i <= ie; i++) {
		for (j = je + 1; j < ny; j++) {
			idx = i * nyz + j * nz;
			for (k = 0; k < nz; k++)
				x[idx + k] = (T)0;
		}
	}

	// -z null halo: bottom //
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = ib; i <= ie; i++) {
		for (j = jb; j <= je; j++) {
			idx = i * nyz + j * nz;
			for (k = 0; k < kb; k++)
				x[idx + k] = (T)0;
		}
	}

	// -z null halo: top //
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = ib; i <= ie; i++) {
		for (j = jb; j <= je; j++) {
			idx = i * nyz + j * nz;
			for (k = ke + 1; k < nz; k++)
				x[idx + k] = (T)0;
		}
	}
}

// * apply -x, -y, -z periodicity * //
// -------------------------------- //
template< nse::memType mem, typename T >
inline void nse::apply_periodic_x(T* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU)
		nse_gpu::apply_periodic_x(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz);
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			apply_periodic_x_omp(x, nx, ny, nz,
				gcx, gcy, gcz, hx, hy, hz);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				apply_periodic_x_omp(x, nx, ny, nz,
					gcx, gcy, gcz, hx, hy, hz);
			}
		}
	}
}

template< nse::memType mem, typename T >
inline void nse::apply_periodic_y(T* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU)
		nse_gpu::apply_periodic_y(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz);
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			apply_periodic_y_omp(x, nx, ny, nz,
				gcx, gcy, gcz, hx, hy, hz);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				apply_periodic_y_omp(x, nx, ny, nz,
					gcx, gcy, gcz, hx, hy, hz);
			}
		}
	}
}

template< nse::memType mem, typename T >
inline void nse::apply_periodic_z(T* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU)
		nse_gpu::apply_periodic_z(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz);
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			apply_periodic_z_omp(x, nx, ny, nz,
				gcx, gcy, gcz, hx, hy, hz);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				apply_periodic_z_omp(x, nx, ny, nz,
					gcx, gcy, gcz, hx, hy, hz);
			}
		}
	}
}


template< typename T >
void nse::apply_periodic_x_omp(T* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz)
{
	const int nyz = ny * nz;
	const int stride = (nx - 2 * gcx) * nyz;
	const int shx = hx * nyz;
	const int jb = gcy - hy, je = ny - gcy + hy - 1;
	const int kb = gcz - hz, ke = nz - gcz + hz - 1;
	const int block_size = (ke - kb + 1) * sizeof(T);

	int i, j, k, idx;

	if (block_size < MIN_MEMCPY_BLOCK) {     // magic number of inefficient memcpy()
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
		for (i = gcx - hx; i < gcx; i++) {	// west peridocity //
			for (j = jb; j <= je; j++) {
				idx = i * nyz + j * nz;
				for (k = kb; k <= ke; k++)
					x[idx + k] = x[idx + stride + k];
			}
		}

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
		for (i = gcx - hx; i < gcx; i++) {	// east periodicity //
			for (j = jb; j <= je; j++) {
				idx = i * nyz + j * nz + stride + shx;
				for (k = kb; k <= ke; k++)
					x[idx + k] = x[idx - stride + k];
			}
		}
	}
	else
	{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
		for (i = gcx - hx; i < gcx; i++) {	// west peridocity //
			for (j = jb; j <= je; j++) {
				idx = i * nyz + j * nz + kb;
				memcpy(&x[idx], &x[idx + stride], block_size);
			}
		}

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
		for (i = gcx - hx; i < gcx; i++) {	// east peridocity //
			for (j = jb; j <= je; j++) {
				idx = i * nyz + j * nz + kb + shx;
				memcpy(&x[idx + stride], &x[idx], block_size);
			}
		}
	}
}
template< typename T >
void nse::apply_periodic_y_omp(T* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz)
{
	const int nyz = ny * nz;
	const int stride = (ny - 2 * gcy) * nz;
	const int shy = hy * nz;
	const int ib = gcx - hx, ie = nx - gcx + hx - 1;
	const int kb = gcz - hz, ke = nz - gcz + hz - 1;
	const int block_size = (ke - kb + 1) * sizeof(T);

	int i, j, k, idx;

	if (block_size < MIN_MEMCPY_BLOCK) {     // magic number of inefficient memcpy()
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
		for (i = ib; i <= ie; i++) {	// south periodicity //
			for (j = gcy - hy; j < gcy; j++) {
				idx = i * nyz + j * nz;
				for (k = kb; k <= ke; k++)
					x[idx + k] = x[idx + stride + k];
			}
		}

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
		for (i = ib; i <= ie; i++) {	// north periodicity //
			for (j = gcy - hy; j < gcy; j++) {
				idx = i * nyz + j * nz + stride + shy;
				for (k = kb; k <= ke; k++)
					x[idx + k] = x[idx - stride + k];
			}
		}
	}
	else
	{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
		for (i = ib; i <= ie; i++) {	// south periodicity //
			for (j = gcy - hy; j < gcy; j++) {
				idx = i * nyz + j * nz + kb;
				memcpy(&x[idx], &x[idx + stride], block_size);
			}
		}

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
		for (i = ib; i <= ie; i++) {	// north periodicity //
			for (j = gcy - hy; j < gcy; j++) {
				idx = i * nyz + j * nz + kb + shy;
				memcpy(&x[idx + stride], &x[idx], block_size);
			}
		}
	}
}
template< typename T >
void nse::apply_periodic_z_omp(T* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz)
{
	const int nyz = ny * nz;
	const int stride = (nz - 2 * gcz);
	const int ib = gcx - hx, ie = nx - gcx + hx - 1;
	const int jb = gcy - hy, je = ny - gcy + hy - 1;

	int i, j, k, idx;

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = ib; i <= ie; i++) {	// bottom periodicity //
		for (j = jb; j <= je; j++) {
			idx = i * nyz + j * nz;
			for (k = gcz - hz; k < gcz; k++)
				x[idx + k] = x[idx + stride + k];
		}
	}

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = ib; i <= ie; i++) {	// top periodicity //
		for (j = jb; j <= je; j++) {
			idx = i * nyz + j * nz;
			for (k = nz - gcz; k < nz - gcz + hz; k++)
				x[idx + k] = x[idx - stride + k];
		}
	}
}

// * apply -x, -y, -z periodicity - colored * //
// ------------------------------------------ //
template< nse::memType mem, typename T >
inline void nse::apply_periodic_x(T* _RESTRICT x, const int color,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU) {
		nse_gpu::apply_periodic_x(x, color,
			nx, ny, nz, gcx, gcy, gcz, hx, hy, hz);
	}
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			apply_periodic_x_omp(x, color, nx, ny, nz,
				gcx, gcy, gcz, hx, hy, hz);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				apply_periodic_x_omp(x, color, nx, ny, nz,
					gcx, gcy, gcz, hx, hy, hz);
			}
		}
	}
}

template< nse::memType mem, typename T >
inline void nse::apply_periodic_y(T* _RESTRICT x, const int color,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU) {
		nse_gpu::apply_periodic_y(x, color,
			nx, ny, nz, gcx, gcy, gcz, hx, hy, hz);
	}
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			apply_periodic_y_omp(x, color, nx, ny, nz,
				gcx, gcy, gcz, hx, hy, hz);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				apply_periodic_y_omp(x, color, nx, ny, nz,
					gcx, gcy, gcz, hx, hy, hz);
			}
		}
	}
}

template< nse::memType mem, typename T >
inline void nse::apply_periodic_z(T* _RESTRICT x, const int color,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU) {
		nse_gpu::apply_periodic_z(x, color,
			nx, ny, nz, gcx, gcy, gcz, hx, hy, hz);
	}
	else
#endif
	{	// memCPU //
		if (omp_in_parallel()) {
			apply_periodic_z_omp(x, color, nx, ny, nz,
				gcx, gcy, gcz, hx, hy, hz);
		}
		else
		{
#pragma omp parallel shared( x ) 
			{
				apply_periodic_z_omp(x, color, nx, ny, nz,
					gcx, gcy, gcz, hx, hy, hz);
			}
		}
	}
}

template< typename T >
void nse::apply_periodic_x_omp(T* _RESTRICT x, const int color,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz)
{
	const int nyz = ny * nz;
	const int stride = (nx - 2 * gcx) * nyz;
	const int ish = hx + nx - 2 * gcx;
	const int shx = hx * nyz;
	const int jb = gcy - hy, je = ny - gcy + hy - 1;
	const int kb = gcz - hz, ke = nz - gcz + hz - 1;

	int i, j, k, idx, csh;


#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = gcx - hx; i < gcx; i++) {			// west periodicity //
		for (j = jb; j <= je; j++) {
			csh = (i + j + kb + color) & 1;
			idx = i * nyz + j * nz;
			for (k = kb + csh; k <= ke; k += 2)
				x[idx + k] = x[idx + stride + k];
		}
	}

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = gcx - hx; i < gcx; i++) {			// east periodicity //
		for (j = jb; j <= je; j++) {
			csh = (i + ish + j + kb + color) & 1;
			idx = i * nyz + j * nz + stride + shx;
			for (k = kb + csh; k <= ke; k += 2)
				x[idx + k] = x[idx - stride + k];
		}
	}
}
template< typename T >
void nse::apply_periodic_y_omp(T* _RESTRICT x, const int color,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz)
{
	const int nyz = ny * nz;
	const int stride = (ny - 2 * gcy) * nz;
	const int jsh = hy + ny - 2 * gcy;
	const int shy = hy * nz;
	const int ib = gcx - hx, ie = nx - gcx + hx - 1;
	const int kb = gcz - hz, ke = nz - gcz + hz - 1;

	int i, j, k, idx, csh;

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = ib; i <= ie; i++) {	// south periodicity //
		for (j = gcy - hy; j < gcy; j++) {
			csh = (i + j + kb + color) & 1;
			idx = i * nyz + j * nz;
			for (k = kb + csh; k <= ke; k += 2)
				x[idx + k] = x[idx + stride + k];
		}
	}

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = ib; i <= ie; i++) {	// north periodicity //
		for (j = gcy - hy; j < gcy; j++) {
			csh = (i + j + jsh + kb + color) & 1;
			idx = i * nyz + j * nz + stride + shy;
			for (k = kb + csh; k <= ke; k += 2)
				x[idx + k] = x[idx - stride + k];
		}
	}
}
template< typename T >
void nse::apply_periodic_z_omp(T* _RESTRICT x, const int color,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz)
{
	const int nyz = ny * nz;
	const int stride = (nz - 2 * gcz);
	const int ib = gcx - hx, ie = nx - gcx + hx - 1;
	const int jb = gcy - hy, je = ny - gcy + hy - 1;

	int i, j, k, idx, csh;

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = ib; i <= ie; i++) {	// bottom periodicity //
		for (j = jb; j <= je; j++) {
			idx = i * nyz + j * nz;
			csh = (i + j + gcz - hz + color) & 1;
			for (k = gcz - hz + csh; k < gcz; k += 2)
				x[idx + k] = x[idx + stride + k];
		}
	}

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
	for (i = ib; i <= ie; i++) {	// top periodicity //
		for (j = jb; j <= je; j++) {
			idx = i * nyz + j * nz;
			csh = (i + j + nz - gcz + color) & 1;
			for (k = nz - gcz + csh; k < nz - gcz + hz; k += 2)
				x[idx + k] = x[idx - stride + k];
		}
	}
}

// * implementation: get 3d sub array * //
// ------------------------------------ //
template< nse::memType memIN, nse::memType memOUT, typename T >
inline void nse::get_sub_array(const T* _RESTRICT const in,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,
	T* _RESTRICT out)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (((memIN == memCPU) && (memOUT == memGPU)) ||
		((memIN == memGPU) && (memOUT == memCPU)))
	{
		const int c_size = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1);

		T* buf;
		int buf_id;
		if (memIN == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
		if (memIN == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);

		get_sub_array<memIN, memIN>(in, nx, ny, nz,
			ib, ie, jb, je, kb, ke, buf);
		mcopy<memOUT, memIN>(out, buf, c_size);

		if (memIN == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
		if (memIN == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
	}
	else
		if ((memIN == memGPU) && (memOUT == memGPU)) {
			nse_gpu::get_sub_array(in, nx, ny, nz,
				ib, ie, jb, je, kb, ke, out);
		}
		else
#endif
		{	// memCPU -> memCPU //
			if (omp_in_parallel()) {
				get_sub_array_omp(in, nx, ny, nz,
					ib, ie, jb, je, kb, ke, out);
			}
			else
			{
#pragma omp parallel shared( out ) 
				{
					get_sub_array_omp(in, nx, ny, nz,
						ib, ie, jb, je, kb, ke, out);
				}
			}
		}
}

template< typename T >
void nse::get_sub_array_omp(const T* _RESTRICT const in,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,
	T* _RESTRICT out)
{
	const int nyz = ny * nz;
	const int cy = je - jb + 1, cz = ke - kb + 1;
	const int cyz = cy * cz;
	const int block_size = cz * sizeof(T);

	int i, j, k, idx, odx;

	if (block_size < MIN_MEMCPY_BLOCK) {     // magic number of inefficient memcpy()
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
		for (i = ib; i <= ie; i++) {
			for (j = jb; j <= je; j++) {
				idx = i * nyz + j * nz;
				odx = (i - ib) * cyz + (j - jb) * cz - kb;
				for (k = kb; k <= ke; k++)
					out[odx + k] = in[idx + k];
			}
		}
	}
	else
	{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
		for (i = ib; i <= ie; i++) {
			for (j = jb; j <= je; j++) {
				idx = i * nyz + j * nz + kb;
				odx = (i - ib) * cyz + (j - jb) * cz;
				memcpy(&out[odx], &in[idx], block_size);
			}
		}
	}
}

// * implementation: put 3d sub array * //
// ------------------------------------ //
template< nse::memType memOUT, nse::memType memIN, typename T >
inline void nse::put_sub_array(T* _RESTRICT out,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,
	const T* _RESTRICT const in)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (((memIN == memCPU) && (memOUT == memGPU)) ||
		((memIN == memGPU) && (memOUT == memCPU)))
	{
		const int c_size = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1);

		T* buf;
		int buf_id;
		if (memIN == memCPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
		if (memIN == memGPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);

		mcopy<memOUT, memIN>(buf, in, c_size);
		put_sub_array<memOUT, memOUT>(out, nx, ny, nz,
			ib, ie, jb, je, kb, ke, buf);

		if (memIN == memCPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
		if (memIN == memGPU) nse_gpu::cudaStx::free_host_buf(buf_id);
	}
	else
		if ((memIN == memGPU) && (memOUT == memGPU)) {
			nse_gpu::put_sub_array(out, nx, ny, nz,
				ib, ie, jb, je, kb, ke, in);
		}
		else
#endif
		{	// memCPU <- memCPU 
			if (omp_in_parallel()) {
				put_sub_array_omp(out, nx, ny, nz,
					ib, ie, jb, je, kb, ke, in);
			}
			else
			{
#pragma omp parallel shared( out ) 
				{
					put_sub_array_omp(out, nx, ny, nz,
						ib, ie, jb, je, kb, ke, in);
				}
			}
		}
}


template< typename T >
void nse::put_sub_array_omp(T* _RESTRICT out,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,
	const T* _RESTRICT const in)
{
	const int nyz = ny * nz;
	const int cy = je - jb + 1, cz = ke - kb + 1;
	const int cyz = cy * cz;
	const int block_size = cz * sizeof(T);

	int i, j, k, idx, odx;

	if (block_size < MIN_MEMCPY_BLOCK) {     // magic number of inefficient memcpy()
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
		for (i = ib; i <= ie; i++) {
			for (j = jb; j <= je; j++) {
				odx = i * nyz + j * nz;
				idx = (i - ib) * cyz + (j - jb) * cz - kb;
				for (k = kb; k <= ke; k++)
					out[odx + k] = in[idx + k];
			}
		}
	}
	else
	{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
#else
#pragma omp for nowait
#endif
		for (i = ib; i <= ie; i++) {
			for (j = jb; j <= je; j++) {
				odx = i * nyz + j * nz + kb;
				idx = (i - ib) * cyz + (j - jb) * cz;
				memcpy(&out[odx], &in[idx], block_size);
			}
		}
	}
}

// * implementation: get 3d sub array - colored * //
// ---------------------------------------------- //
template< nse::memType memIN, nse::memType memOUT, typename T >
void nse::get_sub_array(const T* _RESTRICT const in, const int color,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,
	T* _RESTRICT out)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (((memIN == memCPU) && (memOUT == memGPU)) ||
		((memIN == memGPU) && (memOUT == memCPU)))
	{
		const int c_size = get_num_colored(color, ib, ie, jb, je, kb, ke);

		T* buf;
		int buf_id;
		if (memIN == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
		if (memIN == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);

		get_sub_array<memIN, memIN>(in, color, nx, ny, nz,
			ib, ie, jb, je, kb, ke, buf);
		mcopy<memOUT, memIN>(out, buf, c_size);

		if (memIN == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
		if (memIN == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
	}
	else
		if ((memIN == memGPU) && (memOUT == memGPU)) {
			nse_gpu::get_sub_array(in, color, nx, ny, nz,
				ib, ie, jb, je, kb, ke, out);
		}
		else
#endif
		{	// memCPU -> memCPU //
#ifdef USE_DEPRECATED_COLOR_CP
			const int nyz = ny * nz;

			int i, j, k;
			int idx, odx = 0;
			int sh;

			for (i = ib; i <= ie; i++)
			{
				idx = i * nyz + jb * nz;
				sh = (i + jb + kb + color) & 1;

				for (j = jb; j <= je; j++, idx += nz) {
					for (k = kb + sh; k <= ke; k += 2, odx++) {
						out[odx] = in[idx + k];
					}
					sh = !sh;	// change shift for next column //
				}
			}
#else

			if (omp_in_parallel()) {
				get_sub_array_omp(in, color, nx, ny, nz,
					ib, ie, jb, je, kb, ke, out);
			}
			else
			{
#pragma omp parallel shared( out ) 
				{
					get_sub_array_omp(in, color, nx, ny, nz,
						ib, ie, jb, je, kb, ke, out);
				}
			}
#endif
		}
}


template< typename T >
void nse::get_sub_array_omp(const T* _RESTRICT const in, const int color,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,
	T* _RESTRICT out)
{
	const int nyz = ny * nz;
	int i, j, k, idx, odx, sh;

	// colored plane and column sizes //
	const int cz = (ke - kb + 1);
	const int cyz = (je - jb + 1) * cz;
	const int cyzh = get_num_colored(color,	// half-colored-plane size
		ib, ib, jb, je, kb, ke);
#ifdef USE_OPENMP_2D_CYCLE
	const int czh = get_num_colored(color,	// half-colored-column size
		ib, ib, jb, jb, kb, ke);
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
	for (i = ib; i <= ie; i++)
	{
		for (j = jb; j <= je; j++)
		{
			sh = (i + j + kb + color) & 1;
			idx = i * nyz + j * nz;
			odx = ((i - ib) >> 1) * cyz + ((i - ib) & 1) * cyzh +
				((j - jb) >> 1) * cz + ((j - jb) & 1) * czh;
#else
#pragma omp for nowait
	for (i = ib; i <= ie; i++)
	{
		sh = (i + jb + kb + color) & 1;
		idx = i * nyz + jb * nz;
		odx = ((i - ib) >> 1) * cyz + ((i - ib) & 1) * cyzh;
		for (j = jb; j <= je; j++, idx += nz)
		{
#endif
			for (k = kb + sh; k <= ke; k += 2, odx++) {
				out[odx] = in[idx + k];
			}

#ifndef USE_OPENMP_2D_CYCLE
			sh = !sh;	// change shift for next column //
#endif
		}
	}
}

// * implementation: put 3d sub array - colored * //
// ---------------------------------------------- //
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::put_sub_array(T* _RESTRICT out, const int color,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,
	const T* _RESTRICT const in)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (((memIN == memCPU) && (memOUT == memGPU)) ||
		((memIN == memGPU) && (memOUT == memCPU)))
	{
		const int c_size = get_num_colored(color, ib, ie, jb, je, kb, ke);

		T* buf;
		int buf_id;
		if (memIN == memCPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
		if (memIN == memGPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);

		mcopy<memOUT, memIN>(buf, in, c_size);
		put_sub_array<memOUT, memOUT>(out, color, nx, ny, nz,
			ib, ie, jb, je, kb, ke, buf);

		if (memIN == memCPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
		if (memIN == memGPU) nse_gpu::cudaStx::free_host_buf(buf_id);
	}
	else
		if ((memIN == memGPU) && (memOUT == memGPU)) {
			nse_gpu::put_sub_array(out, color, nx, ny, nz,
				ib, ie, jb, je, kb, ke, in);
		}
		else
#endif
		{	// memCPU <- memCPU //
#ifdef USE_DEPRECATED_COLOR_CP
			const int nyz = ny * nz;

			int i, j, k;
			int odx, idx = 0;
			int sh;

			for (i = ib; i <= ie; i++)
			{
				odx = i * nyz + jb * nz;
				sh = (i + jb + kb + color) & 1;

				for (j = jb; j <= je; j++, odx += nz) {
					for (k = kb + sh; k <= ke; k += 2, idx++) {
						out[odx + k] = in[idx];
					}
					sh = !sh;	// change shift for next column //
				}
			}
#else

			if (omp_in_parallel()) {
				put_sub_array_omp(out, color, nx, ny, nz,
					ib, ie, jb, je, kb, ke, in);
			}
			else
			{
#pragma omp parallel shared( out ) 
				{
					put_sub_array_omp(out, color, nx, ny, nz,
						ib, ie, jb, je, kb, ke, in);
				}
			}
#endif
		}
}


template< typename T >
void nse::put_sub_array_omp(T* _RESTRICT out, const int color,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,
	const T* _RESTRICT const in)
{
	const int nyz = ny * nz;
	int i, j, k, idx, odx, sh;

	// colored plane and column sizes //
	const int cz = (ke - kb + 1);
	const int cyz = (je - jb + 1) * cz;
	const int cyzh = get_num_colored(color,	// half-colored-plane size
		ib, ib, jb, je, kb, ke);
#ifdef USE_OPENMP_2D_CYCLE
	const int czh = get_num_colored(color,	// half-colored-column size
		ib, ib, jb, jb, kb, ke);
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
	for (i = ib; i <= ie; i++)
	{
		for (j = jb; j <= je; j++)
		{
			sh = (i + j + kb + color) & 1;
			odx = i * nyz + j * nz;
			idx = ((i - ib) >> 1) * cyz + ((i - ib) & 1) * cyzh +
				((j - jb) >> 1) * cz + ((j - jb) & 1) * czh;
#else
#pragma omp for nowait
	for (i = ib; i <= ie; i++)
	{
		sh = (i + jb + kb + color) & 1;
		odx = i * nyz + jb * nz;
		idx = ((i - ib) >> 1) * cyz + ((i - ib) & 1) * cyzh;
		for (j = jb; j <= je; j++, odx += nz)
		{
#endif
			for (k = kb + sh; k <= ke; k += 2, idx++) {
				out[odx + k] = in[idx];
			}

#ifndef USE_OPENMP_2D_CYCLE
			sh = !sh;	// change shift for next column //
#endif
		}
	}
}

// * implementation: copy 3d sub array * //
// ------------------------------------ //
template< nse::memType memOUT, nse::memType memIN, typename T >
inline void nse::copy_sub_array(T* _RESTRICT out,
	const int nx, const int ny, const int nz,
	const int posx, const int posy, const int posz,
	const T* _RESTRICT const in,
	const int subnx, const int subny, const int subnz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (((memIN == memCPU) && (memOUT == memGPU)) ||
		((memIN == memGPU) && (memOUT == memCPU)))
	{
		const int c_size = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1);

		T* buf;
		int buf_id;
		if (memOUT == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
		if (memOUT == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);

		get_sub_array<memIN, memOUT>(in, subnx, subny, subnz,
			ib, ie, jb, je, kb, ke, buf);
		put_sub_array<memOUT, memOUT>(out, nx, ny, nz,
			posx, posx + (ie - ib),
			posy, posy + (je - jb),
			posz, posz + (ke - kb), buf);

		if (memOUT == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
		if (memOUT == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
	}
	else
		if ((memIN == memGPU) && (memOUT == memGPU)) {
			nse_gpu::copy_sub_array(out, nx, ny, nz, posx, posy, posz,
				in, subnx, subny, subnz, ib, ie, jb, je, kb, ke);
		}
		else
#endif
		{	// memCPU <- memCPU //
			if (omp_in_parallel()) {
				copy_sub_array_omp(out, nx, ny, nz, posx, posy, posz,
					in, subnx, subny, subnz, ib, ie, jb, je, kb, ke);
			}
			else
			{
#pragma omp parallel shared( out ) 
				{
					copy_sub_array_omp(out, nx, ny, nz, posx, posy, posz,
						in, subnx, subny, subnz, ib, ie, jb, je, kb, ke);
				}
			}
		}
}

template< typename T >
void nse::copy_sub_array_omp(T* _RESTRICT out,
	const int nx, const int ny, const int nz,
	const int posx, const int posy, const int posz,
	const T* _RESTRICT const in,
	const int subnx, const int subny, const int subnz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke)
{
	const int nyz = ny * nz;
	const int subnyz = subny * subnz;
	const int block_size = (ke - kb + 1) * sizeof(T);
	const int shx = posx - ib, shy = posy - jb;

	int i, j, idx, odx;

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
#else
#pragma omp for nowait
#endif
	for (i = ib; i <= ie; i++) {
		for (j = jb; j <= je; j++)
		{
			odx = (i + shx) * nyz + (j + shy) * nz + posz;
			idx = i * subnyz + j * subnz + kb;

			memcpy(&out[odx], &in[idx], block_size);
		}
	}
}

// * implementation: get number of colored elements //
// ------------------------------------------------ //
inline int nse::get_num_colored(const int color,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke)
{
	const int length = ie - ib + 1;
	const int width = je - jb + 1;
	const int height = ke - kb + 1;

	if ((length & 1) && (width & 1) && (height & 1)) {

		const int sh = !((ib + jb + kb + color) & 1);
		return ((length - 1) >> 1) * width * height +
			((width - 1) >> 1) * height +
			((height + sh) >> 1);

	}
	else
		return ((length * width * height) >> 1);
}