grid3d.h

#pragma once

// [grid3d.h]: 3D primary virtual grid
//
// -------------------------------------------------------------------------------------------- //
// TO DO:
// 

#include "nse-sys.h"
#include "mpi-com3d.h"
#include "cart-sys3d.h"
#include "vecmath.h"

#include "grid-id.h"

#ifndef EXCLUDE_GPU_BRANCH
#include "grid.cuh"
#endif

// use hand-coded openmp array reduction 
//   in -xy averaging without critical sections
//   * - use on Intel Xeon Phi
#define USE_OMP_PAR_REDUCE_IN_AVG_XY

#define USE_GRID3D_BINARY_LOCATE


namespace nse
{
	namespace nse_const3d 
	{
		enum nodeType {
			nodeU = 0, nodeV = 1, nodeW = 2, nodeC = 3,
			nodeUV = 4, nodeUW = 5, nodeVW = 6, nodeUVW = 7
		};
	}

	// --- grid node after averaging
	nse_const3d::nodeType avg_node(
		const nse_const3d::nodeType node,
		const nse_const3d::axisType axis);
	// ------------------------------------------------------------------------------------------------ //


	// * 3D primart grid: Grid3d< T > [ T = float, double ] * //
	// =======================================================================
	template< typename T, memType mem = memCPU >
	class Grid3d
	{
	public:
		Grid3d();
		Grid3d(const Grid3d& grid);
		virtual ~Grid3d();

		// grid parameters by axis
		int dim_size(const nse_const3d::axisType axis) const;
		int mpi_dim_size(const nse_const3d::axisType axis) const;

		int ghost_region_size(	// single layer //
			const nse_const3d::axisType axis) const;	


		// MPI global cell index [== -1 - on failure]
		virtual int mpi_locate_x(const T x) const = 0;
		virtual int mpi_locate_y(const T y) const = 0;
		virtual int mpi_locate_z(const T z) const = 0;

		// local cell index [== -1 - on failure]
		//  - only one MPI process returns >=0 for (x,y,z) calls
		virtual int locate_x(const T x) const;
		virtual int locate_y(const T y) const;
		virtual int locate_z(const T z) const;

		// local local cell index on single MPI process [== -1 - on failure]
		//  - searching in segment [iexp - iwidth, iexp + iwdith]
		virtual int locate_local_x(const T x, const int iexp, const int iwidth) const;
		virtual int locate_local_y(const T y, const int jexp, const int jwidth) const;
		virtual int locate_local_z(const T z, const int kexp, const int kwidth) const;
		//	* search for a list of coordinates
		virtual void locate_local_x(const T* _RESTRICT const x,
			int* _RESTRICT iexp, const int iwidth, const int n) const;
		virtual void locate_local_y(const T* _RESTRICT const y,
			int* _RESTRICT jexp, const int jwidth, const int n) const;
		virtual void locate_local_z(const T* _RESTRICT const z,
			int* _RESTRICT kexp, const int kwidth, const int n) const;

		// (i,j,k) MPI-local coordinates [input - global index]
		//	- only one MPI process returns >= 0
		virtual int i_local_coord(const int i) const;
		virtual int j_local_coord(const int j) const;
		virtual int k_local_coord(const int k) const;

		// interpolation (local relative to (x,y,z) position in processor domain)
		virtual T c_interp(const T* X, const T x, const T y, const T z) const = 0;
		virtual T u_interp(const T* U, const T x, const T y, const T z) const = 0;
		virtual T v_interp(const T* V, const T x, const T y, const T z) const = 0;
		virtual T w_interp(const T* W, const T x, const T y, const T z) const = 0;

		// local interpolation on single MPI process 
		// [unsafe] - not checking if coordinates [(x,y,z),(i,j,k)] are correct
		virtual T u_interp_local(const T* _RESTRICT const U,
			const T x, const T y, const T z, const int i, const int j, const int k) const = 0;
		virtual T v_interp_local(const T* _RESTRICT const V,
			const T x, const T y, const T z, const int i, const int j, const int k) const = 0;
		virtual T w_interp_local(const T* _RESTRICT const W,
			const T x, const T y, const T z, const int i, const int j, const int k) const = 0;

		virtual void u_interp_local(T* _RESTRICT uinterp, const T* _RESTRICT const U,
			const T* _RESTRICT const x, const T* _RESTRICT const y, const T* _RESTRICT const z,
			const int* _RESTRICT const i, const int* _RESTRICT const j, const int* _RESTRICT const k, const int n) const = 0;
		virtual void v_interp_local(T* _RESTRICT vinterp, const T* _RESTRICT const V,
			const T* _RESTRICT const x, const T* _RESTRICT const y, const T* _RESTRICT const z,
			const int* _RESTRICT const i, const int* _RESTRICT const j, const int* _RESTRICT const k, const int n) const = 0;
		virtual void w_interp_local(T* _RESTRICT winterp, const T* _RESTRICT const W,
			const T* _RESTRICT const x, const T* _RESTRICT const y, const T* _RESTRICT const z,
			const int* _RESTRICT const i, const int* _RESTRICT const j, const int* _RESTRICT const k, const int n) const = 0;

		// interpolation (global to (x,y,z) point position)
		virtual T mpi_c_interp(const T* X, const T x, const T y, const T z) const;
		virtual T mpi_u_interp(const T* U, const T x, const T y, const T z) const;
		virtual T mpi_v_interp(const T* V, const T x, const T y, const T z) const;
		virtual T mpi_w_interp(const T* W, const T x, const T y, const T z) const;

		// slicing
		// -xy slice, [C,U,V,W] -> [C,U,V,C]
		void c_slice_at_z(T* Pxy, const T* X, const T z) const;
		void u_slice_at_z(T* Pxy, const T* U, const T z) const;
		void v_slice_at_z(T* Pxy, const T* V, const T z) const;
		void w_slice_at_z(T* Pxy, const T* W, const T z) const;
		// -xz slice, [C,U,V,W] -> [C,U,C,W]
		void c_slice_at_y(T* Pxz, const T* X, const T y) const;
		void u_slice_at_y(T* Pxz, const T* U, const T y) const;
		void v_slice_at_y(T* Pxz, const T* V, const T y) const;
		void w_slice_at_y(T* Pxz, const T* W, const T y) const;
		// -yz slice [C,U,V,W] -> [C,C,V,W]
		void c_slice_at_x(T* Pyz, const T* X, const T x) const;
		void u_slice_at_x(T* Pyz, const T* U, const T x) const;
		void v_slice_at_x(T* Pyz, const T* V, const T x) const;
		void w_slice_at_x(T* Pyz, const T* W, const T x) const;

		// MPI gather-scatter by axis //
		template< memType memOUT = memCPU, memType memIN = memCPU, typename Tin >
		void mpi_gather(Tin* _RESTRICT out, const Tin* _RESTRICT in,
			const int host, const nse_const3d::axisType axis) const;
		template< memType memOUT = memCPU, memType memIN = memCPU, typename Tin >
		void mpi_scatter(Tin* _RESTRICT out, const Tin* _RESTRICT in,
			const int host, const nse_const3d::axisType axis) const;

		// MPI gather coordinates //
		template< memType memOUT = memCPU >
		void mpi_gather_center_coord(T* _RESTRICT out,
			const int host, const nse_const3d::axisType axis) const;
		template< memType memOUT = memCPU >
		void mpi_gather_edge_coord(T* _RESTRICT out,
			const int host, const nse_const3d::axisType axis) const;

		template< memType memOUT = memCPU >
		void mpi_gather_center_coord(
			T* _RESTRICT xout, T* _RESTRICT yout, T* _RESTRICT zout, const int host) const;
		template< memType memOUT = memCPU >
		void mpi_gather_edge_coord(
			T* _RESTRICT xout, T* _RESTRICT yout, T* _RESTRICT zout, const int host) const;


		// grid re-interpolation out(current grid), in(input grid)
		virtual void grid_reinterp(T* Xout, const T* Xin,				// local in array //
			const nse_const3d::nodeType node, const GridId< T >& id) const = 0;

		// GridId on 3D grids //
		virtual void set_id(GridId< T >& id) const;
		virtual bool check_id(const GridId< T >& id) const;

		bool check_id_dims(const GridId< T >& id) const;


		virtual void set_id(GridId< T >& id, const nse_const3d::axisType axis) const;
		virtual bool check_id(const GridId< T >& id, const nse_const3d::axisType axis) const;

		bool check_id_dims(const GridId< T >& id, const nse_const3d::axisType axis) const;

	public:

		mpiCom3d mpi_com;

		int size, nx, ny, nz, nyz;
		int mpi_nx, mpi_ny, mpi_nz,
			mpi_nxy, mpi_nxz, mpi_nyz, mpi_size;

		int gcx, gcy, gcz;

		T x, y, z;
		T length, width, height;

		T mpi_x, mpi_y, mpi_z;
		T mpi_length, mpi_width, mpi_height;

		T *px, *py, *pz;		// cell-center coordinates //
		T *ex, *ey, *ez;		// cell-edge coordinates //
	};
}

// Implementation [misc]
// -------------------------------------------------------------------------------------------- //
inline
nse::nse_const3d::nodeType nse::avg_node(const nse_const3d::nodeType node,
	const nse_const3d::axisType axis)
{
	if (axis == nse_const3d::axisX) {
		if ((node == nse_const3d::nodeU) || (node == nse_const3d::nodeUV) ||
			(node == nse_const3d::nodeUW) || (node == nse_const3d::nodeUVW))
			return nse_const3d::nodeU;

		return nse_const3d::nodeC;
	}
	if (axis == nse_const3d::axisY) {
		if ((node == nse_const3d::nodeV) || (node == nse_const3d::nodeUV) ||
			(node == nse_const3d::nodeVW) || (node == nse_const3d::nodeUVW))
			return nse_const3d::nodeV;

		return nse_const3d::nodeC;
	}
	if (axis == nse_const3d::axisZ) {
		if ((node == nse_const3d::nodeW) || (node == nse_const3d::nodeUW) ||
			(node == nse_const3d::nodeVW) || (node == nse_const3d::nodeUVW))
			return nse_const3d::nodeW;

		return nse_const3d::nodeC;
	}

	if (axis == nse_const3d::axisXY) {
		if ((node == nse_const3d::nodeV) || (node == nse_const3d::nodeVW))
			return nse_const3d::nodeV;
		if ((node == nse_const3d::nodeU) || (node == nse_const3d::nodeUW))
			return nse_const3d::nodeU;
		if ((node == nse_const3d::nodeUV) || (node == nse_const3d::nodeUVW))
			return nse_const3d::nodeUV;

		return nse_const3d::nodeC;
	}
	if (axis == nse_const3d::axisXZ) {
		if ((node == nse_const3d::nodeW) || (node == nse_const3d::nodeVW))
			return nse_const3d::nodeW;
		if ((node == nse_const3d::nodeU) || (node == nse_const3d::nodeUV))
			return nse_const3d::nodeU;
		if ((node == nse_const3d::nodeUW) || (node == nse_const3d::nodeUVW))
			return nse_const3d::nodeUW;

		return nse_const3d::nodeC;
	}
	if (axis == nse_const3d::axisYZ) {
		if ((node == nse_const3d::nodeW) || (node == nse_const3d::nodeUW))
			return nse_const3d::nodeW;
		if ((node == nse_const3d::nodeV) || (node == nse_const3d::nodeUV))
			return nse_const3d::nodeV;
		if ((node == nse_const3d::nodeVW) || (node == nse_const3d::nodeUVW))
			return nse_const3d::nodeVW;

		return nse_const3d::nodeC;
	}

	// axis == nse_const3d::axisXYZ
	return node;
}
// -------------------------------------------------------------------------------------------- //

// Implementation, Grid3d:
// -------------------------------------------------------------------------------------------- //
namespace nse
{
	template< typename T, memType mem >
	Grid3d< T, mem > ::Grid3d(
	) : mpi_com(),
		size(0), nx(0), ny(0), nz(0), nyz(0),
		mpi_size(0), mpi_nx(0), mpi_ny(0), mpi_nz(0),
		mpi_nxy(0), mpi_nxz(0), mpi_nyz(0),
		gcx(0), gcy(0), gcz(0),

		x((T)0), y((T)0), z((T)0),
		length((T)0), width((T)0), height((T)0),
		mpi_x((T)0), mpi_y((T)0), mpi_z((T)0),
		mpi_length((T)0), mpi_width((T)0), mpi_height((T)0)
	{
	}

	template< typename T, memType mem >
	Grid3d< T, mem > ::Grid3d(
		const Grid3d< T, mem >& grid)
		: mpi_com(grid.mpi_com),
		size(grid.size), nx(grid.nx), ny(grid.ny), nz(grid.nz), nyz(grid.nyz),
		mpi_size(grid.size), mpi_nx(grid.mpi_nx), mpi_ny(grid.mpi_ny), mpi_nz(grid.mpi_nz),
		mpi_nxy(grid.mpi_nxy), mpi_nxz(grid.mpi_nxz), mpi_nyz(grid.mpi_nyz),
		gcx(grid.gcx), gcy(grid.gcy), gcz(grid.gcz),

		x(grid.x), y(grid.y), z(grid.z),
		mpi_x(grid.mpi_x), mpi_y(grid.mpi_y), mpi_z(grid.mpi_z),

		length(grid.length), width(grid.width), height(grid.mpi_height),
		mpi_length(grid.mpi_length), mpi_width(grid.mpi_width), mpi_height(grid.mpi_height)
	{
		if (size > 0) {
			allocate<mem>(&px, nx); mcopy<mem, mem>(px, grid.px, nx);
			allocate<mem>(&py, ny); mcopy<mem, mem>(py, grid.py, ny);
			allocate<mem>(&pz, nz); mcopy<mem, mem>(pz, grid.pz, nz);

			allocate<mem>(&ex, nx); mcopy<mem, mem>(ex, grid.ex, nx);
			allocate<mem>(&ey, ny); mcopy<mem, mem>(ey, grid.ey, ny);
			allocate<mem>(&ez, nz); mcopy<mem, mem>(ez, grid.ez, nz);
		}
	}

	template< typename T, memType mem >
	Grid3d< T, mem > :: ~Grid3d(
	)
	{
		if (size > 0) {

			deallocate<mem>(px); deallocate<mem>(py); deallocate<mem>(pz);
			deallocate<mem>(ex); deallocate<mem>(ey); deallocate<mem>(ez);
		}

	}

	template< typename T, memType mem >
	int Grid3d< T, mem > ::dim_size(const nse_const3d::axisType axis) const
	{
		switch (axis)
		{
		case nse_const3d::axisX: return nx;
		case nse_const3d::axisY: return ny;
		case nse_const3d::axisZ: return nz;
		case nse_const3d::axisXY: return nx * ny;
		case nse_const3d::axisXZ: return nx * nz;
		case nse_const3d::axisYZ: return nyz;
		case nse_const3d::axisXYZ: return size;
		default: return -1;
		}
	}

	template< typename T, memType mem >
	int Grid3d< T, mem > ::mpi_dim_size(const nse_const3d::axisType axis) const
	{
		switch (axis)
		{
		case nse_const3d::axisX: return mpi_nx;
		case nse_const3d::axisY: return mpi_ny;
		case nse_const3d::axisZ: return mpi_nz;
		case nse_const3d::axisXY: return mpi_nxy;
		case nse_const3d::axisXZ: return mpi_nxz;
		case nse_const3d::axisYZ: return mpi_nyz;
		case nse_const3d::axisXYZ: return mpi_size;
		default: return -1;
		}
	}

	template< typename T, memType mem >
	int Grid3d< T, mem > ::ghost_region_size(const nse_const3d::axisType axis) const
	{
		switch (axis)
		{
		case nse_const3d::axisX: return gcx;
		case nse_const3d::axisY: return gcy;
		case nse_const3d::axisZ: return gcz;
		case nse_const3d::axisXY: return gcx * gcy;
		case nse_const3d::axisXZ: return gcx * gcz;
		case nse_const3d::axisYZ: return gcy * gcz;
		case nse_const3d::axisXYZ: return gcx * gcy * gcz;
		default: return -1;
		}
	}

	template< typename T, memType mem >
	inline int Grid3d< T, mem > ::locate_x(const T _x) const
	{
#ifndef EXCLUDE_GPU_BRANCH
		if (mem == memGPU) {
			if (mpi_com.rank_x == 0)
				return nse_gpu::locate(_x, ex, nx, gcx, 0);
			else
				return nse_gpu::locate(_x, ex, nx, gcx, 1);
		}
		else
#endif
		{	// memCPU //

#ifdef USE_GRID3D_BINARY_LOCATE
			const int min_binary_size = 16;
#endif

			int i;
			int ibeg = gcx, iend = nx - gcx - 1;


			if (mpi_com.rank_x == 0)
			{
#ifdef USE_GRID3D_BINARY_LOCATE	
				while (iend - ibeg >= min_binary_size) {
					i = (ibeg + iend) / 2;
					if (_x < ex[i]) {
						iend = i - 1; continue;
					}
					if (_x > ex[i + 1]) {
						ibeg = i + 1; continue;
					}

					return i;
				}
#endif
				for (i = ibeg; i <= iend; i++)
					if ((_x >= ex[i]) && (_x <= ex[i + 1])) { return i; }
			}
			else
			{
#ifdef USE_GRID3D_BINARY_LOCATE
				while (iend - ibeg >= min_binary_size) {
					i = (ibeg + iend) / 2;
					if (_x <= ex[i]) {
						iend = i - 1; continue;
					}
					if (_x > ex[i + 1]) {
						ibeg = i + 1; continue;
					}

					return i;
				}
#endif
				for (i = ibeg; i <= iend; i++)
					if ((_x > ex[i]) && (_x <= ex[i + 1])) { return i; }
			}

			return -1;
		}
	}

	template< typename T, memType mem >
	inline int Grid3d< T, mem > ::locate_y(const T _y) const
	{
#ifndef EXCLUDE_GPU_BRANCH
		if (mem == memGPU) {
			if (mpi_com.rank_y == 0)
				return nse_gpu::locate(_y, ey, ny, gcy, 0);
			else
				return nse_gpu::locate(_y, ey, ny, gcy, 1);
		}
		else
#endif
		{	// memCPU //

#ifdef USE_GRID3D_BINARY_LOCATE
			const int min_binary_size = 16;
#endif

			int j;
			int jbeg = gcy, jend = ny - gcy - 1;


			if (mpi_com.rank_y == 0) {
#ifdef USE_GRID3D_BINARY_LOCATE

				while (jend - jbeg >= min_binary_size) {
					j = (jbeg + jend) / 2;
					if (_y < ey[j]) {
						jend = j - 1; continue;
					}
					if (_y > ey[j + 1]) {
						jbeg = j + 1; continue;
					}

					return j;
				}
#endif
				for (j = jbeg; j <= jend; j++)
					if ((_y >= ey[j]) && (_y <= ey[j + 1])) { return j; }
			}
			else
			{
#ifdef USE_GRID3D_BINARY_LOCATE
				while (jend - jbeg >= min_binary_size) {
					j = (jbeg + jend) / 2;
					if (_y <= ey[j]) {
						jend = j - 1; continue;
					}
					if (_y > ey[j + 1]) {
						jbeg = j + 1; continue;
					}

					return j;
				}
#endif
				for (j = jbeg; j <= jend; j++)
					if ((_y > ey[j]) && (_y <= ey[j + 1])) { return j; }
			}

			return -1;
		}
	}

	template< typename T, memType mem >
	inline int Grid3d< T, mem > ::locate_z(const T _z) const
	{
#ifndef EXCLUDE_GPU_BRANCH
		if (mem == memGPU) {
			if (mpi_com.rank_z == 0)
				return nse_gpu::locate(_z, ez, nz, gcz, 0);
			else
				return nse_gpu::locate(_z, ez, nz, gcz, 1);
		}
		else
#endif
		{	// memCPU //

#ifdef USE_GRID3D_BINARY_LOCATE
			const int min_binary_size = 16;
#endif

			int k;
			int kbeg = gcz, kend = nz - gcz - 1;


			if (mpi_com.rank_z == 0) {
#ifdef USE_GRID3D_BINARY_LOCATE
				while (kend - kbeg >= min_binary_size) {
					k = (kbeg + kend) / 2;
					if (_z < ez[k]) {
						kend = k - 1; continue;
					}
					if (_z > ez[k + 1]) {
						kbeg = k + 1; continue;
					}

					return k;
				}
#endif
				for (k = kbeg; k <= kend; k++)
					if ((_z >= ez[k]) && (_z <= ez[k + 1])) { return k; }
			}
			else
			{
#ifdef USE_GRID3D_BINARY_LOCATE
				while (kend - kbeg >= min_binary_size) {
					k = (kbeg + kend) / 2;
					if (_z <= ez[k]) {
						kend = k - 1; continue;
					}
					if (_z > ez[k + 1]) {
						kbeg = k + 1; continue;
					}

					return k;
				}
#endif
				for (k = kbeg; k <= kend; k++)
					if ((_z > ez[k]) && (_z <= ez[k + 1])) { return k; }
			}

			return -1;
		}
	}

	template< typename T, memType mem >
	inline int Grid3d< T, mem > ::locate_local_x(
		const T _x, const int iexp, const int iwidth) const
	{
#ifdef USE_GRID3D_BINARY_LOCATE
		const int min_binary_size = 16;
#endif

		int i;
		int ibeg = max(iexp - iwidth, gcx);
		int iend = min(iexp + iwidth, nx - gcx - 1);

		if (mpi_com.rank_x == 0) {

#ifdef USE_GRID3D_BINARY_LOCATE
			while (iend - ibeg >= min_binary_size) {
				i = (ibeg + iend) / 2;
				if (_x < ex[i]) {
					iend = i - 1; continue;
				}
				if (_x > ex[i + 1]) {
					ibeg = i + 1; continue;
				}

				return i;
			}
#endif
			for (i = ibeg; i <= iend; i++)
				if ((_x >= ex[i]) && (_x <= ex[i + 1])) { return i; }
		}
		else
		{
#ifdef USE_GRID3D_BINARY_LOCATE
			while (iend - ibeg >= min_binary_size) {
				i = (ibeg + iend) / 2;
				if (_x <= ex[i]) {
					iend = i - 1; continue;
				}
				if (_x > ex[i + 1]) {
					ibeg = i + 1; continue;
				}

				return i;
			}
#endif
			for (i = ibeg; i <= iend; i++)
				if ((_x > ex[i]) && (_x <= ex[i + 1])) { return i; }
		}

		return -1;
	}

	template< typename T, memType mem >
	inline int Grid3d< T, mem > ::locate_local_y(
		const T _y, const int jexp, const int jwidth) const
	{
#ifdef USE_GRID3D_BINARY_LOCATE
		const int min_binary_size = 16;
#endif

		int j;
		int jbeg = max(jexp - jwidth, gcy);
		int jend = min(jexp + jwidth, ny - gcy - 1);

		if (mpi_com.rank_y == 0) {
#ifdef USE_GRID3D_BINARY_LOCATE
			while (jend - jbeg >= min_binary_size) {
				j = (jbeg + jend) / 2;
				if (_y < ey[j]) {
					jend = j - 1; continue;
				}
				if (_y > ey[j + 1]) {
					jbeg = j + 1; continue;
				}

				return j;
			}
#endif
			for (j = jbeg; j <= jend; j++)
				if ((_y >= ey[j]) && (_y <= ey[j + 1])) { return j; }
		}
		else
		{
#ifdef USE_GRID3D_BINARY_LOCATE
			while (jend - jbeg >= min_binary_size) {
				j = (jbeg + jend) / 2;
				if (_y <= ey[j]) {
					jend = j - 1; continue;
				}
				if (_y > ey[j + 1]) {
					jbeg = j + 1; continue;
				}

				return j;
			}
#endif
			for (j = jbeg; j <= jend; j++)
				if ((_y > ey[j]) && (_y <= ey[j + 1])) { return j; }
		}

		return -1;
	}

	template< typename T, memType mem >
	inline int Grid3d< T, mem > ::locate_local_z(
		const T _z, const int kexp, const int kwidth) const
	{
#ifdef USE_GRID3D_BINARY_LOCATE
		const int min_binary_size = 16;
#endif

		int k;
		int kbeg = max(kexp - kwidth, gcz);
		int kend = min(kexp + kwidth, nz - gcz - 1);

		if (mpi_com.rank_z == 0) {
#ifdef USE_GRID3D_BINARY_LOCATE
			while (kend - kbeg >= min_binary_size) {
				k = (kbeg + kend) / 2;
				if (_z < ez[k]) {
					kend = k - 1; continue;
				}
				if (_z > ez[k + 1]) {
					kbeg = k + 1; continue;
				}

				return k;
			}
#endif
			for (k = kbeg; k <= kend; k++)
				if ((_z >= ez[k]) && (_z <= ez[k + 1])) { return k; }
		}
		else
		{
#ifdef USE_GRID3D_BINARY_LOCATE
			while (kend - kbeg >= min_binary_size) {
				k = (kbeg + kend) / 2;
				if (_z <= ez[k]) {
					kend = k - 1; continue;
				}
				if (_z > ez[k + 1]) {
					kbeg = k + 1; continue;
				}

				return k;
			}
#endif
			for (k = kbeg; k <= kend; k++)
				if ((_z > ez[k]) && (_z <= ez[k + 1])) { return k; }
		}

		return -1;
	}

	template< typename T, memType mem >
	inline void Grid3d< T, mem > ::locate_local_x(const T* _RESTRICT const _x,
		int* _RESTRICT ipos, const int iwidth, const int n) const
	{
		int m, i;
		int ibeg, iend, idx;

		if (mpi_com.rank_x == 0) {
#pragma omp parallel for private(m, i, ibeg, iend, idx) shared(ipos)
			for (m = 0; m < n; m++) {

				ibeg = max(ipos[m] - iwidth, gcx);
				iend = min(ipos[m] + iwidth, nx - gcx - 1);

				idx = -1;
				for (i = ibeg; i <= iend; i++)
					if ((_x[m] >= ex[i]) && (_x[m] <= ex[i + 1])) { idx = i; break; }
				ipos[m] = idx;
			}
		}
		else
		{
#pragma omp parallel for private(m, i, ibeg, iend, idx) shared(ipos)
			for (m = 0; m < n; m++) {

				ibeg = max(ipos[m] - iwidth, gcx);
				iend = min(ipos[m] + iwidth, nx - gcx - 1);

				idx = -1;
				for (i = ibeg; i <= iend; i++)
					if ((_x[m] > ex[i]) && (_x[m] <= ex[i + 1])) { idx = i; break; }
				ipos[m] = idx;
			}
		}
	}

	template< typename T, memType mem >
	inline void Grid3d< T, mem > ::locate_local_y(const T* _RESTRICT const _y,
		int* _RESTRICT jpos, const int jwidth, const int n) const
	{
		int m, j;
		int jbeg, jend, idx;

		if (mpi_com.rank_y == 0) {
#pragma omp parallel for private(m, j, jbeg, jend, idx) shared(jpos)
			for (m = 0; m < n; m++) {

				jbeg = max(jpos[m] - jwidth, gcy);
				jend = min(jpos[m] + jwidth, ny - gcy - 1);

				idx = -1;
				for (j = jbeg; j <= jend; j++)
					if ((_y[m] >= ey[j]) && (_y[m] <= ey[j + 1])) { idx = j; break; }
				jpos[m] = idx;
			}
		}
		else
		{
#pragma omp parallel for private(m, j, jbeg, jend, idx) shared(jpos)
			for (m = 0; m < n; m++) {

				jbeg = max(jpos[m] - jwidth, gcy);
				jend = min(jpos[m] + jwidth, ny - gcy - 1);

				idx = -1;
				for (j = jbeg; j <= jend; j++)
					if ((_y[m] > ey[j]) && (_y[m] <= ey[j + 1])) { idx = j; break; }
				jpos[m] = idx;
			}
		}
	}

	template< typename T, memType mem >
	inline void Grid3d< T, mem > ::locate_local_z(const T* _RESTRICT const _z,
		int* _RESTRICT kpos, const int kwidth, const int n) const
	{
		int m, k;
		int kbeg, kend, idx;

		if (mpi_com.rank_z == 0) {
#pragma omp parallel for private(m, k, kbeg, kend, idx) shared(kpos)
			for (m = 0; m < n; m++) {

				kbeg = max(kpos[m] - kwidth, gcz);
				kend = min(kpos[m] + kwidth, nz - gcz - 1);

				idx = -1;
				for (k = kbeg; k <= kend; k++)
					if ((_z[m] >= ez[k]) && (_z[m] <= ez[k + 1])) { idx = k; break; }
				kpos[m] = idx;
			}
		}
		else
		{
#pragma omp parallel for private(m, k, kbeg, kend, idx) shared(kpos)
			for (m = 0; m < n; m++) {

				kbeg = max(kpos[m] - kwidth, gcz);
				kend = min(kpos[m] + kwidth, nz - gcz - 1);

				idx = -1;
				for (k = kbeg; k <= kend; k++)
					if ((_z[m] > ez[k]) && (_z[m] <= ez[k + 1])) { idx = k; break; }
				kpos[m] = idx;
			}
		}
	}

	template< typename T, memType mem >
	int Grid3d< T, mem > ::i_local_coord(const int i) const
	{
		if ((i < 0) & (i > mpi_nx - 1)) return -1;

		const int shmx = (mpi_com.rank_x == 0) ? gcx : 0;
		const int shpx = (mpi_com.rank_x == mpi_com.size_x - 1) ? gcx : 0;

		int ip = i - par_local_offset(mpi_nx, gcx,
			mpi_com.rank_x, mpi_com.size_x);

		return ((ip >= gcx - shmx) && (ip < nx - gcx + shpx)) ? ip : -1;
	}

	template< typename T, memType mem >
	int Grid3d< T, mem > ::j_local_coord(const int j) const
	{
		if ((j < 0) && (j > mpi_ny - 1)) return -1;

		const int shmy = (mpi_com.rank_y == 0) ? gcy : 0;
		const int shpy = (mpi_com.rank_y == mpi_com.size_y - 1) ? gcy : 0;

		int jp = j - par_local_offset(mpi_ny, gcy,
			mpi_com.rank_y, mpi_com.size_y);

		return ((jp >= gcy - shmy) && (jp < ny - gcy + shpy)) ? jp : -1;
	}

	template< typename T, memType mem >
	int Grid3d< T, mem > ::k_local_coord(const int k) const
	{
		if ((k < 0) && (k > mpi_nz - 1)) return -1;

		const int shmz = (mpi_com.rank_z == 0) ? gcz : 0;
		const int shpz = (mpi_com.rank_z == mpi_com.size_z - 1) ? gcz : 0;

		int kp = k - par_local_offset(mpi_nz, gcz,
			mpi_com.rank_z, mpi_com.size_z);

		return ((kp >= gcz - shmz) && (kp < nz - gcz + shpz)) ? kp : -1;
	}

	template< typename T, memType mem >
	T Grid3d< T, mem > ::mpi_c_interp(const T* X, const T _px, const T _py, const T _pz) const
	{
		return mpi_allreduce(
			c_interp(X, _px, _py, _pz), MPI_SUM, mpi_com.comm);
	}

	template< typename T, memType mem >
	T Grid3d< T, mem > ::mpi_u_interp(const T* U, const T _px, const T _py, const T _pz) const
	{
		return mpi_allreduce(
			u_interp(U, _px, _py, _pz), MPI_SUM, mpi_com.comm);
	}

	template< typename T, memType mem >
	T Grid3d< T, mem > ::mpi_v_interp(const T* V, const T _px, const T _py, const T _pz) const
	{
		return mpi_allreduce(
			v_interp(V, _px, _py, _pz), MPI_SUM, mpi_com.comm);
	}

	template< typename T, memType mem >
	T Grid3d< T, mem > ::mpi_w_interp(const T* W, const T _px, const T _py, const T _pz) const
	{
		return mpi_allreduce(
			w_interp(W, _px, _py, _pz), MPI_SUM, mpi_com.comm);
	}

	template< typename T, memType mem >
	void Grid3d< T, mem >::c_slice_at_z(T* Pxy, const T* X, const T _pz) const
	{
		int i, j, k = locate_z(_pz);
		int index, host_rank = -1;

		null(Pxy, nx * ny);
		if ((k >= gcz) && (k < nz - gcz)) {

			const int kpos = (_pz < pz[k]) ? k : k + 1;
			const T alpha = (_pz - pz[kpos - 1]) / (pz[kpos] - pz[kpos - 1]);

#pragma omp parallel for private( i, j, index ) shared( Pxy )
			for (i = gcx; i < nx - gcx; i++) {
				for (j = gcy; j < ny - gcy; j++) {
					index = i * nyz + j * nz + kpos;

					Pxy[i * ny + j] = ((T)1.0 - alpha) * X[index - 1] + alpha * X[index];
				}
			}

			MPI_Comm_rank(mpi_com.comm_z, &host_rank);
		}

		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_z);
		if (host_rank >= 0)
			mpi_broadcast(Pxy, nx * ny, host_rank, mpi_com.comm_z);
	}

	template< typename T, memType mem >
	void Grid3d< T, mem >::u_slice_at_z(T* Pxy, const T* U, const T _pz) const
	{
		int i, j, k = locate_z(_pz);
		int index, host_rank = -1;

		null(Pxy, nx * ny);
		if ((k >= gcz) && (k < nz - gcz)) {

			const int kpos = (_pz < pz[k]) ? k : k + 1;
			const T alpha = (_pz - pz[kpos - 1]) / (pz[kpos] - pz[kpos - 1]);

#pragma omp parallel for private( i, j, index ) shared( Pxy )
			for (i = gcx; i < nx - gcx + 1; i++) {
				for (j = gcy; j < ny - gcy; j++) {
					index = i * nyz + j * nz + kpos;

					Pxy[i * ny + j] = ((T)1.0 - alpha) * U[index - 1] + alpha * U[index];
				}
			}

			MPI_Comm_rank(mpi_com.comm_z, &host_rank);
		}

		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_z);
		if (host_rank >= 0)
			mpi_broadcast(Pxy, nx * ny, host_rank, mpi_com.comm_z);
	}

	template< typename T, memType mem >
	void Grid3d< T, mem >::v_slice_at_z(T* Pxy, const T* V, const T _pz) const
	{
		int i, j, k = locate_z(_pz);
		int index, host_rank = -1;

		null(Pxy, nx * ny);
		if ((k >= gcz) && (k < nz - gcz)) {

			const int kpos = (_pz < pz[k]) ? k : k + 1;
			const T alpha = (_pz - pz[kpos - 1]) / (pz[kpos] - pz[kpos - 1]);

#pragma omp parallel for private( i, j, index ) shared( Pxy )
			for (i = gcx; i < nx - gcx; i++) {
				for (j = gcy; j < ny - gcy + 1; j++) {
					index = i * nyz + j * nz + kpos;

					Pxy[i * ny + j] = ((T)1.0 - alpha) * V[index - 1] + alpha * V[index];
				}
			}

			MPI_Comm_rank(mpi_com.comm_z, &host_rank);
		}

		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_z);
		if (host_rank >= 0)
			mpi_broadcast(Pxy, nx * ny, host_rank, mpi_com.comm_z);
	}

	template< typename T, memType mem >
	void Grid3d< T, mem >::w_slice_at_z(T* Pxy, const T* W, const T _pz) const
	{
		int i, j, k = locate_z(_pz);
		int index, host_rank = -1;

		null(Pxy, nx * ny);
		if ((k >= gcz) && (k < nz - gcz))
		{
			const int kpos = k + 1;
			const T alpha = (_pz - ez[kpos - 1]) / (ez[kpos] - ez[kpos - 1]);

#pragma omp parallel for private( i, j, index ) shared( Pxy )
			for (i = gcx; i < nx - gcx; i++) {
				for (j = gcy; j < ny - gcy; j++) {
					index = i * nyz + j * nz + kpos;

					Pxy[i * ny + j] = ((T)1.0 - alpha) * W[index - 1] + alpha * W[index];
				}
			}

			MPI_Comm_rank(mpi_com.comm_z, &host_rank);
		}