diff --git a/bin-named-stamp.h b/bin-named-stamp.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb7ce3723b1e35038eb5edb5e45fbecda3f04da8
--- /dev/null
+++ b/bin-named-stamp.h
@@ -0,0 +1,278 @@
+#pragma once
+
+// [bin-named-stamp.h]: binNamedStamp data structure for file I/O
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "grid-id.h"
+#include <stdio.h>
+#include <string>
+
+namespace nse
+{
+	// binNamedStamp: growable list of (name, value) records of type T with
+	// binary file I/O and MPI broadcast support.
+	template< typename T >
+	class binNamedStamp {
+	public:
+		binNamedStamp();
+		~binNamedStamp();
+
+
+		// copy the value at index / matching name into *out; false if not found
+		bool get(const int idx, T* out) const;
+		bool get(const std::string& name, T* out) const;
+
+		int get_size() const { return size; }
+
+		// number of elements an fwrite() of this stamp reports (not bytes)
+		int get_record_size() const;
+
+		// add (+=) the value at index / matching name to *out; false if not found
+		bool update(const int idx, T* out) const;
+		bool update(const std::string& name, T* out) const;
+
+		// insert a (name, value) pair; an existing name is overwritten in place
+		void push(const std::string& name, const T in);
+
+		int fwrite(FILE *ptr) const;
+		int fread(FILE* ptr);
+
+		void mpi_broadcast(const int host, const MPI_Comm comm);
+
+
+		void debug_print() const;
+
+	private:
+		int mem_size, size;		// allocated capacity / number of used records
+		T *value;				// parallel arrays of length mem_size
+		std::string *name;
+
+		static const int c_alloc_step = 16;	// minimum growth increment
+
+		void allocate(const int msize);		// non-preserving growth
+		void resize(const int msize);		// content-preserving growth
+	};
+}
+// -------------------------------------------------------------------------------------------- //
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+// construct an empty stamp: no storage until the first push()/fread()
+template< typename T >
+nse::binNamedStamp< T >::binNamedStamp() : mem_size(0), size(0) {}
+template< typename T >
+nse::binNamedStamp< T >::~binNamedStamp()
+{
+	// allocate()/resize() always create 'value' and 'name' together;
+	// release both (the original released only 'value', leaking 'name')
+	if (mem_size > 0) {
+		delete[] value;
+		delete[] name;
+	}
+
+	mem_size = 0;
+	size = 0;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+bool nse::binNamedStamp< T >::get(const int idx, T* out) const
+{
+	// copy the value at position idx into *out; false when idx is outside [0, size)
+	const bool in_range = (idx >= 0) && (idx < size);
+	if (in_range) {
+		*out = value[idx];
+	}
+	return in_range;
+}
+template< typename T >
+bool nse::binNamedStamp< T >::get(const std::string& _name, T* out) const
+{
+	// linear search by name; copies the first matching value into *out
+	int k = 0;
+	while (k < size) {
+		if (_name == name[k]) {
+			*out = value[k];
+			return true;
+		}
+		k++;
+	}
+	return false;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+bool nse::binNamedStamp< T >::update(const int idx, T* out) const
+{
+	// accumulate (+=) the value at position idx into *out;
+	// false when idx is outside [0, size)
+	const bool in_range = (idx >= 0) && (idx < size);
+	if (in_range) {
+		*out += value[idx];
+	}
+	return in_range;
+}
+
+template< typename T >
+bool nse::binNamedStamp< T >::update(const std::string& _name, T* out) const
+{
+	// linear search by name; accumulates (+=) the first matching value into *out
+	int k = 0;
+	while (k < size) {
+		if (_name == name[k]) {
+			*out += value[k];
+			return true;
+		}
+		k++;
+	}
+	return false;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::binNamedStamp< T >::push(const std::string& _name, const T in)
+{
+	// insert-or-overwrite: an existing entry with the same name is updated
+	// in place; otherwise the pair is appended; empty names are rejected
+	if (_name.empty()) return;
+
+	int k = 0;
+	while (k < size) {
+		if (_name == name[k]) {
+			value[k] = in;
+			return;
+		}
+		k++;
+	}
+
+	resize(size + 1);
+	name[size] = _name;
+	value[size] = in;
+	size++;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::binNamedStamp< T >::allocate(const int req_size)
+{
+	// raw (non-preserving) growth to at least req_size elements;
+	// existing contents are discarded (used by fread()/mpi_broadcast())
+	if (req_size <= mem_size) return;
+
+	const int alloc_size = (req_size > mem_size + c_alloc_step) ?
+		req_size : (mem_size + c_alloc_step);
+
+	if (mem_size > 0) {
+		delete[] value;
+		delete[] name;
+	}
+	value = new T[alloc_size];
+	name = new std::string[alloc_size];
+	mem_size = alloc_size;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::binNamedStamp< T >::resize(const int req_size)
+{
+	// Content-preserving growth to at least req_size elements;
+	// grows by at least c_alloc_step to amortize reallocations.
+	if (req_size > mem_size)
+	{
+		int alloc_size = (req_size > mem_size + c_alloc_step) ?
+			req_size : mem_size + c_alloc_step;
+
+		T *cpval = new T[alloc_size];
+		std::string *cpname = new std::string[alloc_size];
+		// element-wise copy instead of memcpy(): valid for any T (memcpy
+		// requires trivially-copyable types) and drops the undeclared
+		// <cstring> dependency
+		for (int k = 0; k < size; k++) {
+			cpval[k] = value[k];
+			cpname[k] = name[k];
+		}
+		if (mem_size > 0) {
+			delete[] value;
+			delete[] name;
+		}
+
+		value = cpval;
+		name = cpname;
+		mem_size = alloc_size;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+int nse::binNamedStamp< T >::fwrite(FILE *ptr) const
+{
+	// Binary layout: [size:int][value[0..size): T] then, per record,
+	// [len:int][len chars (no terminator)].
+	// Returns the accumulated element count reported by ::fwrite
+	// (equals get_record_size() on success).
+	int status = 0;
+
+	status += (int) ::fwrite(&size, sizeof(int), 1, ptr);
+	if (size > 0) {
+		status += (int) ::fwrite(value, sizeof(T), size, ptr);
+	}
+
+	for (int k = 0; k < size; k++) {
+		// std::string::length() instead of strlen(): same result,
+		// no undeclared <cstring> dependency
+		int c_length = (int) name[k].length();
+		status += (int) ::fwrite(&c_length, sizeof(int), 1, ptr);
+		status += (int) ::fwrite(name[k].c_str(), sizeof(char), c_length, ptr);
+	}
+
+	return status;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+int nse::binNamedStamp< T >::fread(FILE* ptr)
+{
+	// Reads back the layout produced by fwrite().
+	// Returns the accumulated element count (matches fwrite() on success).
+	int status = 0;
+
+	status += (int) ::fread(&size, sizeof(int), 1, ptr);
+	if (status != 1 || size < 0) {
+		// header missing or corrupt: leave the stamp empty instead of
+		// allocating/iterating with an indeterminate or negative size
+		size = 0;
+		return status;
+	}
+	allocate(size);
+
+	if (size > 0) {
+		status += (int) ::fread(value, sizeof(T), size, ptr);
+	}
+
+	for (int k = 0; k < size; k++)
+	{
+		int c_length = 0;
+		status += (int) ::fread(&c_length, sizeof(int), 1, ptr);
+		if (c_length < 0) break;	// corrupt record: avoid negative-size allocation
+
+		char *c_buf = new char[c_length + 1];
+		status += (int) ::fread(c_buf, sizeof(char), c_length, ptr);
+		c_buf[c_length] = '\0';
+
+		name[k] = std::string(c_buf);
+
+		delete[] c_buf;
+	}
+
+	return status;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+int nse::binNamedStamp< T >::get_record_size() const
+{
+	// Predicts the element count fwrite() reports (NOT a byte count):
+	// 1 for the size header, 'size' for the values, and per record
+	// 1 for the length header plus the name's character count.
+	int recsize = size + 1;
+
+	for (int k = 0; k < size; k++) {
+		int name_length = strlen(name[k].c_str());
+		recsize += name_length + 1;
+	}
+
+	return recsize;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::binNamedStamp< T >::mpi_broadcast(
+	const int host, const MPI_Comm comm)
+{
+	// Broadcasts size, values and names from rank 'host' to all ranks of 'comm'.
+	int mpi_rank;
+	MPI_Comm_rank(comm, &mpi_rank);
+
+	MPI_Bcast(&size, 1, MPI_INT, host, comm);
+	if (mpi_rank != host) allocate(size);
+
+	if (size > 0) {
+		MPI_Bcast(value, size, mpi_type< T >(), host, comm);
+	}
+
+	for (int k = 0; k < size; k++) {
+		int c_length = 0;
+		if (mpi_rank == host) c_length = (int) name[k].length();
+		MPI_Bcast(&c_length, 1, MPI_INT, host, comm);
+
+		char *c_buf = new char[c_length + 1];
+		// only the host fills the buffer: the original strcpy ran on every
+		// rank, so a receiver holding a stale name longer than c_length
+		// would overflow c_buf
+		if (mpi_rank == host) {
+			name[k].copy(c_buf, c_length);
+			c_buf[c_length] = '\0';
+		}
+		MPI_Bcast(c_buf, c_length + 1, MPI_CHAR, host, comm);
+
+		if (mpi_rank != host) {
+			name[k] = std::string(c_buf);
+		}
+		delete[] c_buf;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::binNamedStamp< T >::debug_print() const
+{
+	// Dumps all records to stdout, one per line.
+	// NOTE(review): "%.5f" assumes T promotes to double in varargs
+	// (float/double); an integer T would be undefined behavior — confirm
+	// the instantiations used.
+	for (int k = 0; k < size; k++) {
+		printf(" idx = %i, name = %s, value = %.5f\n", k, name[k].c_str(), value[k]);
+	}
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/bin-stamp.h b/bin-stamp.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba267eacdb14931dec049c9869dd7966d33af576
--- /dev/null
+++ b/bin-stamp.h
@@ -0,0 +1,182 @@
+#pragma once
+
+// [bin-stamp.h]: binStamp data structure for file I/O
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "grid-id.h"
+#include <stdio.h>
+
+namespace nse
+{
+	// binStamp: growable array of T records with binary file I/O and
+	// MPI broadcast support (unnamed counterpart of binNamedStamp).
+	template< typename T >
+	class binStamp {
+
+	public:
+		binStamp();
+		~binStamp();
+
+
+		// copy / accumulate (+=) the value at idx into *out; false if out of range
+		bool get(const int idx, T* out) const;
+		bool update(const int idx, T* out) const;
+
+		// append one record
+		void push(const T in);
+
+		int fwrite(FILE *ptr) const;
+
+		int fread(FILE* ptr);
+#ifdef _USE_DEPRECATED_WST_FORMAT
+		// legacy format: size is supplied by the caller, not stored in the file
+		int fread(FILE* ptr, const int _size);
+#endif
+
+		void mpi_broadcast(const int host, const MPI_Comm comm);
+
+		// read-only: //
+		int mem_size, size;		// allocated capacity / number of used records
+		T *value;
+
+	private:
+		static const int c_alloc_step = 16;	// minimum growth increment
+
+		void allocate(const int msize);		// non-preserving growth
+		void resize(const int msize);		// content-preserving growth
+	};
+}
+// -------------------------------------------------------------------------------------------- //
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+// construct an empty stamp: no storage until the first push()/fread()
+template< typename T >
+nse::binStamp< T >::binStamp() : mem_size(0), size(0) {}
+template< typename T >
+nse::binStamp< T >::~binStamp()
+{
+	// storage exists only when mem_size > 0 (see allocate()/resize())
+	if (mem_size > 0) {
+		delete[] value;
+	}
+	mem_size = 0;
+	size = 0;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+bool nse::binStamp< T >::get(const int idx, T* out) const
+{
+	// copy element idx into *out; false when idx is outside [0, size)
+	const bool in_range = (idx >= 0) && (idx < size);
+	if (in_range) {
+		*out = value[idx];
+	}
+	return in_range;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+bool nse::binStamp< T >::update(const int idx, T* out) const
+{
+	// accumulate (+=) element idx into *out; false when idx is outside [0, size)
+	const bool in_range = (idx >= 0) && (idx < size);
+	if (in_range) {
+		*out += value[idx];
+	}
+	return in_range;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::binStamp< T >::push(const T in)
+{
+	// append a single record, growing storage when needed
+	const int tail = size;
+	resize(tail + 1);
+	value[tail] = in;
+	size = tail + 1;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::binStamp< T >::allocate(const int req_size)
+{
+	// raw (non-preserving) growth to at least req_size elements;
+	// existing contents are dropped (used by fread()/mpi_broadcast())
+	if (req_size <= mem_size) return;
+
+	const int alloc_size = (req_size > mem_size + c_alloc_step) ?
+		req_size : (mem_size + c_alloc_step);
+
+	if (mem_size > 0) delete[] value;
+	value = new T[alloc_size];
+	mem_size = alloc_size;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::binStamp< T >::resize(const int req_size)
+{
+	// Content-preserving growth to at least req_size elements;
+	// grows by at least c_alloc_step to amortize reallocations.
+	if (req_size > mem_size)
+	{
+		int alloc_size = (req_size > mem_size + c_alloc_step) ?
+			req_size : mem_size + c_alloc_step;
+
+		T *cpval = new T[alloc_size];
+		// element-wise copy instead of memcpy(): valid for any T (memcpy
+		// requires trivially-copyable types) and drops the undeclared
+		// <cstring> dependency
+		for (int k = 0; k < size; k++)
+			cpval[k] = value[k];
+		if (mem_size > 0) delete[] value;
+
+		value = cpval;
+		mem_size = alloc_size;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+int nse::binStamp< T >::fwrite(FILE *ptr) const
+{
+	// binary layout: [size:int][value[0..size): T];
+	// returns the accumulated element count reported by ::fwrite
+	int status = (int) ::fwrite(&size, sizeof(int), 1, ptr);
+	if (size > 0) {
+		status += (int) ::fwrite(value, sizeof(T), size, ptr);
+	}
+	return status;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+int nse::binStamp< T >::fread(FILE* ptr)
+{
+	// Reads back the layout produced by fwrite(): [size:int][size x T].
+	// Returns the accumulated element count (1 + size on success).
+	int status = 0;
+
+	status += (int) ::fread(&size, sizeof(int), 1, ptr);
+	if (status != 1 || size < 0) {
+		// header missing or corrupt: leave the stamp empty instead of
+		// allocating/reading with an indeterminate or negative size
+		size = 0;
+		return status;
+	}
+	allocate(size);
+
+	if (size > 0)
+		status += (int) ::fread(value, sizeof(T), size, ptr);
+
+	return status;
+}
+// -------------------------------------------------------------------------------------------- //
+
+#ifdef _USE_DEPRECATED_WST_FORMAT
+template< typename T >
+int nse::binStamp< T >::fread(FILE* ptr, const int _size)
+{
+	// Legacy reader: the record count is supplied by the caller instead of
+	// being stored in the file; only the raw values are read.
+	// Returns the element count reported by ::fread.
+	int status = 0;
+
+	size = _size;
+	allocate(size);
+
+	if (size > 0)
+		status += ::fread(value, sizeof(T), size, ptr);
+
+	return status;
+}
+// -------------------------------------------------------------------------------------------- //
+#endif
+
+template< typename T >
+void nse::binStamp< T >::mpi_broadcast(
+	const int host, const MPI_Comm comm)
+{
+	// Broadcasts size and values from rank 'host' to all ranks of 'comm';
+	// receiving ranks grow their storage first (allocate() discards old data).
+	int mpi_rank;
+	MPI_Comm_rank(comm, &mpi_rank);
+
+	MPI_Bcast(&size, 1, MPI_INT, host, comm);
+	if (mpi_rank != host) allocate(size);
+
+	if (size > 0)
+		// NOTE(review): mpi_type< T >() presumably maps T to the matching
+		// MPI datatype (declared in grid-id.h) — confirm
+		MPI_Bcast(value, size, mpi_type< T >(), host, comm);
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/bl-flux-def.h b/bl-flux-def.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2b00c8572d3a0d8ef631be741c57adf2ad5c260
--- /dev/null
+++ b/bl-flux-def.h
@@ -0,0 +1,975 @@
+#pragma once
+
+// [bl-flux-def.h]: boundary-layer turbulent fluxes definitions
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "wstgrid3d.h"
+
+
+
+namespace nse
+{
+	// Declarations of averaged turbulent-flux post-processing kernels.
+	// NOTE(review): the node tags [C], [V], [W], [VW], ... presumably denote
+	// staggered-grid locations of each averaged profile (C = cell center,
+	// W = z-face, V = y-face) — confirm against wstgrid3d.h.
+	// axisType selects 1D (axisZ) or 2D (axisYZ) averaged input/output.
+
+	// U,V,W,P 2nd order moments
+	// -------------------------------------------------------------------------------------------- //
+	template< typename T >
+	void uv_flux(T* flux,		// node: [C || V]
+
+		const T* const UV,		// node: [C || V]
+		const T* const U,		// node: [C || C]
+		const T* const V,		// node: [C || V]
+		const nse_const3d::axisType axis, const wstGrid3d< T >& grid);	// [axisZ || axisYZ]
+
+	template< typename T >
+	void uw_flux(T* flux,		// node: [W]
+
+		const T* const UW,		// node: [W]
+		const T* const U,		// node: [C]
+		const T* const W,		// node: [W]
+		const nse_const3d::axisType axis, const wstGrid3d< T >& grid);	// [axisZ || axisYZ]
+
+	template< typename T >
+	void vw_flux(T* flux,		// node: [W || VW]
+
+		const T* const VW,		// node: [W || VW]
+		const T* const V,		// node: [C || V]
+		const T* const W,		// node: [W || W]
+		const nse_const3d::axisType axis, const wstGrid3d< T >& grid);	// [axisZ || axisYZ]
+
+
+	template< typename T >
+	void pu_flux(T* flux,		// node: [C]
+
+		const T* const PU,		// node: [C]
+		const T* const P,		// node: [C]
+		const T* const U,		// node: [C]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void pv_flux(T* flux,		// node: [C]
+
+		const T* const PV,		// node: [C]
+		const T* const P,		// node: [C]
+		const T* const V,		// node: [C]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void pw_flux(T* flux,		// node: [W]
+
+		const T* const PW,		// node: [W]
+		const T* const P,		// node: [C]
+		const T* const W,		// node: [W]
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------------------------------- /
+
+	// U,V,W,C 2nd order moments
+	// -------------------------------------------------------------------------------------------- //
+	template< typename T >
+	void cu_flux(T* flux,		// node: [C]
+
+		const T* const CU,		// node: [C]
+		const T* const C,		// node: [C]
+		const T* const U,		// node: [C]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void cv_flux(T* flux,		// node: [C]
+
+		const T* const CV,		// node: [C]
+		const T* const C,		// node: [C]
+		const T* const V,		// node: [C]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void cw_flux(T* flux,		// node: [W]
+
+		const T* const CW,		// node: [W]
+		const T* const C,		// node: [C]
+		const T* const W,		// node: [W]
+		const nse_const3d::axisType axis, const wstGrid3d< T >& grid);	// [axisZ || axisYZ]
+
+	template< typename T >
+	void cc_flux(T* flux,		// node: [C]
+
+		const T* const CC,		// node: [C]
+		const T* const Ca,		// node: [C]
+		const T* const Cb,		// node: [C]
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------------------------------- //
+
+	// [U^2,V^2,W^2] * W - fluxes
+	// -------------------------------------------------------------------------------------------- //
+	template< typename T >
+	void u2w_flux(T* flux,		// node: [W]
+
+		const T* const U2W,		// node: [W]
+		const T* const U2,		// node: [W]
+		const T* const UW,		// node: [W]
+		const T* const UW_adv,	// node: [W]
+		const T* const U,	 	// node: [C]
+		const T* const W,		// node: [W]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void v2w_flux(T* flux,		// node: [W]
+
+		const T* const V2W,		// node: [W]
+		const T* const V2,		// node: [W]
+		const T* const VW,		// node: [W]
+		const T* const VW_adv,	// node: [W]
+		const T* const V,		// node: [C]
+		const T* const W,		// node: [W]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void w2w_flux(T* flux,		// node: [C]
+
+		const T* const W3,		// node: [C]
+		const T* const W2_c,	// node: [C]
+		const T* const W2_w,	// node: [W]
+		const T* const W,		// node: [W]
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------------------------------- //
+
+	// [C^2] * W - fluxes
+	// -------------------------------------------------------------------------------------------- //
+	template< typename T >
+	void c2w_flux(T* flux,		// node: [W]
+
+		const T* const C2W,		// node: [W]
+		const T* const C2,		// node: [W]
+		const T* const CW,		// node: [W]
+		const T* const CW_adv,	// node: [W]
+		const T* const C,		// node: [C]
+		const T* const W,		// node: [W]
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------------------------------- //
+
+	// [UV, UW, VW] * W - fluxes
+	// -------------------------------------------------------------------------------------------- //
+	template< typename T >
+	void uvw_flux(T* flux,				// node: [W]
+
+		const T* const UVW,				// node: [W]
+		const T* const UW,				// node: [W]
+		const T* const VW,				// node: [W]
+		const T* const UV_uvw,			// node: [W]
+		const T* const UW_uvw,			// node: [W]
+		const T* const VW_uvw,			// node: [W]
+		const T* const U,				// node: [C]
+		const T* const V,				// node: [C]
+		const T* const W,				// node: [W]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void uww_flux(T* flux,				// node: [C]
+		
+		const T* const UWW,				// node: [C]
+		const T* const W2_w,			// node: [W]
+		const T* const W2_c,			// node: [C]
+		const T* const W2_u,			// node: [C]
+		const T* const W2_uw,			// node: [W]
+		const T* const UW,				// node: [W]
+		const T* const UW_bottom_uw,	// node: [W (C -- W)]
+		const T* const UW_top_uw,		// node: [W (C -- W)]
+		const T* const U,				// node: [C]
+		const T* const W,				// node: [W]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void vww_flux(T* flux,				// node: [C]
+
+		const T* const VWW,				// node: [C]
+		const T* const W2_w,			// node: [W]
+		const T* const W2_c,			// node: [C]
+		const T* const W2_v,			// node: [C]
+		const T* const W2_vw,			// node: [W]
+		const T* const VW,				// node: [W]
+		const T* const VW_bottom_vw,	// node: [W (C -- W)]
+		const T* const VW_top_vw,		// node: [W (C -- W)]
+		const T* const V,				// node: [C]
+		const T* const W,				// node: [W]
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------------------------------- //
+
+	// [CU, CV, CW] * W - fluxes
+	// -------------------------------------------------------------------------------------------- //
+	template< typename T >
+	void cuw_flux(T* flux,				// node: [W]
+
+		const T* const CUW,				// node: [W]
+		const T* const UW,				// node: [W]
+		const T* const CU_uw,			// node: [W]
+		const T* const CW,				// node: [W]
+		const T* const CW_uw,			// node: [W]
+		const T* const C,				// node: [C]
+		const T* const U,				// node: [C]
+		const T* const W,				// node: [W]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void cvw_flux(T* flux,				// node: [W]
+
+		const T* const CVW,				// node: [W]
+		const T* const VW,				// node: [W]
+		const T* const CV_vw,			// node: [W]
+		const T* const CW,				// node: [W]
+		const T* const CW_vw,			// node: [W]
+		const T* const C,				// node: [C]
+		const T* const V,				// node: [C]
+		const T* const W,				// node: [W]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void cww_flux(T* flux,				// node: [C]
+
+		const T* const CWW,				// node: [C]
+		const T* const W2_w,			// node: [W]
+		const T* const W2_c,			// node: [C]
+		const T* const CW,				// node: [W]
+		const T* const CW_bottom_w,		// node: [W (C -- W)]
+		const T* const CW_top_w,		// node: [W (C -- W)]
+		const T* const C,				// node: [C]
+		const T* const W,				// node: [W]
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------------------------------- //
+
+	// c - pressure gradient covariances
+	// -------------------------------------------------------------------------------------------- //
+	template< typename T >
+	void c_w_pressure_gradient_turb(T* c_dpdz_turb,		// node: [W]
+
+		const T* const C_dPdz,							// node: [W]
+		const T* const C,								// node: [C]
+		const T* const Pressure,						// node: [C]
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------------------------------- //
+}
+
+
+// U,V,W,P 2nd order moments
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::uv_flux(
+	T* flux,					// node: [C || V]
+
+	const T* const UV,			// node: [C || V]
+	const T* const U,			// node: [C || C]
+	const T* const V,			// node: [C || V]
+	const nse_const3d::axisType axis, const wstGrid3d< T >& grid)	// [axisZ || axisYZ]
+	// ____   __   _   _ 
+	// u'v' = UV - U * V
+{
+	// NOTE(review): any axis other than axisZ/axisYZ leaves 'flux' unchanged
+	if (axis == nse_const3d::axisZ) {
+
+		int k;
+#pragma omp parallel for private(k) shared(flux)
+		for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {	// interior [C] nodes
+			flux[k] = UV[k] - U[k] * V[k];
+		}
+		return;
+	}
+	if (axis == nse_const3d::axisYZ) {
+		// [UV, V] averages have to be known at all [V] nodes, including walls
+		// [U] average has to be known at all [C] nodes and ghost nodes: (j + 1/2), (j - 1/2)
+
+		int j, k, idx;
+#pragma omp parallel for private(j, k, idx) shared(flux)
+		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++)	// all [V] nodes
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+				idx = j * grid.nz + k;
+				// U interpolated from [C] to the [V] node in y
+				flux[idx] = UV[idx] - (T) 0.5 * (U[idx] + U[idx - grid.nz]) * V[idx];
+			}
+		return;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::uw_flux(
+	T* flux,					// node: [W]
+
+	const T* const UW,			// node: [W]
+	const T* const U,			// node: [C]
+	const T* const W,			// node: [W]
+	const nse_const3d::axisType axis, const wstGrid3d< T >& grid)	// [axisZ || axisYZ]
+	// ____   __   _   _ 
+	// u'w' = UW - U * W
+	// [UW, W] averages have to be known at all [W] nodes, including walls
+	// [U] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	// NOTE(review): any axis other than axisZ/axisYZ leaves 'flux' unchanged
+	if (axis == nse_const3d::axisZ) {
+
+		int k;
+#pragma omp parallel for private(k) shared(flux)
+		for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+			// U interpolated from [C] to the [W] node in z
+			flux[k] = UW[k] - (T) 0.5 * (U[k] + U[k - 1]) * W[k];
+		}
+		return;
+	}
+	if (axis == nse_const3d::axisYZ) {
+
+		int j, k, idx;
+#pragma omp parallel for private(j, k, idx) shared(flux)
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+				idx = j * grid.nz + k;
+				flux[idx] = UW[idx] - (T) 0.5 * (U[idx] + U[idx - 1]) * W[idx];
+			}
+		return;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::vw_flux(
+	T* flux,					// node: [W || VW]
+
+	const T* const VW,			// node: [W || VW]
+	const T* const V,			// node: [C || V]
+	const T* const W,			// node: [W || W]
+
+	const nse_const3d::axisType axis, const wstGrid3d< T >& grid)	// [axisZ || axisYZ]
+	// ____   __   _   _ 
+	// v'w' = VW - V * W
+{
+	// NOTE(review): any axis other than axisZ/axisYZ leaves 'flux' unchanged
+	if (axis == nse_const3d::axisZ) {
+		// [VW, W] averages have to be known at all [W] nodes, including walls
+		// [V] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+
+		int k;
+#pragma omp parallel for private(k) shared(flux)
+		for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+			flux[k] = VW[k] - (T) 0.5 * (V[k] + V[k - 1]) * W[k];
+		}
+		return;
+	}
+	if (axis == nse_const3d::axisYZ) {
+		// [VW] average has to be known at all [VW] nodes, including walls
+		// [W] average has to be known at all [W] nodes and ghost nodes: (j + 1/2), (j - 1/2)
+		// [V] average has to be known at all [V] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+
+		int j, k, idx;
+#pragma omp parallel for private(j, k, idx) shared(flux)
+		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++)	// all [V] nodes
+			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+				idx = j * grid.nz + k;
+				// both V (in z) and W (in y) interpolated to the [VW] edge node
+				flux[idx] = VW[idx] - (T) 0.25 *
+					(V[idx] + V[idx - 1]) * (W[idx] + W[idx - grid.nz]);
+			}
+		return;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::pu_flux(T* flux,	// node: [C]
+
+	const T* const PU,		// node: [C]
+	const T* const P,		// node: [C]
+	const T* const U,		// node: [C]
+	const wstGrid3d< T >& grid)
+	// ____   __   _   _ 
+	// p'u' = PU - P * U
+	//
+{
+	// all inputs co-located at [C]: plain covariance, interior cells only
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		flux[k] = PU[k] - P[k] * U[k];
+	}
+}
+
+template< typename T >
+void nse::pv_flux(T* flux,	// node: [C]
+
+	const T* const PV,		// node: [C]
+	const T* const P,		// node: [C]
+	const T* const V,		// node: [C]
+	const wstGrid3d< T >& grid)
+	// ____   __   _   _ 
+	// p'v' = PV - P * V
+	//
+{
+	// all inputs co-located at [C]: plain covariance, interior cells only
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		flux[k] = PV[k] - P[k] * V[k];
+	}
+}
+
+template< typename T >
+void nse::pw_flux(
+	T* flux,						// node: [W]
+
+	const T* const PW,				// node: [W]
+	const T* const P,				// node: [C]
+	const T* const W,				// node: [W]
+	const wstGrid3d< T >& grid)
+	// ____   __   _   _ 
+	// p'w' = PW - P * W
+	// ____     __ 
+	// p'w' and PW - are defined at [W] node for the following reason:
+	// for staggered grid interpolation to [C] node in TKE equation results in: 
+	//	______ 1z        ___ 1z
+	//	    dp     d(w *  p )        dw
+	//	w * --  =  --------    - p * --
+	//	    dz        dz             dz
+	//
+	// [PW, W] averages have to be known at all [W] nodes, including walls
+	// [P] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+		// P interpolated from [C] to the [W] node in z
+		flux[k] = PW[k] - (T) 0.5 * (P[k] + P[k - 1]) * W[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// U,V,W,C 2nd order moments
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::cu_flux(
+	T* flux,					// node: [C]
+
+	const T* const CU,			// node: [C]
+	const T* const C,			// node: [C]
+	const T* const U,			// node: [C]
+	const wstGrid3d< T >& grid)
+	// ____   __   _   _ 
+	// c'u' = CU - C * U
+{
+	// all inputs co-located at [C]: plain covariance, interior cells only
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		flux[k] = CU[k] - C[k] * U[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::cv_flux(
+	T* flux,					// -z [C] node
+
+	const T* const CV,			// -z [C] node
+	const T* const C,			// -z [C] node
+	const T* const V,			// -z [C] node
+	const wstGrid3d< T >& grid)
+	// ____   __   _   _ 
+	// c'v' = CV - C * V
+{
+	// all inputs co-located at [C]: plain covariance, interior cells only
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		flux[k] = CV[k] - C[k] * V[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::cw_flux(
+	T* flux,					// node: [W]
+
+	const T* const CW,			// node: [W]
+	const T* const C,			// node: [C]
+	const T* const W,			// node: [W]
+	const nse_const3d::axisType axis, const wstGrid3d< T >& grid)	// [axisZ || axisYZ]
+	// ____   __   _   _ 
+	// c'w' = CW - C * W
+	// [CW, W] averages have to be known at all [W] nodes, including walls
+	// [C] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	// NOTE(review): any axis other than axisZ/axisYZ leaves 'flux' unchanged
+	if (axis == nse_const3d::axisZ) {
+
+		int k;
+#pragma omp parallel for private(k) shared(flux)
+		for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+			// C interpolated from [C] to the [W] node in z
+			flux[k] = CW[k] - (T) 0.5 * (C[k] + C[k - 1]) * W[k];
+		}
+		return;
+	}
+	if (axis == nse_const3d::axisYZ) {
+
+		int j, k, idx;
+#pragma omp parallel for private(j, k, idx) shared(flux)
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+				idx = j * grid.nz + k;
+				flux[idx] = CW[idx] - (T) 0.5 * (C[idx] + C[idx - 1]) * W[idx];
+			}
+		return;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::cc_flux(T* flux,		// node: [C]
+
+	const T* const CC,		// node: [C]
+	const T* const Ca,		// node: [C]
+	const T* const Cb,		// node: [C]
+	const wstGrid3d< T >& grid)
+	// ______   __   __   __ 
+	// ca'cb' = CC - Cb * Ca
+{
+	// covariance of two scalars co-located at [C]; interior cells only
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		flux[k] = CC[k] - Ca[k] * Cb[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// [U^2,V^2,W^2] * W - fluxes
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::u2w_flux(
+	T* flux,					// node: [W]
+
+	const T* const U2W,			// node: [W]
+	const T* const U2,			// node: [W]
+	const T* const UW,			// node: [W]
+	const T* const UW_adv,		// node: [W]
+	const T* const U,	 		// node: [C]
+	const T* const W,			// node: [W]
+	const wstGrid3d< T >& grid)
+	// ______   ___   __   _       _   __       _   _   _
+	// u'u'w' = UUW - UU * W - 2 * U * UW + 2 * U * U * W
+	//                ___     _   _   _
+	// calculation of UUW and U * U * W product:
+	//	~~~~~1z   __1x
+	//  (U * U) * W
+	//
+	// [U2W, U2, UW, UW_adv, W] averages have to be known at all [W] nodes, including walls
+	// [U] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+		flux[k] =
+			U2W[k] - U2[k] * W[k] - (U[k] + U[k - 1]) * UW[k] +
+			(T) 2.0 * (U[k] * U[k - 1]) * W[k] +
+
+			// correction term:
+			// NOTE(review): (U[k] - U[k-1]) is multiplied by grid.dzh[k];
+			// confirm whether dzh is a spacing or an inverse spacing in wstGrid3d
+			UW_adv[k] * (U[k] - U[k - 1]) * grid.dzh[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::v2w_flux(
+	T* flux,				// node: [W]
+
+	const T* const V2W,		// node: [W]
+	const T* const V2,		// node: [W]
+	const T* const VW,		// node: [W]
+	const T* const VW_adv,	// node: [W]
+	const T* const V,		// node: [C]
+	const T* const W,		// node: [W]
+	const wstGrid3d< T >& grid)
+	// ______   ___   __   _       _   __       _   _   _
+	// v'v'w' = VVW - VV * W - 2 * V * VW + 2 * V * V * W
+	//                ___			  _   _   _
+	// calculation of VVW product and V * V * W:
+	//	~~~~~1z   __1y
+	//  (V * V) * W
+	//
+	// [V2W, V2, VW, VW_adv, W] averages have to be known at all [W] nodes, including walls
+	// [V] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	// mirrors u2w_flux() with U replaced by V
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+		flux[k] =
+			V2W[k] - V2[k] * W[k] - (V[k] + V[k - 1]) * VW[k] +
+			(T) 2.0 * (V[k] * V[k - 1]) * W[k] +
+
+			// correction term:
+			VW_adv[k] * (V[k] - V[k - 1]) * grid.dzh[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::w2w_flux(
+	T* flux,				// node: [C]
+
+	const T* const W3,		// node: [C]
+	const T* const W2_c,	// node: [C]
+	const T* const W2_w,	// node: [W]
+	const T* const W,		// node: [W]
+	const wstGrid3d< T >& grid)
+	// ______   ___       _   __       _   _   _
+	// w'w'w' = WWW - 3 * W * WW + 2 * W * W * W
+	//                ___     _   _   _
+	// calculation of WWW and W * W * W product:
+	//	~~~~~1z   __1z
+	//  (W * W) * W
+	//
+	// [W2_w, W] averages have to be known at all [W] nodes, including walls
+{
+	// result at interior [C] nodes; uses the [W] faces k and k+1 bounding
+	// each cell (k+1 <= nz - gcz, so no out-of-range access)
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		flux[k] =
+			W3[k] - (W[k] + W[k + 1]) * W2_c[k] -
+			(T) 0.5 * (W[k] * W2_w[k + 1] + W[k + 1] * W2_w[k]) +
+
+			W[k] * W[k + 1] * (W[k] + W[k + 1]);
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+
+// [C^2] * W - fluxes
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::c2w_flux(
+	T* flux,				// node: [W]
+
+	const T* const C2W,		// node: [W]
+	const T* const C2,		// node: [W]
+	const T* const CW,		// node: [W]
+	const T* const CW_adv,	// node: [W]
+	const T* const C,		// node: [C]
+	const T* const W,		// node: [W]
+	const wstGrid3d< T >& grid)
+	// third-order moment (turbulent flux of scalar variance):
+	// ______   ___   __   _       _   __       _   _   _
+	// c'c'w' = CCW - CC * W - 2 * C * CW + 2 * C * C * W
+	// plus a discretization correction term ~ CW_adv * (C[k] - C[k-1]) * dzh
+	//
+	// [C2W, C2, CW, CW_adv, W] averages have to be known at all [W] nodes, including walls
+	// [C] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++)	// all [W] nodes
+	{
+		// mean [C] combinations at the current [W] node
+		const T c_sum = C[k] + C[k - 1];	// = 2 * <C> interpolated to [W]
+		const T c_mul = C[k] * C[k - 1];	// product form of <C>^2
+		const T c_dif = C[k] - C[k - 1];	// jump across the [W] node
+
+		flux[k] =
+			C2W[k] - C2[k] * W[k] - c_sum * CW[k] +
+			(T) 2.0 * c_mul * W[k] +
+
+			// correction term:
+			CW_adv[k] * c_dif * grid.dzh[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+
+// [UV, UW, VW] * W - fluxes
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::uvw_flux(
+	T* flux,				// node: [W]
+
+	const T* const UVW,		// node: [W]
+	const T* const UW,		// node: [W]
+	const T* const VW,		// node: [W]
+	const T* const UV_uvw,	// node: [W]
+	const T* const UW_uvw,	// node: [W]
+	const T* const VW_uvw,	// node: [W]
+	const T* const U,		// node: [C]
+	const T* const V,		// node: [C]
+	const T* const W,		// node: [W]
+	const wstGrid3d< T >& grid)
+	// third-order moment:
+	// ______   ___   __   _   __   _   __   _       _   _   _
+	// u'v'w' = UVW - UW * V - VW * U - UV * W + 2 * U * V * W
+	//
+	// [UW, VW, UV, W] averages have to be known at all [W] nodes, including walls
+	// [U, V] averages have to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++)	// all [W] nodes
+	{
+		const T u_sum = U[k] + U[k - 1];	// = 2 * <U> interpolated to [W]
+		const T v_sum = V[k] + V[k - 1];	// = 2 * <V> interpolated to [W]
+
+		flux[k] = UVW[k] -
+			(T)0.25 * (UW[k] + UW_uvw[k]) * v_sum -
+			(T)0.25 * (VW[k] + VW_uvw[k]) * u_sum -
+			UV_uvw[k] * W[k] +
+
+			(T)0.5 * W[k] * u_sum * v_sum;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::uww_flux(
+	T* flux,						// node: [C]
+
+	const T* const UWW,				// node: [C]
+	const T* const W2_w,			// node: [W]
+	const T* const W2_c,			// node: [C]
+	const T* const W2_u,			// node: [C]
+	const T* const W2_uw,			// node: [W]
+	const T* const UW,				// node: [W]
+	const T* const UW_bottom_uw,	// node: [W (C -- W)]
+	const T* const UW_top_uw,		// node: [W (C -- W)]
+	const T* const U,				// node: [C]
+	const T* const W,				// node: [W]
+	const wstGrid3d< T >& grid)
+	// third-order moment at [C] nodes:
+	// ______   ___   __   _   _   __   __   _       _   _   _
+	// u'w'w' = UWW - WW * U - W * UW - UW * W + 2 * U * W * W
+	//
+	// each mean-product term is assembled from several staggered-grid estimates
+	// (0.5 / 0.25 / 0.125 weights), presumably matching the averaging used
+	// when accumulating UWW itself -- TODO confirm against the accumulation code
+	//
+	// [W^2 (at -UW and -W nodes), W, UW] averages have to be known at all [W] nodes, including walls
+	// [U] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+	// [UW-top] average has to be known at (gcz, nz - gcz - 1) nodes
+	// [UW-bottom] average has to be known at (gcz + 1, nz - gcz) nodes
+{
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {	// all [C] nodes
+
+		flux[k] = UWW[k] -
+
+			// __   _
+			// WW * U
+			// (average of three estimates: centered + upper + lower half-node products)
+			(T)0.5 * (
+
+			(T)0.25 * (U[k + 1] + (T)2.0 * U[k] + U[k - 1]) *
+			(T)0.25 * (W2_w[k] + (T)2.0 * W2_c[k] + W2_w[k + 1])
+			+
+			(T)0.5 *
+			(T)0.25 * (W2_uw[k + 1] + W2_u[k]) * (U[k] + U[k + 1])
+			+
+			(T)0.5 *
+			(T)0.25 * (W2_uw[k] + W2_u[k]) * (U[k] + U[k - 1])
+
+			)
+
+			-
+
+			//   _   __   __   _
+			// - W * UW - UW * W
+			// (uses one-sided UW estimates at the bounding [W] nodes)
+			(
+			(T)0.25 * (W[k] + W[k + 1]) * (UW_bottom_uw[k + 1] + UW_top_uw[k]) +
+			(T)0.25 * (W[k + 1] * UW_bottom_uw[k + 1] + W[k] * UW_top_uw[k]) +
+			(T)0.125 * (W[k] + W[k + 1]) * (UW[k] + UW[k + 1])
+			)
+
+			+
+
+			//     _   _   _
+			// 2 * W * W * U
+			(T)0.5 * (W[k] + W[k + 1]) *
+			(
+			(T)0.125 * (W[k] + W[k + 1]) * (U[k + 1] + (T)2.0 * U[k] + U[k - 1]) +
+			(T)0.25 * W[k + 1] * (U[k] + U[k + 1]) + (T)0.25 * W[k] * (U[k] + U[k - 1])
+			);
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::vww_flux(
+	T* flux,						// node: [C]
+
+	const T* const VWW,				// node: [C]
+	const T* const W2_w,			// node: [W]
+	const T* const W2_c,			// node: [C]
+	const T* const W2_v,			// node: [C]
+	const T* const W2_vw,			// node: [W]
+	const T* const VW,				// node: [W]
+	const T* const VW_bottom_vw,	// node: [W (C -- W)]
+	const T* const VW_top_vw,		// node: [W (C -- W)]
+	const T* const V,				// node: [C]
+	const T* const W,				// node: [W]
+	const wstGrid3d< T >& grid)
+	// third-order moment at [C] nodes (v-analogue of uww_flux):
+	// ______   ___   __   _   _   __   __   _       _   _   _
+	// v'w'w' = VWW - WW * V - W * VW - VW * W + 2 * V * W * W
+	//
+	// each mean-product term is assembled from several staggered-grid estimates
+	// (0.5 / 0.25 / 0.125 weights), presumably matching the averaging used
+	// when accumulating VWW itself -- TODO confirm against the accumulation code
+	//
+	// [W^2 (at -VW and -W nodes), W, VW] averages have to be known at all [W] nodes, including walls
+	// [V] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+	// [VW-top] average has to be known at (gcz, nz - gcz - 1) nodes
+	// [VW-bottom] average has to be known at (gcz + 1, nz - gcz) nodes
+{
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {	// all [C] nodes
+
+		flux[k] = VWW[k] -
+
+			// __   _
+			// WW * V
+			// (average of three estimates: centered + upper + lower half-node products)
+			(T)0.5 * (
+
+			(T)0.25 * (V[k + 1] + (T)2.0 * V[k] + V[k - 1]) *
+			(T)0.25 * (W2_w[k] + (T)2.0 * W2_c[k] + W2_w[k + 1])
+			+
+			(T)0.5 *
+			(T)0.25 * (W2_vw[k + 1] + W2_v[k]) * (V[k] + V[k + 1])
+			+
+			(T)0.5 *
+			(T)0.25 * (W2_vw[k] + W2_v[k]) * (V[k] + V[k - 1])
+
+			)
+
+			-
+
+			//   _   __   __   _
+			// - W * VW - VW * W
+			// (uses one-sided VW estimates at the bounding [W] nodes)
+			(
+			(T)0.25 * (W[k] + W[k + 1]) * (VW_bottom_vw[k + 1] + VW_top_vw[k]) +
+			(T)0.25 * (W[k + 1] * VW_bottom_vw[k + 1] + W[k] * VW_top_vw[k]) +
+			(T)0.125 * (W[k] + W[k + 1]) * (VW[k] + VW[k + 1])
+			)
+
+			+
+
+			//     _   _   _
+			// 2 * W * W * V
+			(T)0.5 * (W[k] + W[k + 1]) *
+			(
+			(T)0.125 * (W[k] + W[k + 1]) * (V[k + 1] + (T)2.0 * V[k] + V[k - 1]) +
+			(T)0.25 * W[k + 1] * (V[k] + V[k + 1]) + (T)0.25 * W[k] * (V[k] + V[k - 1])
+			);
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+
+// [CU, CV, CW] * W - fluxes
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::cuw_flux(
+	T* flux,						// node: [W]
+
+	const T* const CUW,				// node: [W]
+	const T* const UW,				// node: [W]
+	const T* const CU_uw,			// node: [W]
+	const T* const CW,				// node: [W]
+	const T* const CW_uw,			// node: [W]
+	const T* const C,				// node: [C]
+	const T* const U,				// node: [C]
+	const T* const W,				// node: [W]
+	const wstGrid3d< T >& grid)
+	// third-order moment:
+	// ______   ___   __   _   _   __   _   __       _   _   _
+	// c'u'w' = CUW - CU * W - C * UW - U * CW + 2 * C * U * W
+	//
+	// [CUW, UW, CW, CU, W] averages have to be known at all [W] nodes, including walls
+	// [U, C] averages have to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++)	// all [W] nodes
+	{
+		const T c_sum = C[k] + C[k - 1];	// = 2 * <C> interpolated to [W]
+		const T u_sum = U[k] + U[k - 1];	// = 2 * <U> interpolated to [W]
+
+		flux[k] = CUW[k] - (T)0.5 * UW[k] * c_sum
+			- (T)0.25 * (CW_uw[k] + CW[k]) * u_sum
+			- CU_uw[k] * W[k] +
+
+			(T)2.0 * (T)0.25 * u_sum * c_sum * W[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::cvw_flux(
+	T* flux,						// node: [W]
+
+	const T* const CVW,				// node: [W]
+	const T* const VW,				// node: [W]
+	const T* const CV_vw,			// node: [W]
+	const T* const CW,				// node: [W]
+	const T* const CW_vw,			// node: [W]
+	const T* const C,				// node: [C]
+	const T* const V,				// node: [C]
+	const T* const W,				// node: [W]
+	const wstGrid3d< T >& grid)
+	// third-order moment (v-analogue of cuw_flux):
+	// ______   ___   __   _   _   __   _   __       _   _   _
+	// c'v'w' = CVW - CV * W - C * VW - V * CW + 2 * C * V * W
+	//
+	// [CVW, VW, CW, CV, W] averages have to be known at all [W] nodes, including walls
+	// [V, C] averages have to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++)	// all [W] nodes
+	{
+		const T c_sum = C[k] + C[k - 1];	// = 2 * <C> interpolated to [W]
+		const T v_sum = V[k] + V[k - 1];	// = 2 * <V> interpolated to [W]
+
+		flux[k] = CVW[k] - (T)0.5 * VW[k] * c_sum
+			- (T)0.25 * (CW_vw[k] + CW[k]) * v_sum
+			- CV_vw[k] * W[k] +
+
+			(T)2.0 * (T)0.25 * v_sum * c_sum * W[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::cww_flux(
+	T* flux,						// node: [C]
+
+	const T* const CWW,				// node: [C]
+	const T* const W2_w,			// node: [W]
+	const T* const W2_c,			// node: [C]
+	const T* const CW,				// node: [W]
+	const T* const CW_bottom_w,		// node: [W (C -- W)]
+	const T* const CW_top_w,		// node: [W (C -- W)]
+	const T* const C,				// node: [C]
+	const T* const W,				// node: [W]
+	const wstGrid3d< T >& grid)
+	// third-order moment at [C] nodes:
+	// ______   ___   __   _   _   __   __   _       _   _   _
+	// c'w'w' = CWW - WW * C - W * CW - WC * W + 2 * C * W * W
+	//
+	// mean products are built from multiple staggered-grid estimates
+	// (0.5 / 0.25 / 0.125 weights), presumably matching the averaging used
+	// when accumulating CWW itself -- TODO confirm against the accumulation code
+	//
+	// [W^2, CW, W] averages have to be known at all [W] nodes, including walls
+	// [C] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+	// [CW-top] average has to be known at (gcz, nz - gcz - 1) nodes
+	// [CW-bottom] average has to be known at (gcz + 1, nz - gcz) nodes
+{
+	int k;
+#pragma omp parallel for private(k) shared(flux)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {	// all [C] nodes
+		flux[k] = CWW[k] 
+			
+			// __   _
+			// WW * C
+			// (average of centered + lower + upper half-node product estimates)
+			- (T)0.5 * (
+			(T)0.25 * (T)0.25 * (W2_w[k] + W2_w[k + 1] + (T)2.0 * W2_c[k]) * (C[k + 1] + (T)2.0 * C[k] + C[k - 1]) +
+			(T)0.25 * (T)0.5 * (W2_w[k] + W2_c[k]) * (C[k] + C[k - 1]) +
+			(T)0.25 * (T)0.5 * (W2_w[k + 1] + W2_c[k]) * (C[k] + C[k + 1]))
+
+			// _   __
+			// W * CW
+			// (uses one-sided CW estimates at the bounding [W] nodes)
+			- (T)0.125 * (W[k] + W[k + 1]) * (CW_bottom_w[k + 1] + CW_top_w[k] + CW[k] + CW[k + 1])
+			- (T)0.25 * (
+			(T)0.5 * (W[k] + W[k + 1]) * (CW_bottom_w[k + 1] + CW_top_w[k]) + 
+			W[k] * CW_top_w[k] + W[k + 1] * CW_bottom_w[k + 1])
+			
+			//     _   _   _
+			// 2 * W * W * C
+			+ (T)0.5 * (W[k] + W[k + 1]) * (
+			(T)0.5 * (T)0.25 * (W[k] + W[k + 1]) * (C[k + 1] + (T)2.0 * C[k] + C[k - 1]) +
+			(T)0.25 * W[k] * (C[k] + C[k - 1]) + (T)0.25 * W[k + 1] * (C[k] + C[k + 1]));
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+
+// c - pressure gradient covariances
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::c_w_pressure_gradient_turb(
+	T* c_dpdz_turb,				// node: [W]
+
+	const T* const C_dPdz,		// node: [W]
+	const T* const C,			// node: [C]
+	const T* const Pressure,	// node: [C]
+	const wstGrid3d< T >& grid)
+	// C-pressure gradient covariance
+	// __________
+	//       dp'
+	// c' * ----
+	//       dz
+	//
+	// computed as the full covariance minus the mean product:
+	// <c dp/dz> - <c> * d<p>/dz, both terms evaluated at [W] nodes
+	//
+	// [C, P] averages have to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(c_dpdz_turb)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+		// NOTE(review): (C[k] + C[k-1]) appears without an explicit 0.5
+		// interpolation weight; presumably grid.dzmi absorbs that factor
+		// (cf. the 0.5 * (C[k] + C[k-1]) averaging used elsewhere) -- confirm
+		// against the wstGrid3d dzmi definition.
+		c_dpdz_turb[k] = C_dPdz[k] -
+			(C[k] + C[k - 1]) * (Pressure[k] - Pressure[k - 1]) * grid.dzmi[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/bl-scale-def.h b/bl-scale-def.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5da461528b6aea26102da0cad2ee7fafe934e0d
--- /dev/null
+++ b/bl-scale-def.h
@@ -0,0 +1,875 @@
+#pragma once
+
+// [bl-scale-def.h]: boundary-layer scales and dimensionless values definitions
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "wstgrid3d.h"
+
+
+namespace nse
+{
+	// Time scales
+	// -------------------------------------------------------------------------------------------- //
+	template< typename T >
+	void time_scale_turbulent(T* _time_scale_turbulent,		// node: [C]
+
+		const T* const TKE,						// node: [C]
+		const T* const TKE_iso_dissipation,		// node: [C]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void time_scale_svariance(T* _time_scale_svariance,		// node: [C]
+
+		const T* const C2,						// node: [C]
+		const T* const C,						// node: [C]
+		const T* const CVA_iso_dissipation,		// node: [C]
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------------------------------- //
+
+	// Length scales
+	// -------------------------------------------------------------------------------------------- //
+	template< typename T >
+	void length_scale_kolmogorov(T* _length_scale_kolmogorov,	// node: [C]
+
+		const T* const TKE_iso_dissipation,		// node: [C]
+		const T c_kinematic_viscosity, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void length_scale_mixing(T* _length_scale_mixing,			// node: [W]
+
+		const T* const U2,						// node: [W]
+		const T* const V2,						// node: [W]
+		const T* const W2,						// node: [W]
+		const T* const U,						// node: [C]
+		const T* const V,						// node: [C]
+		const T* const W,						// node: [W]
+		const T* const U_grad,					// node: [W]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void length_scale_ellison(T* _length_scale_ellison,			// node: [W]
+
+		const T* const T2,						// node: [W]
+		const T* const Tc,						// node: [C]
+		const T* const T_grad,					// node: [W]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void length_scale_ozmidov(T* _length_scale_ozmidov,			// node: [C]
+
+		const T* const TKE_iso_dissipation,		// node: [C]
+		const T* const T_grad,					// node: [W]
+		const T c_Richardson, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void length_scale_obukhov(T* _length_scale_obukhov,			// node: [W]
+
+		const T* const uw_flux,					// node: [W]
+		const T* const Tw_flux,					// node: [W]
+		const T c_Richardson, const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------------------------------- //
+
+	// Dimensionless numbers
+	// -------------------------------------------------------------------------------------------- //
+	template< typename T >
+	void prandtl_turbulent(T* Prandtl_turbulent,		// node: [W]
+
+		const T* const uw_flux,		// node: [W]
+		const T* const U_grad,		// node: [W]
+		const T* const Tw_flux,		// node: [W]
+		const T* const T_grad,		// node: [W]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void richardson_gradient(T* Richardson_gradient,	// node: [W]
+
+		const T* const U_grad,		// node: [W]
+		const T* const T_grad,		// node: [W]
+		const T c_Richardson,
+		const nse_const3d::axisType axis, const wstGrid3d< T >& grid);	// [axisZ || axisYZ]
+
+	template< typename T >
+	void richardson_flux(T* Richardson_flux,			// node: [W]
+
+		const T* const uw_flux,		// node: [W]
+		const T* const U_grad,		// node: [W]
+		const T* const Tw_flux,		// node: [W]
+		const T c_Richardson, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void reynolds_buoyancy(T* Reynolds_buoyancy,		// node: [C]
+
+		const T* const TKE_iso_dissipation,				// node: [C]
+		const T* const T_grad,							// node: [W]
+		const T c_Richardson, const T c_kinematic_viscosity,
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void froude_horizontal(T* Froude_horizontal,		// node: [C]
+
+		const T* const u_TKE,							// node: [C]
+		const T* const v_TKE,							// node: [C]
+		const T* const TKE_iso_dissipation,				// node: [C]
+		const T* const T_grad,							// node: [W]
+		const T c_Richardson, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void mixing_efficiency(T* _mixing_efficiency,		// node: [C]
+
+		const T* const TKE_iso_dissipation,				// node: [C]
+		const T* const TVA_iso_dissipation,				// node: [C]
+		const T* const T_grad,							// node: [W]
+		const T c_Richardson, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void turbulence_production_ratio(T* turb_production_ratio,	// node: [C]
+
+		const T* const TKE_production,	// node: [C]
+		const T* const TVA_production,	// node: [C]
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------------------------------- //
+
+
+	// pressure-strain models
+	// -------------------------------------------------------------------------------------------- //
+	template< typename T >
+	void Rotta_model(T* u_Rotta, T* v_Rotta, T* w_Rotta,	// node: [C]
+		T* uw_Rotta,										// node: [C]
+
+		const T* const TKE, const T* const TKE_iso_dissipation,					// node: [C]
+		const T* const u_TKE, const T* const v_TKE, const T* const w_TKE,		// node: [C]
+		const T* const uw_flux,													// node: [W]
+
+		const T* const u_TKE_exchange,	// node: [C]
+		const T* const v_TKE_exchange,	// node: [C] 
+		const T* const w_TKE_exchange,	// node: [C]
+		const T* const P2Suw_turb_c,	// node: [C] (original comment garbled; '_c' suffix suggests [C] -- TODO confirm)
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void RDT_model(T* u_RDT, T* v_RDT, T* w_RDT,			// node: [C]
+		T* uw_RDT,											// node: [C]
+
+		const T* const TKE_production,		// node: [C]
+		const T* const W2_w,				// node: [W]
+		const T* const U,					// node: [C]
+		const T* const W,					// node: [W]
+
+		const T* const u_TKE_exchange,	// node: [C]
+		const T* const v_TKE_exchange,	// node: [C] 
+		const T* const w_TKE_exchange,	// node: [C]
+		const T* const P2Suw_turb_c,	// node: [C] (original comment garbled; '_c' suffix suggests [C] -- TODO confirm)
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void Rotta_RDT_model(T* Rotta_RDT_e, T* Rotta_RDT_p,	// node: [C]
+
+		const T* const TKE,					// node: [C]
+		const T* const TKE_iso_dissipation,	// node: [C]
+		const T* const TKE_production,		// node: [C]
+		const T* const u_TKE, const T* const v_TKE, const T* const w_TKE,		// node: [C]
+		
+		const T* const u_TKE_exchange,	// node: [C]
+		const T* const v_TKE_exchange,	// node: [C] 
+		const T* const w_TKE_exchange,	// node: [C]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void Rotta_buoyancy_model(T* Rotta_buoyancy_e, T* Rotta_buoyancy_b,		// node: [C]
+
+		const T* const TKE,					// node: [C]
+		const T* const TKE_iso_dissipation,	// node: [C]
+		const T* const TKE_heat_flux,		// node: [C]
+		const T* const u_TKE, const T* const v_TKE, const T* const w_TKE,		// node: [C]
+
+		const T* const u_TKE_exchange,	// node: [C]
+		const T* const v_TKE_exchange,	// node: [C] 
+		const T* const w_TKE_exchange,	// node: [C]
+		const T c_Richardson, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void RDT_buoyancy_model(T* RDT_buoyancy_p, T* RDT_buoyancy_b,			// node: [C]
+
+		const T* const TKE_production,		// node: [C]
+		const T* const TKE_heat_flux,		// node: [C]
+
+		const T* const u_TKE_exchange,	// node: [C]
+		const T* const v_TKE_exchange,	// node: [C] 
+		const T* const w_TKE_exchange,	// node: [C]
+		const T c_Richardson, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void Rotta_TPE_model(T* u_Rotta_TPE, T* v_Rotta_TPE, T* w_Rotta_TPE,	// node: [C]
+
+		const T* const TKE,					// node: [C]
+		const T* const TPE,					// node: [C]
+		const T* const TKE_iso_dissipation,	// node: [C]
+		const T* const TPE_iso_dissipation,	// node: [C]
+		const T* const u_TKE, const T* const v_TKE, const T* const w_TKE,		// node: [C]
+
+		const T* const u_TKE_exchange,	// node: [C]
+		const T* const v_TKE_exchange,	// node: [C] 
+		const T* const w_TKE_exchange,	// node: [C]
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------------------------------- //
+}
+
+// Time scales
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::time_scale_turbulent(
+	T* _time_scale_turbulent,				// node: [C]
+
+	const T* const TKE,						// node: [C]
+	const T* const TKE_iso_dissipation,		// node: [C]
+	const wstGrid3d< T >& grid)
+	//
+	// T = (1/2) * [(u')^2 + (v')^2 + (w')^2] / ([dui'/dxj]*[dui'/dxj])
+	//
+	// note: TKE_iso_dissipation is stored as a negative (sink) term, hence the sign flip
+{
+	int k;
+#pragma omp parallel for private(k) shared(_time_scale_turbulent)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++)
+	{
+		const T dissipation = -TKE_iso_dissipation[k];
+		_time_scale_turbulent[k] = TKE[k] / dissipation;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::time_scale_svariance(
+	T* _time_scale_svariance,				// node: [C]
+
+	const T* const C2,						// node: [C]
+	const T* const C,						// node: [C]
+	const T* const CVA_iso_dissipation,		// node: [C]
+	const wstGrid3d< T >& grid)
+	//
+	// T = (1/2)*[c'^2] / ([dc'/dxj] * [dc'/dxj])
+	//
+	// note: CVA_iso_dissipation is stored as a negative (sink) term, hence the sign flip
+{
+	int k;
+#pragma omp parallel for private(k) shared(_time_scale_svariance)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++)
+	{
+		const T c_variance = C2[k] - C[k] * C[k];	// [c'^2] = <CC> - <C><C>
+		_time_scale_svariance[k] =
+			((T)0.5 * c_variance) / (-CVA_iso_dissipation[k]);
+	}
+}
+
+// Length scales
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::length_scale_kolmogorov(
+	T* _length_scale_kolmogorov,			// node: [C]
+
+	const T* const TKE_iso_dissipation,		// node: [C]
+	const T c_kinematic_viscosity, const wstGrid3d< T >& grid)
+	//
+	// L = [ nu^3 / e(ke) ]^(1/4), e - isotropic dissipation
+	//
+	// note: TKE_iso_dissipation is stored as a negative (sink) term, hence the sign flip
+{
+	// nu^3 is loop-invariant: compute it once outside the parallel loop
+	// (plain multiplication instead of pow(nu, 3) -- cheaper and exact per IEEE product)
+	const T c_viscosity_p3 =
+		c_kinematic_viscosity * c_kinematic_viscosity * c_kinematic_viscosity;
+
+	int k;
+#pragma omp parallel for private(k) shared(_length_scale_kolmogorov)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_length_scale_kolmogorov[k] = pow(
+			c_viscosity_p3 / (-TKE_iso_dissipation[k]),
+			(T)0.25);
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::length_scale_mixing(
+	T* _length_scale_mixing,			// node: [W]
+
+	const T* const U2,					// node: [W]
+	const T* const V2,					// node: [W]
+	const T* const W2,					// node: [W]
+	const T* const U,					// node: [C]
+	const T* const V,					// node: [C]
+	const T* const W,					// node: [W]
+	const T* const U_grad,				// node: [W]
+	const wstGrid3d< T >& grid)
+	//
+	// L = sqrt[(u')^2 + (v')^2 + (w')^2] / [dU/dz]
+	//
+	// [U^2, V^2, W^2, W, dU/dz] averages have to be known at all [W] nodes, including walls
+	// [U, V] averages have to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_length_scale_mixing)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++)	// all [W] nodes
+	{
+		// velocity variances at the [W] node (means taken from adjacent [C] nodes)
+		const T u_var = U2[k] - U[k] * U[k - 1];
+		const T v_var = V2[k] - V[k] * V[k - 1];
+		const T w_var = W2[k] - W[k] * W[k];
+
+		_length_scale_mixing[k] = sqrt(
+			u_var +
+			v_var +
+			w_var
+		) / U_grad[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::length_scale_ellison(
+	T* _length_scale_ellison,	// node: [W]
+
+	const T* const T2,			// node: [W]
+	const T* const Tc,			// node: [C]
+	const T* const T_grad,		// node: [W]
+	const wstGrid3d< T >& grid)
+	//
+	// L = sqrt[T'^2] / [dT/dz]
+	//
+	// [T^2, dT/dz] averages have to be known at all [W] nodes, including walls
+	// [T] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_length_scale_ellison)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++)	// all [W] nodes
+	{
+		// temperature variance at the [W] node (mean taken from adjacent [C] nodes)
+		const T t_var = T2[k] - Tc[k] * Tc[k - 1];
+		_length_scale_ellison[k] = sqrt(t_var) / T_grad[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::length_scale_ozmidov(
+	T* _length_scale_ozmidov,				// node: [C]
+
+	const T* const TKE_iso_dissipation,		// node: [C]
+	const T* const T_grad,					// node: [W]
+	const T c_Richardson, const wstGrid3d< T >& grid)
+	//
+	// L = ( e(TKE) / [Ri(b)*dT/dz]^3/2 )^1/2
+	//
+	// zero-filled when Ri(b) == 0 (neutral stratification, scale undefined)
+	//
+	// [dT/dz] average has to be known at all [W] nodes, including walls
+{
+	// the stratification check is loop-invariant
+	const bool is_stratified = (fabs(c_Richardson) > 0);
+
+	int k;
+#pragma omp parallel for private(k) shared(_length_scale_ozmidov)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++)
+	{
+		if (is_stratified) {
+			// N^2 = Ri(b) * dT/dz interpolated to the [C] node
+			const T Nsq = (T)0.5 * c_Richardson * (T_grad[k] + T_grad[k + 1]);
+			_length_scale_ozmidov[k] =
+				sqrt(-TKE_iso_dissipation[k] / (Nsq * sqrt(Nsq)));
+		}
+		else
+			_length_scale_ozmidov[k] = (T) 0.0;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::length_scale_obukhov(
+	T* _length_scale_obukhov,	// node: [W]
+
+	const T* const uw_flux,		// node: [W]
+	const T* const Tw_flux,		// node: [W]
+	const T c_Richardson, const wstGrid3d< T >& grid)
+	//
+	// L = (u'w' * sqrt(|u'w'|)) / (Ri(b) * T'w')
+	//
+	// zero-filled when Ri(b) == 0; wall values copied from the nearest interior node
+	//
+	// [u'w', T'w'] have to be known at all [W] nodes, excluding walls
+{
+	// shift the loop off the wall nodes (on the MPI ranks that own them)
+	// to prevent division by zero //
+	const int kbsh = (grid.mpi_com.rank_z == 0) ? 1 : 0;
+	const int ktsh = (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1) ? 1 : 0;
+
+	const bool is_stratified = (fabs(c_Richardson) > 0);	// loop-invariant
+
+	int k;
+#pragma omp parallel for private(k) shared(_length_scale_obukhov)
+	for (k = grid.gcz + kbsh; k <= grid.nz - grid.gcz - ktsh; k++) {	// all [W] nodes, excluding walls
+		_length_scale_obukhov[k] = (is_stratified) ?
+			(uw_flux[k] * sqrt(fabs(uw_flux[k]))) / (c_Richardson * Tw_flux[k]) :
+			(T)0;
+	}
+
+	if (kbsh)	// bottom b.c.
+		_length_scale_obukhov[grid.gcz] = _length_scale_obukhov[grid.gcz + 1];
+	if (ktsh)	// top b.c.
+		_length_scale_obukhov[grid.nz - grid.gcz] = _length_scale_obukhov[grid.nz - grid.gcz - 1];
+}
+// -------------------------------------------------------------------------------------------- //
+
+// Dimensionless numbers
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::prandtl_turbulent(
+	T* Prandtl_turbulent,		// node: [W]
+
+	const T* const uw_flux,		// node: [W]
+	const T* const U_grad,		// node: [W]
+	const T* const Tw_flux,		// node: [W]
+	const T* const T_grad,		// node: [W]
+	const wstGrid3d< T >& grid)
+	//
+	// Pr-turbulent = (u'w' * dT/dz) / (T'w' * dU/dz)
+	//
+	// wall values are copied from the nearest interior node
+	//
+	// [u'w', dU/dz, T'w', dT/dz] have to be known at all [W] nodes, excluding walls
+{
+	// shift the loop off the wall nodes (on the MPI ranks that own them)
+	// to prevent division by zero //
+	const int kbsh = (grid.mpi_com.rank_z == 0) ? 1 : 0;
+	const int ktsh = (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1) ? 1 : 0;
+
+	int k;
+#pragma omp parallel for private(k) shared(Prandtl_turbulent)
+	for (k = grid.gcz + kbsh; k <= grid.nz - grid.gcz - ktsh; k++)	// all [W] nodes, excluding walls
+	{
+		const T num = uw_flux[k] * T_grad[k];	// momentum flux times temperature gradient
+		const T den = Tw_flux[k] * U_grad[k];	// heat flux times shear
+		Prandtl_turbulent[k] = num / den;
+	}
+
+	if (kbsh)	// bottom b.c.
+		Prandtl_turbulent[grid.gcz] = Prandtl_turbulent[grid.gcz + 1];
+	if (ktsh)	// top b.c.
+		Prandtl_turbulent[grid.nz - grid.gcz] = Prandtl_turbulent[grid.nz - grid.gcz - 1];
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::richardson_gradient(
+	T* Richardson_gradient,	// node: [W]
+
+	const T* const U_grad,	// node: [W]
+	const T* const T_grad,	// node: [W]
+
+	const T c_Richardson, 
+	const nse_const3d::axisType axis, const wstGrid3d< T >& grid)
+	//
+	// Ri-gradient = Ri(b) * [(dT/dz) / (dU/dz)^2]
+	//
+	// supported modes: axisZ (1D profile), axisYZ (2D, j-k plane);
+	// any other axis value leaves the output untouched
+	//
+	// [dU/dz, dT/dz] have to be known at all [W] nodes, including walls
+{
+	switch (axis)
+	{
+	case nse_const3d::axisZ:
+	{
+		int k;
+#pragma omp parallel for private(k) shared(Richardson_gradient)
+		for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) { // all [W] nodes
+			Richardson_gradient[k] = c_Richardson * (T_grad[k] / (U_grad[k] * U_grad[k]));
+		}
+		break;
+	}
+	case nse_const3d::axisYZ:
+	{
+		int j, k, idx;
+#pragma omp parallel for private(j, k, idx) shared(Richardson_gradient)
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) { // all [W] nodes
+				idx = j * grid.nz + k;
+				Richardson_gradient[idx] = c_Richardson * (T_grad[idx] / (U_grad[idx] * U_grad[idx]));
+			}
+		break;
+	}
+	default:
+		break;	// unsupported axis mode: no-op (matches previous behavior)
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::richardson_flux(
+	T* Richardson_flux,		// node: [W]
+
+	const T* const uw_flux,	// node: [W]
+	const T* const U_grad,	// node: [W]
+	const T* const Tw_flux,	// node: [W]
+	const T c_Richardson, const wstGrid3d< T >& grid)
+	//
+	// Ri-flux = Ri(b) * [T'w' / (u'w' * dU/dz)]
+	//
+	// wall values are copied from the nearest interior node
+	//
+	// [u'w', dU/dz, T'w'] have to be known at all [W] nodes, excluding walls
+{
+	// shift the loop off the wall nodes (on the MPI ranks that own them)
+	// to prevent division by zero //
+	const int kbsh = (grid.mpi_com.rank_z == 0) ? 1 : 0;
+	const int ktsh = (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1) ? 1 : 0;
+
+	int k;
+#pragma omp parallel for private(k) shared(Richardson_flux)
+	for (k = grid.gcz + kbsh; k <= grid.nz - grid.gcz - ktsh; k++)	// all [W] nodes, excluding walls
+	{
+		const T shear_production = uw_flux[k] * U_grad[k];
+		Richardson_flux[k] = c_Richardson * (Tw_flux[k] / shear_production);
+	}
+
+	if (kbsh)	// bottom b.c.
+		Richardson_flux[grid.gcz] = Richardson_flux[grid.gcz + 1];
+	if (ktsh)	// top b.c.
+		Richardson_flux[grid.nz - grid.gcz] = Richardson_flux[grid.nz - grid.gcz - 1];
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::reynolds_buoyancy(
+	T* Reynolds_buoyancy,				// node: [C]
+
+	const T* const TKE_iso_dissipation,	// node: [C]
+	const T* const T_grad,				// node: [W]
+	const T c_Richardson, const T c_kinematic_viscosity,
+	const wstGrid3d< T >& grid)
+	//
+	// Re-buoyancy = e(TKE) / (nu * N^2) = e(TKE) / (nu*Ri(b)*dT/dz)
+	//
+	// zero-filled when Ri(b) == 0 (neutral stratification, number undefined)
+	//
+	// [dT/dz] has to be known at all [W] nodes, including walls
+{
+	const bool is_stratified = (fabs(c_Richardson) > 0);	// loop-invariant
+
+	int k;
+#pragma omp parallel for private(k) shared(Reynolds_buoyancy)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		Reynolds_buoyancy[k] = (is_stratified) ?
+			(-TKE_iso_dissipation[k]) /
+			(c_kinematic_viscosity * c_Richardson * (T)0.5 * (T_grad[k] + T_grad[k + 1])) :
+			(T) 0.0;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::froude_horizontal(
+	T* Froude_horizontal,				// node: [C]
+
+	const T* const u_TKE,				// node: [C]
+	const T* const v_TKE,				// node: [C]
+	const T* const TKE_iso_dissipation,	// node: [C]
+	const T* const T_grad,				// node: [W]
+	const T c_Richardson,
+	const wstGrid3d< T >& grid)
+	//
+	// Fr-horizontal = (1/([Ri(b)*dT/dz]^1/2))*[0.5*e(TKE)/(E(u)+E(v))]
+	//
+	// zero-filled when Ri(b) == 0 (neutral stratification, number undefined)
+	//
+	// [dT/dz] has to be known at all [W] nodes, including walls
+{
+	const bool is_stratified = (fabs(c_Richardson) > 0);	// loop-invariant
+
+	int k;
+#pragma omp parallel for private(k) shared(Froude_horizontal)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		Froude_horizontal[k] = (is_stratified) ?
+			((T)1.0 / sqrt(c_Richardson * (T)0.5 * (T_grad[k] + T_grad[k + 1]))) *
+			((-(T)0.5 * TKE_iso_dissipation[k]) / (u_TKE[k] + v_TKE[k])) :
+			(T) 0.0;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::mixing_efficiency(
+	T* _mixing_efficiency,				// node: [C]
+
+	const T* const TKE_iso_dissipation,	// node: [C]
+	const T* const TVA_iso_dissipation,	// node: [C]
+	const T* const T_grad,				// node: [W]
+	const T c_Richardson, const wstGrid3d< T >& grid)
+	//
+	// e(TPE) / e(TKE) = Ri(b) * [e(TTE) / (e(TKE)*(dT/dz))]
+	//
+	// [dT/dz] has to be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(_mixing_efficiency)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_mixing_efficiency[k] = c_Richardson *
+			(TVA_iso_dissipation[k] / ((T)0.5 * TKE_iso_dissipation[k] * (T_grad[k] + T_grad[k + 1])));
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::turbulence_production_ratio(
+	T* turb_production_ratio,		// node: [C]
+
+	const T* const TKE_production,	// node: [C]
+	const T* const TVA_production,	// node: [C]
+	const wstGrid3d< T >& grid)
+	//
+	// P(TKE) / P(TVA)
+	//
+{
+	int k;
+#pragma omp parallel for private(k) shared(turb_production_ratio)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		turb_production_ratio[k] = TKE_production[k] / TVA_production[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// pressure-strain models
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::Rotta_model(
+	T* u_Rotta, T* v_Rotta, T* w_Rotta,		// node: [C]
+	T* uw_Rotta,							// node: [C]
+
+	const T* const TKE,														// node: [C]
+	const T* const TKE_iso_dissipation,										// node: [C]
+	const T* const u_TKE, const T* const v_TKE, const T* const w_TKE,		// node: [C]
+	const T* const uw_flux,													// node: [W]
+	
+	const T* const u_TKE_exchange,	// node: [C]
+	const T* const v_TKE_exchange,	// node: [C] 
+	const T* const w_TKE_exchange,	// node: [C]
+	const T* const P2Suw_turb_c,	// node: [�]
+	const wstGrid3d< T >& grid)
+	//
+	// Rotta "return-to-isotropy" model constants estimation
+	// Qii = (2/3)*(C(r)/t(T))*(TKE - 3*TKE(i))
+	//
+	// [u'w'] has to be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(u_Rotta, v_Rotta, w_Rotta, uw_Rotta)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		u_Rotta[k] = (T)3.0 * (u_TKE_exchange[k] / (-TKE_iso_dissipation[k])) *
+			(TKE[k] / (TKE[k] - (T)3.0 * u_TKE[k]));
+		v_Rotta[k] = (T)3.0 * (v_TKE_exchange[k] / (-TKE_iso_dissipation[k])) *
+			(TKE[k] / (TKE[k] - (T)3.0 * v_TKE[k]));
+		w_Rotta[k] = (T)3.0 * (w_TKE_exchange[k] / (-TKE_iso_dissipation[k])) *
+			(TKE[k] / (TKE[k] - (T)3.0 * w_TKE[k]));
+
+		uw_Rotta[k] = (P2Suw_turb_c[k] / (-TKE_iso_dissipation[k])) *
+			(TKE[k] / (-(T)0.5 * (uw_flux[k] + uw_flux[k + 1])));
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
template< typename T >
void nse::RDT_model(T* u_RDT, T* v_RDT, T* w_RDT,	// node: [C] node
	T* uw_RDT,										// node: [C] node

	const T* const TKE_production,											// node: [C]
	const T* const W2_w,													// node: [W]
	const T* const U,														// node: [C]
	const T* const W,														// node: [W]

	const T* const u_TKE_exchange,	// node: [C]
	const T* const v_TKE_exchange,	// node: [C] 
	const T* const w_TKE_exchange,	// node: [C]
	const T* const P2Suw_turb_c,	// node: [C] -- assumed; original node tag garbled, TODO confirm
	const wstGrid3d< T >& grid)
	//
	// RDT "return-to-isotropy" model constants estimation
	// Qii = - (2/3)*C(p)*((3/2)*Pi - P)
	//
	// [W, W'^2] have to be known at all [W] nodes, including walls
	// [U] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	T Guw;	// per-k shear-production-like term for the u'w' component, see below
	int k;
#pragma omp parallel for private(k, Guw) shared(u_RDT, v_RDT, w_RDT, uw_RDT)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// diagonal constants: exchange terms normalized by the TKE production,
		// with the anisotropic weights (2P for u, -P for v and w) of the model
		u_RDT[k] = -(T)3.0 * (u_TKE_exchange[k] / ((T) 2.0 * TKE_production[k]));
		v_RDT[k] = (T)3.0 * (v_TKE_exchange[k] / (TKE_production[k]));
		w_RDT[k] = (T)3.0 * (w_TKE_exchange[k] / (TKE_production[k]));

		// Guw ~ sum over the two [W] nodes bounding cell k of
		//   w'w' * dU/dz, with w'w' = W2_w - W*W and dU/dz one-sided over dzm
		Guw =
			(W2_w[k] - W[k] * W[k]) *
			(U[k] - U[k - 1]) * grid.dzmi[k] +

			(W2_w[k + 1] - W[k + 1] * W[k + 1]) *
			(U[k + 1] - U[k]) * grid.dzmi[k + 1];

		// off-diagonal constant: pressure-strain term normalized by Guw
		uw_RDT[k] = P2Suw_turb_c[k] / Guw;
	}
}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::Rotta_RDT_model(T* Rotta_RDT_e, T* Rotta_RDT_p,	// node: [C]
+
+	const T* const TKE,														// node: [C]
+	const T* const TKE_iso_dissipation,										// node: [C]
+	const T* const TKE_production,											// node: [C]
+	const T* const u_TKE, const T* const v_TKE, const T* const w_TKE,		// node: [C]
+	
+	const T* const u_TKE_exchange,	// node: [C]
+	const T* const v_TKE_exchange,	// node: [C] 
+	const T* const w_TKE_exchange,	// node: [C]
+	const wstGrid3d< T >& grid)
+	//
+	// Rotta-RDT "return-to-isotropy" model constants estimation
+	// Qii = (2/3)*(C(r)/t(T))*(TKE - 3*TKE(i)) - (2/3)*C(p)*((3/2)*Pi - P)
+	//  (*) 2 equations suffice for constant determination
+	//
+{
+	T Eu, Ev, Gu, Gv, Qu, Qv;
+	int k;
+#pragma omp parallel for private(k, Eu, Ev, Gu, Gv, Qu, Qv) \
+	shared(Rotta_RDT_e, Rotta_RDT_p)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+
+		// using Quu, Qvv equations only //
+		Eu = (T) 3.0 * u_TKE[k] - TKE[k];
+		Ev = (T) 3.0 * v_TKE[k] - TKE[k];
+
+		Gu = (T) 2.0 * TKE_production[k];
+		Gv = -TKE_production[k];
+
+		Qu = u_TKE_exchange[k];
+		Qv = v_TKE_exchange[k];
+
+		Rotta_RDT_e[k] = (T) 3.0 * (TKE[k] / (-TKE_iso_dissipation[k])) *
+			((Gv * Qu - Gu * Qv) / (Ev * Gu - Eu * Gv));
+		Rotta_RDT_p[k] = (T) 3.0 *
+			((Eu * Qv - Ev * Qu) / (Ev * Gu - Eu * Gv));
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::Rotta_buoyancy_model(
+	T* Rotta_buoyancy_e, T* Rotta_buoyancy_b,	// node: [C]
+
+	const T* const TKE,														// node: [C]
+	const T* const TKE_iso_dissipation,										// node: [C]
+	const T* const TKE_heat_flux,											// node: [C]
+	const T* const u_TKE, const T* const v_TKE, const T* const w_TKE,		// node: [C]
+	
+	const T* const u_TKE_exchange,	// node: [C]
+	const T* const v_TKE_exchange,	// node: [C] 
+	const T* const w_TKE_exchange,	// node: [C]
+	const T c_Richardson, const wstGrid3d< T >& grid)
+	//
+	// Rotta-buoyancy "return-to-isotropy" model constants estimation
+	// Qii = (2/3)*(C(r)/t(T))*(TKE - 3*TKE(i)) - (2/3)*C(b)*((3/2)*Bi - B)
+	//  (*) 2 equations suffice for constant determination
+	//
+{
+	T Eu, Ew, Bu, Bw, Qu, Qw;
+	int k;
+
+	if (fabs(c_Richardson) > 0) {
+#pragma omp parallel for private(k, Eu, Ew, Bu, Bw, Qu, Qw) \
+	shared(Rotta_buoyancy_e, Rotta_buoyancy_b)
+		for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+
+			// using Quu, Qww equations only //
+			Eu = (T) 3.0 * u_TKE[k] - TKE[k];
+			Ew = (T) 3.0 * w_TKE[k] - TKE[k];
+
+			Bu = -TKE_heat_flux[k];
+			Bw = (T) 2.0 * TKE_heat_flux[k];
+
+			Qu = u_TKE_exchange[k];
+			Qw = w_TKE_exchange[k];
+
+			Rotta_buoyancy_e[k] = (T) 3.0 * (TKE[k] / (-TKE_iso_dissipation[k])) *
+				((Bw * Qu - Bu * Qw) / (Ew * Bu - Eu * Bw));
+			Rotta_buoyancy_b[k] = (T) 3.0 *
+				((Eu * Qw - Ew * Qu) / (Ew * Bu - Eu * Bw));
+		}
+	}
+	else
+	{
+#pragma omp parallel for private(k) shared(Rotta_buoyancy_e, Rotta_buoyancy_b)
+		for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+			Rotta_buoyancy_e[k] = (T) 0;
+			Rotta_buoyancy_b[k] = (T) 0;
+		}
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::RDT_buoyancy_model(
+	T* RDT_buoyancy_p, T* RDT_buoyancy_b,		// node: [C]
+
+	const T* const TKE_production,											// node: [C]
+	const T* const TKE_heat_flux,											// node: [C]
+
+	const T* const u_TKE_exchange,	// node: [C]
+	const T* const v_TKE_exchange,	// node: [C] 
+	const T* const w_TKE_exchange,	// node: [C]
+	const T c_Richardson, const wstGrid3d< T >& grid)
+	//
+	// RDT-buoyancy "return-to-isotropy" model constants estimation
+	// Qii = - (2/3)*C(p)*((3/2)*Pi - P) - (2/3)*C(b)*((3/2)*Bi - B)
+	//  (*) 2 equations suffice for constant determination
+	//
+{
+	T Gu, Gw, Bu, Bw, Qu, Qw;
+	int k;
+
+	if (fabs(c_Richardson) > 0) {
+#pragma omp parallel for private(k, Gu, Gw, Bu, Bw, Qu, Qw) \
+	shared(RDT_buoyancy_p, RDT_buoyancy_b)
+		for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+
+			// using Quu, Qww equations only //
+			Gu = (T) 2.0 * TKE_production[k];
+			Gw = -TKE_production[k];
+
+			Bu = -TKE_heat_flux[k];
+			Bw = (T) 2.0 * TKE_heat_flux[k];
+
+			Qu = u_TKE_exchange[k];
+			Qw = w_TKE_exchange[k];
+
+			RDT_buoyancy_p[k] = (T) 3.0 *
+				((Bw * Qu - Bu * Qw) / (Gw * Bu - Gu * Bw));
+			RDT_buoyancy_b[k] = (T) 3.0 *
+				((Gu * Qw - Gw * Qu) / (Gw * Bu - Gu * Bw));
+		}
+	}
+	else
+	{
+#pragma omp parallel for private(k) shared(RDT_buoyancy_p, RDT_buoyancy_b)
+		for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+			RDT_buoyancy_p[k] = (T)0;
+			RDT_buoyancy_b[k] = (T)0;
+		}
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
template< typename T >
void nse::Rotta_TPE_model(
	T* u_Rotta_TPE, T* v_Rotta_TPE, T* w_Rotta_TPE,		// node: [C]

	const T* const TKE,						// node: [C]
	const T* const TPE,						// node: [C]
	const T* const TKE_iso_dissipation,		// node: [C]
	const T* const TPE_iso_dissipation,		// node: [C]
	const T* const u_TKE, const T* const v_TKE, const T* const w_TKE,		// node: [C]

	const T* const u_TKE_exchange,	// node: [C]
	const T* const v_TKE_exchange,	// node: [C] 
	const T* const w_TKE_exchange,	// node: [C]
	const wstGrid3d< T >& grid)
	//
	// Rotta-TPE "return-to-isotropy" model constants estimation
	// Qii = (2/3)*(C(r)/t(TKE+TPE))*((TKE+TPE) - 3*(TKE(i) + TPE[i,3]))
	//
	// NOTE: per the formula above, TPE enters the per-component energy only in
	// the vertical (i = 3) component -- hence the (w_TKE + TPE) term below while
	// the u, v anisotropy terms use u_TKE, v_TKE alone; the time scale and the
	// total energy use TKE + TPE in all three components.
{
	int k;
#pragma omp parallel for private(k) shared(u_Rotta_TPE, v_Rotta_TPE, w_Rotta_TPE)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// each constant: exchange term normalized by the total (TKE + TPE)
		// dissipation, times total energy over the component anisotropy
		u_Rotta_TPE[k] = (T)3.0 * (u_TKE_exchange[k] / (-TKE_iso_dissipation[k] - TPE_iso_dissipation[k])) *
			((TKE[k] + TPE[k]) / ((TKE[k] + TPE[k]) - (T)3.0 * u_TKE[k]));
		v_Rotta_TPE[k] = (T)3.0 * (v_TKE_exchange[k] / (-TKE_iso_dissipation[k] - TPE_iso_dissipation[k])) *
			((TKE[k] + TPE[k]) / ((TKE[k] + TPE[k]) - (T)3.0 * v_TKE[k]));
		w_Rotta_TPE[k] = (T)3.0 * (w_TKE_exchange[k] / (-TKE_iso_dissipation[k] - TPE_iso_dissipation[k])) *
			((TKE[k] + TPE[k]) / ((TKE[k] + TPE[k]) - (T)3.0 * (w_TKE[k] + TPE[k])));
	}
}
+// -------------------------------------------------------------------------------------------- //
diff --git a/bl-turb-scalar.h b/bl-turb-scalar.h
new file mode 100644
index 0000000000000000000000000000000000000000..5cf7a0a1b01f40f032014802c3deb45b04625d1c
--- /dev/null
+++ b/bl-turb-scalar.h
@@ -0,0 +1,954 @@
+#pragma once
+
+// [bl-turb-scalar.h]: boundary-layer scalar turbulence statistics and budgets
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "wstgrid3d.h"
+
+//
+// *[Note]: we need gcz >= 2 for computation of production terms:
+//         _
+//        dC
+// - w'w' --
+//        dz
+//
+
+
namespace nse
{
	// Declarations of scalar-turbulence statistics; definitions follow below.
	// Naming: SVA = scalar variance, TVA = temperature variance, TPE = turbulent
	// potential energy; node tags [C]/[W] give the staggered-grid location.

	// Heat eq. balance
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void heat_eq(
		T* heat_balance,				// node: [W]
		T* turbulent_heat_flux,			// node: [W]
		T* heat_stress,					// node: [W]

		const T* const Tw_flux,			// node: [W]
		const T* const T_grad,			// node: [W]
		const T c_diffusivity, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// SVA production
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void SVA_production(T* _SVA_production,		// node: [C]

		const T* const CW_bottom,				// node: [C (W -- C)]
		const T* const CW_top,					// node: [C (W -- C)]
		const T* const C,						// node: [C]
		const T* const W,						// node: [W]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// SVA transport
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void SVA_transport(T* _SVA_transport,	// node: [C]

		const T* const cc_w_flux,			// node: [W]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// SVA dissipation
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void SVA_dissipation(T* _SVA_dissipation,	// node: [C]

		const T* const C_dissipation,			// node: [C]
		const T* const C,						// node: [C]
		const T c_diffusivity, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// SVA iso dissipation
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void SVA_iso_dissipation(T* _SVA_iso_dissipation,	// node: [C]

		const T* const C_iso_dissipation,				// node: [C]
		const T* const C,								// node: [C]
		const T c_diffusivity, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //


	// c'ui' flux budget: production
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void cu_production_shear(T* _cu_production_shear,	// node: [C]

		const T* const CW_bottom_u,		// node: [C (W -- C)]
		const T* const CW_top_u,		// node: [C (W -- C)]
		const T* const C,				// node: [C]
		const T* const U,				// node: [C]
		const T* const W,				// node: [W]
		const wstGrid3d< T >& grid);

	template< typename T >
	void cu_production_gradC(T* _cu_production_gradC,	// node: [C]

		const T* const UW_bottom,		// node: [C (W -- C)]
		const T* const UW_top,			// node: [C (W -- C)]
		const T* const C,				// node: [C]
		const T* const U,				// node: [C]
		const T* const W,				// node: [W]
		const wstGrid3d< T >& grid);


	template< typename T >
	void cv_production_shear(T* _cv_production_shear,	// node: [C]

		const T* const CW_bottom_v,		// node: [C (W -- C)]
		const T* const CW_top_v,		// node: [C (W -- C)]
		const T* const C,				// node: [C]
		const T* const V,				// node: [C]
		const T* const W,				// node: [W]
		const wstGrid3d< T >& grid);

	template< typename T >
	void cv_production_gradC(T* _cv_production_gradC,	// node: [C]

		const T* const VW_bottom,		// node: [C (W -- C)]
		const T* const VW_top,			// node: [C (W -- C)]
		const T* const C,				// node: [C]
		const T* const V,				// node: [C]
		const T* const W,				// node: [W]
		const wstGrid3d< T >& grid);


	template< typename T >
	void cw_production_shear(T* _cw_production_shear,	// node: [W]

		const T* const CW_bottom_w,		// node: [W (C -- W)]
		const T* const CW_top_w,		// node: [W (C -- W)]
		const T* const C,				// node: [C] -- was tagged [W]; definition takes C at [C] nodes
		const T* const W,				// node: [W] -- was tagged [C]; definition takes W at [W] nodes
		const wstGrid3d< T >& grid);

	template< typename T >
	void cw_production_gradC(T* _cw_production_gradC,	// node: [W]

		const T* const W2_w,			// node: [W]
		const T* const W2_c,			// node: [C]
		const T* const C,				// node: [C]
		const T* const W,				// node: [W]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// c'ui' flux budget: transport
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void cu_transport(T* _cu_transport,		// node: [C]

		const T* const cuw_flux,			// node: [W]
		const wstGrid3d< T >& grid);

	template< typename T >
	void cv_transport(T* _cv_transport,		// node: [C]

		const T* const cvw_flux,			// node: [W]
		const wstGrid3d< T >& grid);

	template< typename T >
	void cw_transport(T* _cw_transport,		// node: [W]

		const T* const cww_flux,			// node: [C]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// c'ui' flux budget: pressure scrambles
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void cw_pressure_work(T* _cw_pressure_work,		// node: [W]

		const T* const cp_flux,						// node: [C]
		const wstGrid3d< T >& grid);


	template< typename T >
	void cu_pressure_gradc(T* _cu_pressure_gradc,	// node: [C]
		// computing as: 
		//   p'*dc'/dx = { d(c'p')/dx = 0 } - c'*dp'/dx
		//											

		const T* const c_dpdx_turb,					// node: [C]
		const wstGrid3d< T >& grid);

	template< typename T >
	void cv_pressure_gradc(T* _cv_pressure_gradc,	// node: [C]
		// computing as: 
		//   p'*dc'/dy = { d(c'p')/dy = 0 } - c'*dp'/dy
		//											

		const T* const c_dpdy_turb,					// node: [C]
		const wstGrid3d< T >& grid);

	template< typename T >
	void cw_pressure_gradc(T* _cw_pressure_gradc,	// node: [W]
		// computing as: 
		//   p'*dc'/dz = d(c'p')/dz - c'*dp'/dz
		//

		const T* const _cw_pressure_work,			// node: [W]
		const T* const c_dpdz_turb,					// node: [W]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// c'ui' flux budget: dissipation
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void cu_dissipation(T* _cu_dissipation,			// node: [C]

		const T* const CU_dissipation,				// node: [C]
		const T* const C,							// node: [C]
		const T* const U,							// node: [C]
		const T c_diffusivity, const T c_kinematic_viscosity,
		const wstGrid3d< T >& grid);

	template< typename T >
	void cv_dissipation(T* _cv_dissipation,			// node: [C]

		const T* const CV_dissipation,				// node: [C]
		const T* const C,							// node: [C]
		const T* const V,							// node: [C]
		const T c_diffusivity, const T c_kinematic_viscosity,
		const wstGrid3d< T >& grid);

	template< typename T >
	void cw_dissipation(T* _cw_dissipation,			// node: [W]

		const T* const CW_dissipation,				// node: [W]
		const T* const C,							// node: [C]
		const T* const W,							// node: [W]
		const T c_diffusivity, const T c_kinematic_viscosity,
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// c'w' flux budget: buoyancy 
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void cw_buoyancy(T* _cw_buoyancy,		// node: [W]

		const T* const C2_c,				// node: [C]
		const T* const C2_w,				// node: [W]
		const T* const C,					// node: [C]
		const T c_Richardson, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //


	// TPE structure
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void TPE_structure(T* TPE,							// node: [C]
		T* TPE_heat_flux, T* TPE_diffusion,				// node: [C]
		T* TPE_dissipation, T* TPE_iso_dissipation,		// node: [C]

		const T* const TVA_production,			// node: [C]
		const T* const TVA_diffusion,			// node: [C]
		const T* const TVA_dissipation,			// node: [C]
		const T* const TVA_iso_dissipation,		// node: [C]

		const T* const T2,						// node: [C]
		const T* const Tc,						// node: [C]
		const T* const T_grad,					// node: [W]
		const T c_Richardson, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// Energy structure
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void energy_structure(T* TKE_share, T* TPE_share,	// node: [C]

		const T* const TKE,		// node: [C]
		const T* const TPE,		// node: [C]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //
}
+
+
+// Heat eq. balance
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::heat_eq(
+	T* heat_balance,				// node: [W]
+	T* turbulent_heat_flux,			// node: [W]
+	T* heat_stress,					// node: [W]
+
+	const T* const Tw_flux,			// node: [W]
+	const T* const T_grad,			// node: [W]
+	const T c_diffusivity, const wstGrid3d< T >& grid)
+	// integrated [T] averaged heat equation
+	//                  _
+	//    ____   1  1  dT
+	//	- T'w' + -- -- -- = const 
+	//			 Re Pr dz
+	//    (1)      (2)
+	// (1) - turbulent heat flux
+	// (2) - heat stress
+	// terms are defined at [W] nodes (averaged in [W] nodes)
+	//
+	// [T'w', dT/dz] have to be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(heat_balance, \
+	turbulent_heat_flux, heat_stress)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) { // all [W] nodes
+		turbulent_heat_flux[k] = -Tw_flux[k];
+		heat_stress[k] = c_diffusivity * T_grad[k];
+
+		heat_balance[k] = turbulent_heat_flux[k] + heat_stress[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// SVA production
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::SVA_production(
+	T* _SVA_production,			// node: [C]
+
+	const T* const CW_bottom,	// node: [C (W -- C)]
+	const T* const CW_top,		// node: [C (W -- C)]
+	const T* const C,			// node: [C]
+	const T* const W,			// node: [W]
+	const wstGrid3d< T >& grid)
+	// production term of scalar variance equation defined at [C] node
+	//            _
+	//    ____   dC
+	//	- c'w' * -- 
+	//			 dz
+	// [W] average has to be known at all [W] nodes, including walls
+	// [C] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_SVA_production)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_SVA_production[k] = -(T) 0.5 * (
+			// discretization is based on ADV. form //
+			// correct for SKEW.form (checked) //
+			(CW_bottom[k] - C[k] * W[k]) * (C[k] - C[k - 1]) * grid.dzi[k] +
+			(CW_top[k] - C[k] * W[k + 1]) * (C[k + 1] - C[k]) * grid.dzi[k]);
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// SVA transport
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::SVA_transport(
+	T* _SVA_transport,			// node: [C]
+
+	const T* const cc_w_flux,	// node: [W]
+	const wstGrid3d< T >& grid)
+	// transport term of scalar variance equation defined at [C] node
+	//       ______________
+	//      d[(c')^2 * w')] 
+	//	-  ---------------- 
+	//			 dz
+	// [c'c'w'] flux has to be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(_SVA_transport)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_SVA_transport[k] = -(T) 0.5 *
+			(cc_w_flux[k + 1] - cc_w_flux[k]) * grid.dzi[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// SVA dissipation
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::SVA_dissipation(
+	T* _SVA_dissipation,			// node: [C]
+
+	const T* const C_dissipation,	// node: [C]
+	const T* const C,				// node: [C]
+	const T c_diffusivity, const wstGrid3d< T >& grid)
+	// dissipation component of scalar variance equation defined at [C] node
+	//       ___________
+	//          d^2(c')
+	//	 k * c' ------- 
+	//			dx(j)^2
+	// [C] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_SVA_dissipation)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_SVA_dissipation[k] = C_dissipation[k] -
+			c_diffusivity * (C[k] *
+			((C[k + 1] - C[k]) * grid.dzp2i[k] - (C[k] - C[k - 1]) * grid.dzm2i[k]));
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// SVA iso dissipation
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::SVA_iso_dissipation(
+	T* _SVA_iso_dissipation,			// node: [C]
+
+	const T* const C_iso_dissipation,	// node: [C]
+	const T* const C,					// node: [C]
+	const T c_diffusivity, const wstGrid3d< T >& grid)
+	// isotropic dissipation component of scalar variance equation defined at [C] node
+	//       _____________
+	//       d(c')   d(c')
+	// - k * ----- * -----  
+	//		 dx(j)   dx(j)
+	// [C] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_SVA_iso_dissipation)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_SVA_iso_dissipation[k] = -(C_iso_dissipation[k] -
+			c_diffusivity * (
+			(T)0.5 * (C[k + 1] - C[k]) * (C[k + 1] - C[k]) * grid.dzp2i[k] +
+			(T)0.5 * (C[k] - C[k - 1]) * (C[k] - C[k - 1]) * grid.dzm2i[k]));
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// c'ui' flux budget: production
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::cu_production_shear(
+	T* _cu_production_shear,		// node: [C]
+
+	const T* const CW_bottom_u,		// node: [C (W -- C)]
+	const T* const CW_top_u,		// node: [C (W -- C)]
+	const T* const C,				// node: [C]
+	const T* const U,				// node: [C]
+	const T* const W,				// node: [W]
+	const wstGrid3d< T >& grid)
+	//
+	// production [shear] term of [c'u'] budget equation defined at [C] node
+	//           _
+	//   ____   dU
+	// - c'w' * --
+	//          dz
+	//
+	// [W] average has to be known at all [W] nodes, including walls
+	// [U] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_cu_production_shear)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_cu_production_shear[k] = -(T)0.5 * (
+			// discretization is based on ADV. form //
+			(CW_bottom_u[k] - C[k] * W[k]) * (U[k] - U[k - 1]) * grid.dzi[k] +
+			(CW_top_u[k] - C[k] * W[k + 1]) * (U[k + 1] - U[k]) * grid.dzi[k]);
+	}
+}
+
+template< typename T >
+void nse::cu_production_gradC(
+	T* _cu_production_gradC,		// node: [C]
+
+	const T* const UW_bottom,		// node: [C (W -- C)]
+	const T* const UW_top,			// node: [C (W -- C)]
+	const T* const C,				// node: [C]
+	const T* const U,				// node: [C]
+	const T* const W,				// node: [W]
+	const wstGrid3d< T >& grid)
+	//
+	// production [gradC] term of [c'u'] budget equation defined at [C] node
+	//           _
+	//   ____   dC
+	// - u'w' * --
+	//          dz
+	//
+	// [W] average has to be known at all [W] nodes, including walls
+	// [C] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_cu_production_gradC)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_cu_production_gradC[k] = -(T)0.5 * (
+			// discretization is based on ADV. form //
+			(UW_bottom[k] - U[k] * W[k]) * (C[k] - C[k - 1]) * grid.dzi[k] +
+			(UW_top[k] - U[k] * W[k + 1]) * (C[k + 1] - C[k]) * grid.dzi[k]);
+	}
+}
+
+
+template< typename T >
+void nse::cv_production_shear(
+	T* _cv_production_shear,		// node: [C]
+
+	const T* const CW_bottom_v,		// node: [C (W -- C)]
+	const T* const CW_top_v,		// node: [C (W -- C)]
+	const T* const C,				// node: [C]
+	const T* const V,				// node: [C]
+	const T* const W,				// node: [W]
+	const wstGrid3d< T >& grid)
+	//
+	// production [shear] term of [c'v'] budget equation defined at [C] node
+	//           _
+	//   ____   dV
+	// - c'w' * --
+	//          dz
+	//
+	// [W] average has to be known at all [W] nodes, including walls
+	// [V] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_cv_production_shear)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_cv_production_shear[k] = -(T)0.5 * (
+			// discretization is based on ADV. form //
+			(CW_bottom_v[k] - C[k] * W[k]) * (V[k] - V[k - 1]) * grid.dzi[k] +
+			(CW_top_v[k] - C[k] * W[k + 1]) * (V[k + 1] - V[k]) * grid.dzi[k]);
+	}
+}
+
+template< typename T >
+void nse::cv_production_gradC(
+	T* _cv_production_gradC,		// node: [C]
+
+	const T* const VW_bottom,		// node: [C (W -- C)]
+	const T* const VW_top,			// node: [C (W -- C)]
+	const T* const C,				// node: [C]
+	const T* const V,				// node: [C]
+	const T* const W,				// node: [W]
+	const wstGrid3d< T >& grid)
+	//
+	// production [gradC] term of [c'v'] budget equation defined at [C] node
+	//           _
+	//   ____   dC
+	// - v'w' * --
+	//          dz
+	//
+	// [W] average has to be known at all [W] nodes, including walls
+	// [C] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_cv_production_gradC)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_cv_production_gradC[k] = -(T)0.5 * (
+			// discretization is based on ADV. form //
+			(VW_bottom[k] - V[k] * W[k]) * (C[k] - C[k - 1]) * grid.dzi[k] +
+			(VW_top[k] - V[k] * W[k + 1]) * (C[k + 1] - C[k]) * grid.dzi[k]);
+	}
+}
+
+
template< typename T >
void nse::cw_production_shear(
	T* _cw_production_shear,		// node: [W]

	const T* const CW_bottom_w,		// node: [W (C -- W)]
	const T* const CW_top_w,		// node: [W (C -- W)]
	const T* C,						// node: [C]
	const T* W,						// node: [W]
	const wstGrid3d< T >& grid)
	//
	// production [shear] term of [c'w'] budget equation defined at [W] node
	//           _
	//   ____   dW
	// - c'w' * --
	//          dz
	//
	// [W] average has to be known at all [W] nodes, including walls
{
	int k;
#pragma omp parallel for private(k) shared(_cw_production_shear)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {	// all [W] nodes, excluding walls
		// turbulent flux c'w' = <cw> - <c><w>; the mean product interpolates
		// C to the [W] node as 0.5*(C[k] + C[k-1]) and W to the lower/upper
		// half as 0.5*(W[k] + W[k-1]) / 0.5*(W[k] + W[k+1]) -- hence the 0.25;
		// each half is paired with its one-sided dW/dz difference
		// (dzmi presumably carries the half-spacing inverse -- see wstgrid3d.h)
		_cw_production_shear[k] = - (
			// discretization is based on ADV. form //
			(CW_bottom_w[k] - (T)0.25 * (C[k] + C[k - 1]) * (W[k] + W[k - 1])) * (W[k] - W[k - 1]) * grid.dzmi[k] +
			(CW_top_w[k] - (T)0.25 * (C[k] + C[k - 1]) * (W[k] + W[k + 1])) * (W[k + 1] - W[k]) * grid.dzmi[k]);
	}

	// walls: w' = 0 -> the production term vanishes at both boundary [W] nodes
	w_dirichlet_bc_z(_cw_production_shear, (T)0, (T)0, grid);
}
+
template< typename T >
void nse::cw_production_gradC(
	T* _cw_production_gradC,	// node: [W]

	const T* const W2_w,		// node: [W]
	const T* const W2_c,		// node: [C]
	const T* C,					// node: [C]
	const T* W,					// node: [W]
	const wstGrid3d< T >& grid)
	//
	// production [gradC] term of [c'w'] budget equation defined at [W] node
	//           _
	//   ____   dC
	// - w'w' * --
	//          dz
	//
	// [W^2, W] averages have to be known at all [W] nodes, including walls
	// [C] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	int k;
#pragma omp parallel for private(k) shared(_cw_production_gradC)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {	// all [W] nodes, excluding walls
		// three-stencil average around the [W] node k: the w'w' variance
		// (<ww> - <w><w>) from cell [k], cell [k-1] and the face itself,
		// each paired with the matching one-sided dC/dz difference;
		// dziq presumably holds quarter-weighted inverse spacings -- confirm
		// against wstgrid3d.h; the C[k - 2] access is why the file header
		// requires gcz >= 2
		_cw_production_gradC[k] = - (
			// discretization is based on ADV. form //
			(W2_c[k] - W[k] * W[k + 1]) * (C[k + 1] - C[k]) * grid.dziq[k] +
			(W2_c[k - 1] - W[k] * W[k - 1]) * (C[k - 1] - C[k - 2]) * grid.dziq[k - 1] +

			(W2_w[k] - W[k] * W[k]) * (C[k] - C[k - 1]) * (grid.dziq[k] + grid.dziq[k - 1]));
	}

	// walls: w' = 0 -> the production term vanishes at both boundary [W] nodes
	w_dirichlet_bc_z(_cw_production_gradC, (T)0, (T)0, grid);
}
+// -------------------------------------------------------------------------------------------- //
+
+// c'ui' flux budget: transport
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::cu_transport(T* _cu_transport,	// node: [C]
+
+	const T* const cuw_flux,				// node: [W]
+	const wstGrid3d< T >& grid)
+	//
+	// transport term of [c'u'] equation defined at [C] node
+	//       ______________
+	//      d[(c'u') * u'] 
+	//	-  ---------------- 
+	//			 dz
+	// [c'u'w'] flux has be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(_cu_transport)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_cu_transport[k] = -(cuw_flux[k + 1] - cuw_flux[k]) * grid.dzi[k];
+	}
+}
+
+template< typename T >
+void nse::cv_transport(T* _cv_transport,	// node: [C]
+
+	const T* const cvw_flux,				// node: [W]
+	const wstGrid3d< T >& grid)
+	//
+	// transport term of [c'v'] equation defined at [C] node
+	//       ______________
+	//      d[(c'v') * v'] 
+	//	-  ---------------- 
+	//			 dz
+	// [c'v'w'] flux has be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(_cv_transport)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_cv_transport[k] = -(cvw_flux[k + 1] - cvw_flux[k]) * grid.dzi[k];
+	}
+}
+
+template< typename T >
+void nse::cw_transport(T* _cw_transport,	// node: [W]
+
+	const T* const cww_flux,				// node: [C]
+	const wstGrid3d< T >& grid)
+	//
+	// transport term of [c'w'] budget equation defined at [W] node
+	//       ______________
+	//      d[(c'w') * w'] 
+	//	-  ---------------- 
+	//			 dz
+	// [c'w'w'] flux has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_cw_transport)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_cw_transport[k] = -(cww_flux[k] - cww_flux[k - 1]) * (T)2.0 * grid.dzmi[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// c'ui' flux budget: pressure scrambles
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::cw_pressure_work(T* _cw_pressure_work,	// node: [W]
+
+	const T* const cp_flux,							// node: [C]
+	const wstGrid3d< T >& grid)
+	//
+	// pressure work term of [c'w'] budget equation defined at [W]
+	//       ______
+	//      d[c'p'] 
+	//	-  -------- 
+	//		  dz
+	// [c'p'] flux has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_cw_pressure_work)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+		_cw_pressure_work[k] = -(cp_flux[k] - cp_flux[k - 1]) * (T)2.0 * grid.dzmi[k];
+	}
+}
+
+
+template< typename T >
+void nse::cu_pressure_gradc(
+	T* _cu_pressure_gradc,			// node: [C]
+
+	const T* const c_dpdx_turb,		// node: [C]
+	const wstGrid3d< T >& grid)
+	//
+	//  p'*dc'/dx = { d(c'p')/dx = 0 } - c'*dp'/dx
+	//
+{
+	int k;
+#pragma omp parallel for private(k) shared(_cu_pressure_gradc)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_cu_pressure_gradc[k] = - c_dpdx_turb[k];
+	}
+}
+
+template< typename T >
+void nse::cv_pressure_gradc(
+	T* _cv_pressure_gradc,				// node: [C]
+
+	const T* const c_dpdy_turb,			// node: [C]
+	const wstGrid3d< T >& grid)
+	//
+	//  p'*dc'/dy = { d(c'p')/dy = 0 } - c'*dp'/dy
+	//
+{
+	int k;
+#pragma omp parallel for private(k) shared(_cv_pressure_gradc)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_cv_pressure_gradc[k] = -c_dpdy_turb[k];
+	}
+}
+
+template< typename T >
+void nse::cw_pressure_gradc(
+	T* _cw_pressure_gradc,					// node: [W]
+
+	const T* const _cw_pressure_work,		// node: [W]
+	const T* const c_dpdz_turb,				// node: [W]
+	const wstGrid3d< T >& grid)
+	//
+	//  p'*dc'/dz = d(c'p')/dz - c'*dp'/dz
+	//
+{
+	int k;
+#pragma omp parallel for private(k) shared(_cw_pressure_gradc)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+		_cw_pressure_gradc[k] = -_cw_pressure_work[k] - c_dpdz_turb[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// c'ui' flux budget: dissipation
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::cu_dissipation(
+	T* _cu_dissipation,					// node: [C]
+
+	const T* const CU_dissipation,		// node: [C]
+	const T* const C,					// node: [C]
+	const T* const U,					// node: [C]
+	const T c_diffusivity, const T c_kinematic_viscosity,
+	const wstGrid3d< T >& grid)
+	//
+	//       ___________       ___________
+	//          d^2(c')           d^2(u')
+	//	 k * u' ------- + nu * c' ------- 
+	//			dx(j)^2           dx(j)^2
+	//
+	// [U, C] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+	T C_diffusion, U_diffusion;
+
+#pragma omp parallel for private(k, C_diffusion, U_diffusion) shared(_cu_dissipation)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		
+		C_diffusion = c_diffusivity *
+			((C[k + 1] - C[k]) * grid.dzp2i[k] - (C[k] - C[k - 1]) * grid.dzm2i[k]);
+		U_diffusion = c_kinematic_viscosity *
+			((U[k + 1] - U[k]) * grid.dzp2i[k] - (U[k] - U[k - 1]) * grid.dzm2i[k]);
+
+		_cu_dissipation[k] = CU_dissipation[k] -
+			(C[k] * U_diffusion + C_diffusion * U[k]);
+	}
+}
+
+template< typename T >
+void nse::cv_dissipation(
+	T* _cv_dissipation,					// node: [C]
+
+	const T* const CV_dissipation,		// node: [C]
+	const T* const C,					// node: [C]
+	const T* const V,					// node: [C]
+	const T c_diffusivity, const T c_kinematic_viscosity,
+	const wstGrid3d< T >& grid)
+	//
+	//       ___________       ___________
+	//          d^2(c')           d^2(v')
+	//	 k * v' ------- + nu * c' ------- 
+	//			dx(j)^2           dx(j)^2
+	//
+	// [V, C] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+	T C_diffusion, V_diffusion;
+
+#pragma omp parallel for private(k, C_diffusion, V_diffusion) shared(_cv_dissipation)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+
+		C_diffusion = c_diffusivity *
+			((C[k + 1] - C[k]) * grid.dzp2i[k] - (C[k] - C[k - 1]) * grid.dzm2i[k]);
+		V_diffusion = c_kinematic_viscosity *
+			((V[k + 1] - V[k]) * grid.dzp2i[k] - (V[k] - V[k - 1]) * grid.dzm2i[k]);
+
+		_cv_dissipation[k] = CV_dissipation[k] -
+			(C[k] * V_diffusion + C_diffusion * V[k]);
+	}
+}
+
template< typename T >
void nse::cw_dissipation(
	T* _cw_dissipation,					// node: [W]

	const T* const CW_dissipation,		// node: [W]
	const T* const C,					// node: [C]
	const T* const W,					// node: [W]
	const T c_diffusivity, const T c_kinematic_viscosity,
	const wstGrid3d< T >& grid)
	//
	//       ___________       ___________
	//          d^2(c')           d^2(w')
	//	 k * w' ------- + nu * c' ------- 
	//			dx(j)^2           dx(j)^2
	//
	// [CW-dissipation, W] averages have to be known at all [W] nodes, including walls
	// [C] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	// scratch 1d profiles taken from the memStx buffer pool, released at the end
	T *C_diffusion, *W_diffusion;
	int c_buf_id = memStx::get_buf(&C_diffusion, grid.nz);
	int w_buf_id = memStx::get_buf(&W_diffusion, grid.nz);

	// zero-fill so entries outside the loop range below are defined
	// (C_diffusion[gcz - 1] is read when assembling the value at k = gcz)
	null(C_diffusion, grid.nz);
	null(W_diffusion, grid.nz);

	int k;

#pragma omp parallel for private(k) shared(C_diffusion, W_diffusion)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// vertical laplacian of the mean scalar at [C] nodes
		C_diffusion[k] = c_diffusivity *
			((C[k + 1] - C[k]) * grid.dzp2i[k] - (C[k] - C[k - 1]) * grid.dzm2i[k]);
		// vertical laplacian of the mean vertical velocity at [W] nodes;
		// NOTE(review): the metric coefficients (dzm2i[k], dzp2i[k - 1]) are
		// swapped relative to the [C]-node stencil -- presumably the staggered
		// spacing, confirm against wstgrid3d.h
		W_diffusion[k] = c_kinematic_viscosity *
			((W[k + 1] - W[k]) * grid.dzm2i[k] - (W[k] - W[k - 1]) * grid.dzp2i[k - 1]);
	}

	// assuming W = 0 & ~linear[laplace(W)] = 0 at walls -> no need for laplace(C) b.c.
	//
	w_dirichlet_bc_z(W_diffusion, (T)0, (T)0, grid);

	// subtract the mean-profile contribution (C and laplace(C) interpolated
	// to the [W] node) from the averaged dissipation correlation
#pragma omp parallel for private(k) shared(_cw_dissipation,\
	C_diffusion, W_diffusion)
	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
		_cw_dissipation[k] = CW_dissipation[k] -
			(T)0.5 * (
			(C[k] + C[k - 1]) * W_diffusion[k] +
			W[k] * (C_diffusion[k] + C_diffusion[k - 1]));
	}

	memStx::free_buf(c_buf_id);
	memStx::free_buf(w_buf_id);
}
+// -------------------------------------------------------------------------------------------- //
+
+// c'w' flux budget: buoyancy
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::cw_buoyancy(
+	T* _cw_buoyancy,			// node: [W]
+
+	const T* const C2_c,		// node: [C]
+	const T* const C2_w,		// node: [W]
+	const T* const C,			// node: [C]
+	const T c_Richardson, const wstGrid3d< T >& grid)
+	//
+	//      ____
+	// Ri * c'c'
+	//
+	// [C^2 at W-node] average has to be known at all [W] nodes, including walls
+	// [C, C^2 at C-node] averages have be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_cw_buoyancy)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+		_cw_buoyancy[k] = (T)0.25 * c_Richardson * (
+			C2_c[k] - C[k] * C[k] +
+			C2_c[k - 1] - C[k - 1] * C[k - 1] + 
+			(T)2.0 * (C2_w[k] - C[k] * C[k - 1])
+			);
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+
+// TPE structure
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::TPE_structure(
+	T* TPE,											// node: [C]
+	T* TPE_heat_flux, T* TPE_diffusion,				// node: [C]
+	T* TPE_dissipation, T* TPE_iso_dissipation,		// node: [C]
+
+	const T* const TVA_production,		// node: [C]
+	const T* const TVA_diffusion,		// node: [C]
+	const T* const TVA_dissipation,		// node: [C]
+	const T* const TVA_iso_dissipation,	// node: [C]
+
+	const T* const T2,					// node: [C]
+	const T* const Tc,					// node: [C]
+	const T* const T_grad,				// node: [W]
+	const T c_Richardson, const wstGrid3d< T >& grid)
+	//
+	// TPE = 1/2*T'T'*Ri(b)/[dT/dz]
+	// TPE balance: [temperature variance] * (Ri(b)/[dT/dz])
+	//
+	// [dT/dz] has to be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(TPE,\
+	TPE_heat_flux, TPE_diffusion, TPE_dissipation, TPE_iso_dissipation)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		TPE[k] = (T)0.5 * (T2[k] - Tc[k] * Tc[k]) *
+			(c_Richardson / ((T)0.5 * (T_grad[k] + T_grad[k + 1])));
+
+		TPE_heat_flux[k] = TVA_production[k] *
+			(c_Richardson / ((T)0.5 * (T_grad[k] + T_grad[k + 1])));
+		TPE_diffusion[k] = TVA_diffusion[k] *
+			(c_Richardson / ((T)0.5 * (T_grad[k] + T_grad[k + 1])));
+		TPE_dissipation[k] = TVA_dissipation[k] *
+			(c_Richardson / ((T)0.5 * (T_grad[k] + T_grad[k + 1])));
+		TPE_iso_dissipation[k] = TVA_iso_dissipation[k] *
+			(c_Richardson / ((T)0.5 * (T_grad[k] + T_grad[k + 1])));
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// Energy structure
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::energy_structure(
+	T* TKE_share, T* TPE_share,		// node: [C]
+
+	const T* const TKE,				// node: [C]
+	const T* const TPE,				// node: [C]
+	const wstGrid3d< T >& grid)
+	// 
+	// TKE-TPE shares
+	//
+{
+	int k;
+#pragma omp parallel for private(k) shared(TKE_share, TPE_share)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		TKE_share[k] = TKE[k] / (TKE[k] + TPE[k]);
+		TPE_share[k] = TPE[k] / (TKE[k] + TPE[k]);
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+
diff --git a/bl-turb.h b/bl-turb.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e0943e98b89be50a001e6f8b7f881fccc95d2d3
--- /dev/null
+++ b/bl-turb.h
@@ -0,0 +1,1774 @@
+#pragma once
+
+// [bl-turb.h]: boundary-layer turbulence statistics and budgets
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "wstgrid3d.h"
+
+//
+// *[Note]: we need gcz >= 2 for computation of production terms:
+//         _            _
+//        dU           dV
+// - w'w' -- ,  - w'w' --
+//        dz           dz
+//
+
+
namespace nse
{
	// notation used throughout: [C] - cell-center node, [W] - cell-face node
	// in z; all profiles are 1d arrays in z unless an axisType argument
	// selects a [YZ] plane

	// Friction velocity (get)
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	T dynamic_velocity(T* U_z,
		const T Umax, const T c_kinematic_viscosity, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// Gradients
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void c_gradient_z(T* Grad,		// node: [W]
		const T* const C,			// node: [C]

		const nse_const3d::axisType axis, const wstGrid3d< T >& grid);	// [axisZ || axisYZ]

	template< typename T >
	void w_gradient_z(T* Grad,		// node: [C]
		const T* const W,			// node: [W]

		const nse_const3d::axisType axis, const wstGrid3d< T >& grid);	// [axisZ || axisYZ]
	// -------------------------------------------------------------------------------------------- //

	// Momentum eq. balance
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void momentum_eq(
		T* momentum_balance,			// node: [W]
		T* turbulent_momentum_flux,		// node: [W]
		T* viscous_stress,				// node: [W]

		const T* const uw_flux,			// node: [W]
		const T* const U_grad,			// node: [W]
		const T c_kinematic_viscosity, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// TKE structure
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void TKE_structure(T* TKE,								// node: [C]
		T* u_TKE, T* v_TKE, T* w_TKE,						// node: [C]
		T* u_TKE_share, T* v_TKE_share, T* w_TKE_share,		// node: [C]

		const T* const U2,		// node: [C]
		const T* const V2,		// node: [C]
		const T* const W2,		// node: [C]
		const T* const U,		// node: [C]
		const T* const V,		// node: [C]
		const T* const W,		// node: [W]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// TKE anisotropy
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void TKE_anisotropy(
		T* TKE_aniso_uu, T* TKE_aniso_vv, T* TKE_aniso_ww,	// node: [C]
		T* TKE_aniso_uv, T* TKE_aniso_uw, T* TKE_aniso_vw,	// node: [C]

		const T* const TKE,			// node: [C]
		const T* const u_TKE,		// node: [C]
		const T* const v_TKE,		// node: [C]
		const T* const w_TKE,		// node: [C]
		const T* const uv_flux,		// node: [C]
		const T* const uw_flux,		// node: [W]
		const T* const vw_flux,		// node: [W]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// TKE production
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void u_TKE_production(T* _u_TKE_production,		// node: [C]

		const T* const UW_bottom,					// node: [W]
		const T* const UW_top,						// node: [W]
		const T* const U,							// node: [C]
		const T* const W,							// node: [W]
		const wstGrid3d< T >& grid);

	template< typename T >
	void v_TKE_production(T* _v_TKE_production,		// node: [C]

		const T* const VW_bottom,					// node: [W]
		const T* const VW_top,						// node: [W]
		const T* const V,							// node: [C]
		const T* const W,							// node: [W]
		const wstGrid3d< T >& grid);

	template< typename T >
	void w_TKE_production(T* _w_TKE_production,		// node: [C]

		const T* const W2_c,						// node: [C]
		const T* const W2_w,						// node: [W]
		const T* const W,							// node: [W]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //


	// TKE transport
	// -------------------------------------------------------------------------------------------- //
	template< typename T >	// -1/2 * d[u'u'w']/dz
	void u_TKE_transport(T* _u_TKE_transport,	// node: [C]

		const T* const uu_w_flux,				// node: [W]
		const wstGrid3d< T >& grid);

	template< typename T >	// -1/2 * d[v'v'w']/dz
	void v_TKE_transport(T* _v_TKE_transport,	// node: [C]

		const T* const vv_w_flux,				// node: [W]
		const wstGrid3d< T >& grid);

	template< typename T >	// -1/2 * d[w'w'w']/dz
	void w_TKE_transport(T* _w_TKE_transport,	// node: [C]

		const T* const ww_w_flux,				// node: [C]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// TKE pressure work
	// -------------------------------------------------------------------------------------------- //
	template< typename T >	// - d[p'w']/dz
	void w_TKE_pressure_work(T* _w_TKE_pressure_work,	// node: [C]

		const T* const pw_flux,							// node: [W]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// TKE exchange
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void w_TKE_exchange(T* _w_TKE_energy_exchange,	// node: [C]
		
		const T* const PSww,						// node: [C]
		const T* const Pressure,					// node: [C]
		const T* const W,							// node: [W]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// TKE dissipation
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void u_TKE_dissipation(T* _u_TKE_dissipation,	// node: [C]

		const T* const U_dissipation,				// node: [C]
		const T* const U,							// node: [C]
		const T c_kinematic_viscosity, const wstGrid3d< T >& grid);

	template< typename T >
	void v_TKE_dissipation(T* _v_TKE_dissipation,	// node: [C]

		const T* const V_dissipation,				// node: [C]
		const T* const V,							// node: [C]
		const T c_kinematic_viscosity, const wstGrid3d< T >& grid);

	template< typename T >
	void w_TKE_dissipation(T* _w_TKE_dissipation,	// node: [C]

		const T* const W_dissipation,				// node: [W]
		const T* const W,							// node: [W]
		const T c_kinematic_viscosity, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// TKE iso dissipation
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void u_TKE_iso_dissipation(T* _u_TKE_iso_dissipation,	// node: [C]

		const T* const U_iso_dissipation,					// node: [C]
		const T* const U,									// node: [C]
		const T c_kinematic_viscosity, const wstGrid3d< T >& grid);

	template< typename T >
	void v_TKE_iso_dissipation(T* _v_TKE_iso_dissipation,	// node: [C]

		const T* const V_iso_dissipation,					// node: [C]
		const T* const V,									// node: [C]
		const T c_kinematic_viscosity, const wstGrid3d< T >& grid);

	template< typename T >
	void w_TKE_iso_dissipation(T* _w_TKE_iso_dissipation,	// node: [C]

		const T* const W_iso_dissipation,					// node: [W]
		const T* const W,									// node: [W]
		const T c_kinematic_viscosity, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// TKE heat flux
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void w_TKE_heat_flux(T* _w_TKE_heat_flux,	// node: [C]

		const T* const Tw_flux,					// node: [W]
		const T c_Richardson, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //


	// ui'uj' flux budget: production
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void uv_production_shearU(T* _uv_production_shearU,		// node: [C]

		const T* const VW_bottom_uv,		// node: [~W-C]
		const T* const VW_top_uv,			// node: [~W-C]
		const T* const U,					// node: [C]
		const T* const V,					// node: [C]
		const T* const W,					// node: [W]
		const wstGrid3d< T >& grid);

	template< typename T >
	void uv_production_shearV(T* _uv_production_shearV,		// node: [C]

		const T* const UW_bottom_uv,		// node: [~W-C]
		const T* const UW_top_uv,			// node: [~W-C]
		const T* const U,					// node: [C]
		const T* const V,					// node: [C]
		const T* const W,					// node: [W]
		const wstGrid3d< T >& grid);


	template< typename T >
	void uw_production_shearU(T* _uw_production_shearU,		// node: [W]

		const T* const W2_u,				// node: [C]
		const T* const W2_uw,				// node: [W]
		const T* const U,					// node: [C]
		const T* const W,					// node: [W]
		const wstGrid3d< T >& grid);

	template< typename T >
	void uw_production_shearW(T* _uw_production_shearW,		// node: [W]

		const T* const UW_bottom_uw,		// node: [~C-W]
		const T* const UW_top_uw,			// node: [~C-W]
		const T* const U,					// node: [C]
		const T* const W,					// node: [W]
		const wstGrid3d< T >& grid);


	template< typename T >
	void vw_production_shearV(T* _vw_production_shearV,		// node: [W]

		const T* const W2_v,				// node: [C]
		const T* const W2_vw,				// node: [W]
		const T* const V,					// node: [C]
		const T* const W,					// node: [W]
		const wstGrid3d< T >& grid);

	template< typename T >
	void vw_production_shearW(T* _vw_production_shearW,		// node: [W]

		const T* const VW_bottom_vw,		// node: [~C-W]
		const T* const VW_top_vw,			// node: [~C-W]
		const T* const V,					// node: [C]
		const T* const W,					// node: [W]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// ui'uj' flux budget: transport
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void uv_transport(T* _uv_transport,		// node: [C]

		const T* const uvw_flux,			// node: [W]
		const wstGrid3d< T >& grid);

	template< typename T >
	void uw_transport(T* _uw_transport,		// node: [W]

		const T* const uww_flux,			// node: [C]
		const wstGrid3d< T >& grid);

	template< typename T >
	void vw_transport(T* _vw_transport,		// node: [W]

		const T* const vww_flux,			// node: [C]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// ui'uj' flux budget: pressure-strain correlations: 2 * p' * S'ij, i != j
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void uw_pressure_strain(
		T* P2Suw_turb,		// node: [W]
		T* P2Suw_turb_c,	// node: [C] (shifting [W] -> [C])

		const T* const P2Suw,		// node: [W]
		const T* const Pressure,	// node: [C]
		const T* const U,			// node: [C]
		const wstGrid3d< T >& grid);

	template< typename T >
	void vw_pressure_strain(
		T* P2Svw_turb,		// node: [W]
		T* P2Svw_turb_c,	// node: [C] (shifting [W] -> [C])

		const T* const P2Svw,		// node: [C]
		const T* const Pressure,	// node: [C]
		const T* const V,			// node: [C]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// ui'uj' flux budget: pressure work
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void uw_pressure_work(T* _uw_pressure_work,		// node: [W]

		const T* const pu_flux,						// node: [C]
		const wstGrid3d< T >& grid);

	template< typename T >
	void vw_pressure_work(T* _vw_pressure_work,		// node: [W]

		const T* const pv_flux,						// node: [C]
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// ui'uj' flux budget: dissipation
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void uv_dissipation(T* _uv_dissipation,		// node: [C]

		const T* const UV_dissipation,			// node: [C]
		const T* const U,						// node: [C]
		const T* const V,						// node: [C]
		const T c_kinematic_viscosity,
		const wstGrid3d< T >& grid);

	template< typename T >
	void uw_dissipation(T* _uw_dissipation,		// node: [W]

		const T* const UW_dissipation,			// node: [W]
		const T* const U,						// node: [C]
		const T* const W,						// node: [W]
		const T c_kinematic_viscosity,
		const wstGrid3d< T >& grid);

	template< typename T >
	void vw_dissipation(T* _vw_dissipation,		// node: [W]

		const T* const VW_dissipation,			// node: [W]
		const T* const V,						// node: [C]
		const T* const W,						// node: [W]
		const T c_kinematic_viscosity,
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// ui'uj' flux budget: iso dissipation
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void uv_iso_dissipation(T* _uv_iso_dissipation,		// node: [C]

		const T* const UV_iso_dissipation,				// node: [C]
		const T* const U,								// node: [C]
		const T* const V,								// node: [C]
		const T c_kinematic_viscosity,
		const wstGrid3d< T >& grid);

	template< typename T >
	void uw_iso_dissipation(T* _uw_iso_dissipation,		// node: [W]

		const T* const UW_iso_dissipation,				// node: [W]
		const T* const U,								// node: [C]
		const T* const W,								// node: [W]
		const T c_kinematic_viscosity,
		const wstGrid3d< T >& grid);

	template< typename T >
	void vw_iso_dissipation(T* _vw_iso_dissipation,		// node: [W]

		const T* const VW_iso_dissipation,				// node: [W]
		const T* const V,								// node: [C]
		const T* const W,								// node: [W]
		const T c_kinematic_viscosity,
		const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// ui'uj' flux budget: buoyancy
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void uw_buoyancy(T* _uw_buoyancy,	// node: [W]

		const T* const CU_uw,			// node: [W]
		const T* const C,				// node: [C]
		const T* const U,				// node: [C]
		const T c_Richardson, const wstGrid3d< T >& grid);

	template< typename T >
	void vw_buoyancy(T* _vw_buoyancy,	// node: [W]

		const T* const CV_vw,			// node: [W]
		const T* const C,				// node: [C]
		const T* const V,				// node: [C]
		const T c_Richardson, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //
}
+
+
+// Friction velocity (get)
+// -------------------------------------------------------------------------------------------- //
template< typename T >
T nse::dynamic_velocity(T* U_z,
	const T Umax, const T c_kinematic_viscosity, const wstGrid3d< T >& grid)
// friction (dynamic) velocity u* estimated from one-sided wall gradients of
// the mean profile U_z and averaged between the bottom and top walls;
// the wall velocities are taken as -Umax/2 (bottom) and +Umax/2 (top) --
// presumably a symmetric (Couette-type) setup, TODO confirm against solver
{
	T u_bottom = (T)0, u_top = (T)0;

	// only the lowest MPI rank in z owns the bottom wall value
	if (grid.mpi_com.rank_z == 0)
		u_bottom = sqrt((T) 2.0 * c_kinematic_viscosity *
		fabs(U_z[grid.gcz] + (T) 0.5 * Umax) * (T) 2.0 * grid.dzmi[grid.gcz]);

	// only the highest MPI rank in z owns the top wall value
	if (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1)
		u_top = sqrt((T) 2.0 * c_kinematic_viscosity *
		fabs((T) 0.5 * Umax - U_z[grid.nz - grid.gcz - 1]) * (T) 2.0 * grid.dzpi[grid.nz - grid.gcz - 1]);

	// project wrapper reducing both scalars with MPI_MAX across the z-ranks,
	// so every rank receives the wall estimates (zeros elsewhere drop out)
	mpi_allreduce(&u_bottom, &u_top, MPI_MAX, grid.mpi_com.comm);

	return (T) 0.5 * (u_bottom + u_top);
}
+// -------------------------------------------------------------------------------------------- //
+
+// Gradients
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::c_gradient_z(
+	T* Grad,					// node: [W]
+
+	const T* const C,			// node: [C]
+	const nse_const3d::axisType axis, const wstGrid3d< T >& grid)
+	//  __
+	//  dC
+	//  --
+	//	dz
+	//
+	// [C] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	if (axis == nse_const3d::axisZ) {
+
+		int k;
+#pragma omp parallel for private(k) shared(Grad)
+		for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+			Grad[k] = (C[k] - C[k - 1]) * (T) 2.0 * grid.dzmi[k];
+		}
+		return;
+	}
+	if (axis == nse_const3d::axisYZ) {
+
+		int j, k, idx;
+#pragma omp parallel for private(j, k, idx) shared(Grad)
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+				idx = j * grid.nz + k;
+				Grad[idx] = (C[idx] - C[idx - 1]) * (T) 2.0 * grid.dzmi[k];
+			}
+		return;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::w_gradient_z(
+	T* Grad,					// node: [C]
+
+	const T* const W,			// node: [W]
+	const nse_const3d::axisType axis, const wstGrid3d< T >& grid)
+	//  __
+	//  dW
+	//  --
+	//	dz
+	//
+	// [W] average has to be known at all [W] nodes, including walls
+{
+	if (axis == nse_const3d::axisZ) {
+
+		int k;
+#pragma omp parallel for private(k) shared(Grad)
+		for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+			Grad[k] = (W[k + 1] - W[k]) * grid.dzi[k];
+		}
+		return;
+	}
+	if (axis == nse_const3d::axisYZ) {
+
+		int j, k, idx;
+#pragma omp parallel for private(j, k, idx) shared(Grad)
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+				idx = j * grid.nz + k;
+				Grad[idx] = (W[idx + 1] - W[idx]) * grid.dzi[k];
+			}
+		return;
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// Momentum eq. balance
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::momentum_eq(
+	T* momentum_balance,			// node: [W]
+	T* turbulent_momentum_flux,		// node: [W]
+	T* viscous_stress,				// node: [W]
+
+	const T* const uw_flux,			// node: [W]
+	const T* const U_grad,			// node: [W]
+	const T c_kinematic_viscosity, const wstGrid3d< T >& grid)
+	// integrated [U] averaged momentum equation
+	//               _
+	//    ____   1  dU
+	//	- u'w' + -- -- = (u*)^2 
+	//			 Re dz
+	//    (1)      (2)
+	// (1) - turbulent momentum flux
+	// (2) - viscous stress
+	// terms are defined at [UW] nodes (averaged in [W] nodes)
+	//
+	// [u'w', dU/dz] have to be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(momentum_balance, \
+	turbulent_momentum_flux, viscous_stress)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) { // all [W] nodes
+		turbulent_momentum_flux[k] = -uw_flux[k];
+		viscous_stress[k] = c_kinematic_viscosity * U_grad[k];
+
+		momentum_balance[k] = turbulent_momentum_flux[k] + viscous_stress[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// TKE structure
+// -------------------------------------------------------------------------------------------- //
template< typename T >
void nse::TKE_structure(
	T* TKE,												// node: [C]
	T* u_TKE, T* v_TKE, T* w_TKE,						// node: [C]
	T* u_TKE_share, T* v_TKE_share, T* w_TKE_share,		// node: [C]
	
	const T* const U2,		// node: [C]
	const T* const V2,		// node: [C]
	const T* const W2,		// node: [C]
	const T* const U,		// node: [C]
	const T* const V,		// node: [C]
	const T* const W,		// node: [W]
	const wstGrid3d< T >& grid)
	//
	// 1/2*u'[i]*u'[i]
	//
	// [W] average has to be known at all [W] nodes, including walls
{
	int k;
#pragma omp parallel for private(k) shared(TKE, u_TKE, v_TKE, w_TKE, \
	u_TKE_share, v_TKE_share, w_TKE_share)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// per-component variance: <u*u> - <u>^2, halved
		u_TKE[k] = (T)0.5 * (U2[k] - U[k] * U[k]);
		v_TKE[k] = (T)0.5 * (V2[k] - V[k] * V[k]);
		// (mean w)^2 at the [C] node taken as the product of the two
		// adjacent [W]-node values (staggered-grid discretization choice)
		w_TKE[k] = (T)0.5 * (W2[k] - W[k] * W[k + 1]);

		TKE[k] = u_TKE[k] + v_TKE[k] + w_TKE[k];

		// component shares of TKE;
		// NOTE(review): no guard on TKE[k] == 0 -- shares become inf/nan
		// if the fluctuation energy vanishes (e.g. laminar state); confirm
		// callers only use this on developed-turbulence statistics
		u_TKE_share[k] = u_TKE[k] / TKE[k];
		v_TKE_share[k] = v_TKE[k] / TKE[k];
		w_TKE_share[k] = w_TKE[k] / TKE[k];
	}
}
+// -------------------------------------------------------------------------------------------- //
+
+// TKE anisotropy
+// -------------------------------------------------------------------------------------------- //
template< typename T >
void nse::TKE_anisotropy(
	T* TKE_aniso_uu, T* TKE_aniso_vv, T* TKE_aniso_ww,		// node: [C]
	T* TKE_aniso_uv, T* TKE_aniso_uw, T* TKE_aniso_vw,		// node: [C]

	const T* const TKE,		// node: [C]
	const T* const u_TKE,	// node: [C]
	const T* const v_TKE,	// node: [C]
	const T* const w_TKE,	// node: [C]
	const T* const uv_flux,	// node: [C]
	const T* const uw_flux,	// node: [W]
	const T* const vw_flux,	// node: [W]

	const wstGrid3d< T >& grid)
	//
	// TKE anisotropy symmetric zero-trace tensor:
	// [(u(i)u(j)) / (u(k)u(k))] - 1/3 * delta(i,j)
	//
	// [u'w', v'w'] fluxes have to be known at all [W] nodes, including walls
	//
	// NOTE(review): divides by TKE[k] with no zero guard (see TKE_structure)
{
	int k;
#pragma omp parallel for private(k) shared(TKE_aniso_uu, TKE_aniso_vv, TKE_aniso_ww, \
	TKE_aniso_uv, TKE_aniso_uw, TKE_aniso_vw)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {

		// diagonal: b(i,i) = u(i)_TKE / TKE - 1/3  (u(i)_TKE = <ui'ui'> / 2)
		TKE_aniso_uu[k] = (u_TKE[k] / TKE[k]) - ((T)1.0 / (T)3.0);
		TKE_aniso_vv[k] = (v_TKE[k] / TKE[k]) - ((T)1.0 / (T)3.0);
		TKE_aniso_ww[k] = (w_TKE[k] / TKE[k]) - ((T)1.0 / (T)3.0);

		// off-diagonal: b(i,j) = <ui'uj'> / (2*TKE);
		// [u'w'], [v'w'] are interpolated [W] -> [C] as a half-sum first,
		// which combines with the 1/2 into the factor 4
		TKE_aniso_uv[k] = (uv_flux[k] / ((T)2.0*TKE[k]));
		TKE_aniso_uw[k] = ((uw_flux[k] + uw_flux[k + 1]) / ((T)4.0*TKE[k]));
		TKE_aniso_vw[k] = ((vw_flux[k] + vw_flux[k + 1]) / ((T)4.0*TKE[k]));
	}
}
+// -------------------------------------------------------------------------------------------- //
+
+// TKE production
+// -------------------------------------------------------------------------------------------- //
template< typename T >
void nse::u_TKE_production(
	T* _u_TKE_production,		// node: [C]

	const T* const UW_bottom,	// node: [W]
	const T* const UW_top,		// node: [W]
	const T* const U,			// node: [C]
	const T* const W,			// node: [W]
	const wstGrid3d< T >& grid)
	//
	// [U] component of T.K.E. equation defined at [C] node via interpolation
	//            _
	//    ____   dU
	//	- u'w' * -- 
	//			 dz
	// [W] average has to be known at all [W] nodes, including walls
	// [U] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	int k;
#pragma omp parallel for private(k) shared(_u_TKE_production)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// turbulent flux u'w' = <uw> - <u><w> at each bounding [W] node,
		// each paired with the one-sided difference of U on its side of the
		// cell; half-sum interpolates the product to the [C] node
		_u_TKE_production[k] = -(T) 0.5 * (
			// discretization is based on ADV. form //
			// correct for SKEW.form (checked) //
			(UW_bottom[k] - U[k] * W[k]) * (U[k] - U[k - 1]) * grid.dzi[k] +
			(UW_top[k] - U[k] * W[k + 1]) * (U[k + 1] - U[k]) * grid.dzi[k]);
	}
}
+
template< typename T >
void nse::v_TKE_production(
	T* _v_TKE_production,		// node: [C]

	const T* const VW_bottom,	// node: [W]
	const T* const VW_top,		// node: [W]
	const T* const V,			// node: [C]
	const T* const W,			// node: [W]
	const wstGrid3d< T >& grid)
	//
	// [V] component of T.K.E. equation defined at [C] node via interpolation
	//            _
	//    ____   dV
	//	- v'w' * -- 
	//			 dz
	// [W] average has to be known at all [W] nodes, including walls
	// [V] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	int k;
#pragma omp parallel for private(k) shared(_v_TKE_production)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// mirror of u_TKE_production for the [V] component:
		// v'w' = <vw> - <v><w> at each bounding [W] node, times the
		// one-sided difference of V on that side, half-summed to [C]
		_v_TKE_production[k] = -(T) 0.5 * (
			// discretization is based on ADV. form //
			// correct for SKEW.form (checked) //
			(VW_bottom[k] - V[k] * W[k]) * (V[k] - V[k - 1]) * grid.dzi[k] +
			(VW_top[k] - V[k] * W[k + 1]) * (V[k + 1] - V[k]) * grid.dzi[k]);
	}
}
template< typename T >
void nse::w_TKE_production(
	T* _w_TKE_production,		// node: [C]

	const T* const W2_c,		// node: [C]
	const T* const W2_w,		// node: [W]
	const T* const W,			// node: [W]
	const wstGrid3d< T >& grid)
	//
	// [W] component of T.K.E. equation defined at [C] node via interpolation
	//            _
	//    ____   dW
	//	- w'w' * -- 
	//			 dz
	// [W] average has to be known at all [W] nodes, including walls
{
	// scratch line-buffer at [W] nodes, taken from the memStx pool
	T* w_production;
	int buf_id = memStx::get_buf(&w_production, grid.nz);

	int k;

	// computing at [W] nodes:
#pragma omp parallel for private(k) shared(w_production)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// w'w' assembled from the [W]-node (W2_w) and [C]-node (W2_c) second
		// moments minus the staggered products W[k]*W[k +/- 1], paired with
		// the one-sided difference of W above/below node k
		w_production[k] = -(
			(W2_w[k] + W2_c[k] - W[k] * W[k] - W[k] * W[k + 1]) * (W[k + 1] - W[k]) * grid.dzmih[k] +
			(W2_w[k] + W2_c[k - 1] - W[k] * W[k] - W[k] * W[k - 1]) * (W[k] - W[k - 1]) * grid.dzmih[k]);
	}

	// setting boundary conditions (production = 0 at the walls):
	w_dirichlet_bc_z(w_production, (T)0, (T)0, grid);

	// interpolation [W]->[C]:
#pragma omp parallel for private(k) shared(_w_TKE_production, w_production)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		_w_TKE_production[k] =
			(T) 0.5 * (w_production[k] + w_production[k + 1]);
	}

	memStx::free_buf(buf_id);
}
+// -------------------------------------------------------------------------------------------- //
+
+// TKE transport
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::u_TKE_transport(
+	T* _u_TKE_transport,		// node: [C]
+
+	const T* const uu_w_flux,	// node: [W]
+	const wstGrid3d< T >& grid)
+	// [U] transport term of T.K.E. equation defined at [C] node via interpolation
+	//       _____________________
+	//      d[(1/2) * (u')^2 * w'] 
+	//	-  ------------------------ 
+	//				dz
+	// [u'u'w'] flux has to be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(_u_TKE_transport)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_u_TKE_transport[k] = -(T) 0.5 *
+			(uu_w_flux[k + 1] - uu_w_flux[k]) * grid.dzi[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::v_TKE_transport(
+	T* _v_TKE_transport,		// node: [C]
+
+	const T* const vv_w_flux,	// node: [W]
+	const wstGrid3d< T >& grid)
+	// [V] transport term of T.K.E. equation defined at [C] node via interpolation
+	//       _____________________
+	//      d[(1/2) * (v')^2 * w'] 
+	//	-  ------------------------ 
+	//				dz
+	// [v'v'w'] flux has to be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(_v_TKE_transport)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_v_TKE_transport[k] = -(T) 0.5 *
+			(vv_w_flux[k + 1] - vv_w_flux[k]) * grid.dzi[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::w_TKE_transport(
+	T* _w_TKE_transport,		// node: [C]
+
+	const T* const ww_w_flux,	// node: [C]
+	const wstGrid3d< T >& grid)
+	// [W] transport component of T.K.E. equation defined at [C] node via interpolation
+	//       _____________________
+	//      d[(1/2) * (w')^2 * w'] 
+	//	-  ------------------------ 
+	//				dz
+	// [w'w'w'] flux has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_w_TKE_transport)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_w_TKE_transport[k] = -(T) 0.5 * (
+			(ww_w_flux[k + 1] - ww_w_flux[k]) * grid.dzpi[k] +
+			(ww_w_flux[k] - ww_w_flux[k - 1]) * grid.dzmi[k]);
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+
+// TKE pressure work
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::w_TKE_pressure_work(
+	T* _w_TKE_pressure_work,	// node: [C]
+
+	const T* const pw_flux,		// node: [W]
+	const wstGrid3d< T >& grid)
+	// [W] pressure work component of T.K.E. equation defined at [C] node via interpolation
+	//       ______
+	//      d[p'w'] 
+	//	-  -------- 
+	//		  dz
+	// [p'w'] flux has to be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(_w_TKE_pressure_work)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_w_TKE_pressure_work[k] = -(pw_flux[k + 1] - pw_flux[k]) * grid.dzi[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// TKE exchange
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::w_TKE_exchange(
+	T* _w_TKE_exchange,			// node: [C]
+
+	const T* const PSww,		// node: [C]
+	const T* const Pressure,	// node: [C]
+	const T* const W,			// node: [W]
+	const wstGrid3d< T >& grid)
+	// [W] energy exchange component of T.K.E. equation defined at [C] node via interpolation
+	// ______
+	//   dw'
+	// p'---
+	//   dz
+	// [W] average has to be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(_w_TKE_exchange)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_w_TKE_exchange[k] = PSww[k] - Pressure[k] * ((W[k + 1] - W[k]) * grid.dzi[k]);
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// TKE dissipation
+// -------------------------------------------------------------------------------------------- //
template< typename T >
void nse::u_TKE_dissipation(
	T* _u_TKE_dissipation,			// node: [C]

	const T* const U_dissipation,	// node: [C]
	const T* const U,				// node: [C]
	const T c_kinematic_viscosity, const wstGrid3d< T >& grid)
	// [U] dissipation component of T.K.E. defined at [C] node via interpolation
	//       ___________
	//   1      d^2(u')
	//	-- * u' ------- 
	//	Re		dx(j)^2
	// [U] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	int k;
#pragma omp parallel for private(k) shared(_u_TKE_dissipation)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) { 
		// subtract the mean-field part: <u lap(u)> - <U> * lap(<U>);
		// dzp2i/dzm2i are the upper/lower second-difference metrics of the
		// stretched grid -- TODO confirm exact definition in wstGrid3d
		_u_TKE_dissipation[k] = U_dissipation[k] -
			c_kinematic_viscosity * (U[k] *
			((U[k + 1] - U[k]) * grid.dzp2i[k] - (U[k] - U[k - 1]) * grid.dzm2i[k]));
	}
}
+// -------------------------------------------------------------------------------------------- //
+
template< typename T >
void nse::v_TKE_dissipation(
	T* _v_TKE_dissipation,			// node: [C]

	const T* const V_dissipation,	// node: [C]
	const T* const V,				// node: [C]
	const T c_kinematic_viscosity, const wstGrid3d< T >& grid)
	// [V] dissipation component of T.K.E. defined at [C] node via interpolation
	//       ___________
	//   1      d^2(v')
	//	-- * v' ------- 
	//	Re		dx(j)^2 
	// [V] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	int k;
#pragma omp parallel for private(k) shared(_v_TKE_dissipation)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// mirror of u_TKE_dissipation for the [V] component:
		// subtract the mean-field part <V> * lap(<V>) from the full moment
		_v_TKE_dissipation[k] = V_dissipation[k] -
			c_kinematic_viscosity * (V[k] *
			((V[k + 1] - V[k]) * grid.dzp2i[k] - (V[k] - V[k - 1]) * grid.dzm2i[k]));
	}
}
+// -------------------------------------------------------------------------------------------- //
+
template< typename T >
void nse::w_TKE_dissipation(
	T* _w_TKE_dissipation,			// node: [C]

	const T* const W_dissipation,	// node: [W]
	const T* const W,				// node: [W]
	const T c_kinematic_viscosity, const wstGrid3d< T >& grid)
	// [W] dissipation component of T.K.E. defined at [C] node via interpolation
	//       ___________
	//   1      d^2(w')
	//	-- * w' ------- 
	//	Re		dx(j)^2 
	// [W] average has to be known at all [W] nodes, including walls
	// *Note:
	//		computing dissipation at [W] nodes, setting 
	//		boundary conditions for turbulence dissipation (assuming = 0 at walls)
	//		and interpolating to [C] node
{
	// scratch line-buffer at [W] nodes, taken from the memStx pool
	T* w_diss;
	int buf_id = memStx::get_buf(&w_diss, grid.nz);

	int k;

	// computing at [W] nodes:
#pragma omp parallel for private(k) shared(w_diss)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// laplacian of W on the staggered [W] line; the metric pair
		// (dzm2i[k], dzp2i[k - 1]) is swapped/shifted w.r.t. the [C]-node
		// form used in u_TKE_dissipation -- same pattern as
		// w_TKE_iso_dissipation; verify against wstGrid3d metric definitions
		w_diss[k] = W_dissipation[k] -
			c_kinematic_viscosity * (W[k] *
			((W[k + 1] - W[k]) * grid.dzm2i[k] - (W[k] - W[k - 1]) * grid.dzp2i[k - 1]));
	}

	// setting boundary conditions (dissipation = 0 at walls):
	w_dirichlet_bc_z(w_diss, (T)0, (T)0, grid);

	// interpolation [W]->[C]:
#pragma omp parallel for private(k) shared(_w_TKE_dissipation, w_diss)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		_w_TKE_dissipation[k] = 
			(T) 0.5 * (w_diss[k] + w_diss[k + 1]);
	}

	memStx::free_buf(buf_id);
}
+// -------------------------------------------------------------------------------------------- //
+
+// TKE iso dissipation
+// -------------------------------------------------------------------------------------------- //
template< typename T >
void nse::u_TKE_iso_dissipation(
	T* _u_TKE_iso_dissipation,			// node: [C] node

	const T* const U_iso_dissipation,	// node: [C] node
	const T* const U,					// node: [C] node
	const T c_kinematic_viscosity, const wstGrid3d< T >& grid)
	// [U] isotropic dissipation component of T.K.E. equation defined at [C] node via interpolation
	//       _____________
	//   1   d(u')   d(u')
	//-	-- * ----- * -----
	//	Re	 dx(j)   dx(j)
	// [U] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	int k;
#pragma omp parallel for private(k) shared(_u_TKE_iso_dissipation)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// remove the mean-gradient part from the full squared-gradient moment:
		// (d<U>/dz)^2 built as the half-sum of the upper/lower one-sided
		// squared differences scaled by the dzp2i/dzm2i metrics;
		// leading minus gives the sink-term sign convention
		_u_TKE_iso_dissipation[k] = -(U_iso_dissipation[k] -
			c_kinematic_viscosity * (
			(T)0.5 * (U[k + 1] - U[k]) * (U[k + 1] - U[k]) * grid.dzp2i[k] +
			(T)0.5 * (U[k] - U[k - 1]) * (U[k] - U[k - 1]) * grid.dzm2i[k]));
	}
}
+// -------------------------------------------------------------------------------------------- //
+
template< typename T >
void nse::v_TKE_iso_dissipation(
	T* _v_TKE_iso_dissipation,			// node: [C] node

	const T* const V_iso_dissipation,	// node: [C] node
	const T* const V,					// node: [C] node
	const T c_kinematic_viscosity, const wstGrid3d< T >& grid)
	// [V] isotropic dissipation component of T.K.E. equation defined at [C] node via interpolation
	//       _____________
	//   1   d(v')   d(v')
	//-	-- * ----- * -----
	//	Re	 dx(j)   dx(j)
	// [V] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	int k;
#pragma omp parallel for private(k) shared(_v_TKE_iso_dissipation)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// mirror of u_TKE_iso_dissipation for the [V] component:
		// subtract the mean-gradient part (d<V>/dz)^2 from the full moment
		_v_TKE_iso_dissipation[k] = -(V_iso_dissipation[k] -
			c_kinematic_viscosity * (
			(T)0.5 * (V[k + 1] - V[k]) * (V[k + 1] - V[k]) * grid.dzp2i[k] +
			(T)0.5 * (V[k] - V[k - 1]) * (V[k] - V[k - 1]) * grid.dzm2i[k]));
	}
}
+// -------------------------------------------------------------------------------------------- //
+
template< typename T >
void nse::w_TKE_iso_dissipation(
	T* _w_TKE_iso_dissipation,			// node: [C]

	const T* const W_iso_dissipation,	// node: [W]
	const T* const W,					// node: [W]
	const T c_kinematic_viscosity, const wstGrid3d< T >& grid)
	// [W] isotropic dissipation component of T.K.E. equation defined at [C] node via interpolation
	//       _____________
	//   1   d(w')   d(w')
	//-	-- * ----- * -----
	//	Re	 dx(j)   dx(j)
	// [W] average has to be known at all [W] nodes, including walls
	// *Note:
	//		computing isotropic dissipation at [W] nodes, setting 
	//		boundary conditions for turbulence dissipation (assuming = 0 at walls)
	//		and interpolating to [C] node
{
	// scratch line-buffer at [W] nodes, taken from the memStx pool
	T* w_iso_diss;
	int buf_id = memStx::get_buf(&w_iso_diss, grid.nz);

	int k;

	// computing at [W] nodes:
#pragma omp parallel for private(k) shared(w_iso_diss)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// (d<W>/dz)^2 on the staggered [W] line; metric pair
		// (dzm2i[k], dzp2i[k - 1]) mirrors w_TKE_dissipation -- verify
		// against wstGrid3d metric definitions
		w_iso_diss[k] = -(W_iso_dissipation[k] -
			c_kinematic_viscosity * (
			(T)0.5 * (W[k + 1] - W[k]) * (W[k + 1] - W[k]) * grid.dzm2i[k] +
			(T)0.5 * (W[k] - W[k - 1]) * (W[k] - W[k - 1]) * grid.dzp2i[k - 1]));
	}

	// setting boundary conditions (dissipation = 0 at walls):
	w_dirichlet_bc_z(w_iso_diss, (T)0, (T)0, grid);

	// interpolation [W]->[C]:
#pragma omp parallel for private(k) shared(_w_TKE_iso_dissipation, w_iso_diss)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		_w_TKE_iso_dissipation[k] =
			(T) 0.5 * (w_iso_diss[k] + w_iso_diss[k + 1]);
	}

	memStx::free_buf(buf_id);
}
+// -------------------------------------------------------------------------------------------- //
+
+// TKE heat flux
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::w_TKE_heat_flux(
+	T* _w_TKE_heat_flux,		// node: [C]
+
+	const T* const Tw_flux,		// node: [W]
+	const T c_Richardson, const wstGrid3d< T >& grid)
+	// [W] heat flux component of T.K.E. equation defined at [C] node via interpolation
+	//        ____
+	//   Ri * w'T'
+	//	
+	// [T'W'] flux has to be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(_w_TKE_heat_flux)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++)
+		_w_TKE_heat_flux[k] = (T) 0.5 * c_Richardson * (Tw_flux[k] + Tw_flux[k + 1]);
+}
+// -------------------------------------------------------------------------------------------- //
+
+
+// ui'uj' flux budget: production
+// -------------------------------------------------------------------------------------------- //
template< typename T >
void nse::uv_production_shearU(T* _uv_production_shearU,		// node: [C]

	const T* const VW_bottom_uv,		// node: [~W-C]
	const T* const VW_top_uv,			// node: [~W-C]
	const T* const U,					// node: [C]
	const T* const V,					// node: [C]
	const T* const W,					// node: [W]
	const wstGrid3d< T >& grid)
	//
	// production [gradU] term of [u'v'] budget equation defined at [C] node
	//           _
	//   ____   dU
	// - v'w' * --
	//          dz
	//
	// [W] average has to be known at all [W] nodes, including walls
	// [U] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	int k;
#pragma omp parallel for private(k) shared(_uv_production_shearU)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {

		// v'w' = <vw> - <v><w> at the top/bottom cell faces, each paired with
		// the one-sided difference of U on its side; dzih is presumably the
		// half-weighted inverse spacing -- TODO confirm in wstGrid3d
		_uv_production_shearU[k] =
			-(VW_top_uv[k] - V[k] * W[k + 1]) * (U[k + 1] - U[k]) * grid.dzih[k]
			- (VW_bottom_uv[k] - V[k] * W[k]) * (U[k] - U[k - 1]) * grid.dzih[k];
	}
}
+
template< typename T >
void nse::uv_production_shearV(T* _uv_production_shearV,		// node: [C]

	const T* const UW_bottom_uv,		// node: [~W-C]
	const T* const UW_top_uv,			// node: [~W-C]
	const T* const U,					// node: [C]
	const T* const V,					// node: [C]
	const T* const W,					// node: [W]
	const wstGrid3d< T >& grid)
	//
	// production [gradV] term of [u'v'] budget equation defined at [C] node
	//           _
	//   ____   dV
	// - u'w' * --
	//          dz
	//
	// [W] average has to be known at all [W] nodes, including walls
	// [V] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	int k;
#pragma omp parallel for private(k) shared(_uv_production_shearV)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {

		// mirror of uv_production_shearU with the roles of U and V swapped:
		// u'w' at the top/bottom faces times one-sided differences of V
		_uv_production_shearV[k] =
			-(UW_top_uv[k] - U[k] * W[k + 1]) * (V[k + 1] - V[k]) * grid.dzih[k]
			- (UW_bottom_uv[k] - U[k] * W[k]) * (V[k] - V[k - 1]) * grid.dzih[k];
	}
}
+
+
template< typename T >
void nse::uw_production_shearU(
	T* _uw_production_shearU,		// node: [W]

	const T* const W2_u,			// node: [C]
	const T* const W2_uw,			// node: [W]
	const T* U,						// node: [C]
	const T* W,						// node: [W]
	const wstGrid3d< T >& grid)
	//
	// production [gradU] term of [u'w'] budget equation defined at [W] node
	//           _
	//   ____   dU
	// - w'w' * --
	//          dz
	//
	// [W] average has to be known at all [W] nodes, including walls
	// [U] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
	//
	// NOTE(review): reads U[k - 2] at k = gcz -- requires two valid layers
	// below the first interior node; confirm ghost-cell count is sufficient
{
	int k;
#pragma omp parallel for private(k) shared(_uw_production_shearU)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {	// all [W] nodes, excluding walls

		// w'w' assembled from the [C]-node (W2_u) and [W]-node (W2_uw) second
		// moments minus staggered W products; dziq is presumably a
		// quarter-weighted inverse spacing -- TODO confirm in wstGrid3d
		_uw_production_shearU[k] =
			-(W2_u[k] - W[k] * W[k + 1]) * (U[k + 1] - U[k]) * grid.dziq[k]
			- (W2_u[k - 1] - W[k] * W[k - 1]) * (U[k - 1] - U[k - 2]) * grid.dziq[k - 1]

			- (W2_uw[k] - W[k] * W[k]) * (U[k] - U[k - 1]) * (grid.dziq[k] + grid.dziq[k - 1]);
	}

	// zero production at the walls
	w_dirichlet_bc_z(_uw_production_shearU, (T)0, (T)0, grid);
}
+
template< typename T >
void nse::uw_production_shearW(
	T* _uw_production_shearW,			// node: [W]

	const T* const UW_bottom_uw,		// node: [~C-W]
	const T* const UW_top_uw,			// node: [~C-W]
	const T* const U,					// node: [C]
	const T* const W,					// node: [W]
	const wstGrid3d< T >& grid)
	//
	// production [gradW] term of [u'w'] budget equation defined at [W] node
	//           _
	//   ____   dW
	// - u'w' * --
	//          dz
	//
	// [W] average has to be known at all [W] nodes, including walls
{
	int k;
#pragma omp parallel for private(k) shared(_uw_production_shearW)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {	// all [W] nodes, excluding walls
		// u'w' = <uw> - <u><w> below/above node k (mean products built from
		// half-sums of the adjacent U and W values), each paired with the
		// one-sided difference of W on its side
		_uw_production_shearW[k] = -(
			// discretization is based on ADV. form //
			(UW_bottom_uw[k] - (T)0.25 * (U[k] + U[k - 1]) * (W[k] + W[k - 1])) * (W[k] - W[k - 1]) * grid.dzmi[k] +
			(UW_top_uw[k] - (T)0.25 * (U[k] + U[k - 1]) * (W[k] + W[k + 1])) * (W[k + 1] - W[k]) * grid.dzmi[k]);
	}

	// zero production at the walls
	w_dirichlet_bc_z(_uw_production_shearW, (T)0, (T)0, grid);
}
+
+
template< typename T >
void nse::vw_production_shearV(
	T* _vw_production_shearV,		// node: [W]

	const T* const W2_v,			// node: [C]
	const T* const W2_vw,			// node: [W]
	const T* V,						// node: [C]
	const T* W,						// node: [W]
	const wstGrid3d< T >& grid)
	//
	// production [gradV] term of [v'w'] budget equation defined at [W] node
	//           _
	//   ____   dV
	// - w'w' * --
	//          dz
	//
	// [W] average has to be known at all [W] nodes, including walls
	// [V] average has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
	//
	// NOTE(review): reads V[k - 2] at k = gcz -- requires two valid layers
	// below the first interior node (same pattern as uw_production_shearU)
{
	int k;
#pragma omp parallel for private(k) shared(_vw_production_shearV)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {	// all [W] nodes, excluding walls

		// mirror of uw_production_shearU with V in place of U
		_vw_production_shearV[k] =
			-(W2_v[k] - W[k] * W[k + 1]) * (V[k + 1] - V[k]) * grid.dziq[k]
			- (W2_v[k - 1] - W[k] * W[k - 1]) * (V[k - 1] - V[k - 2]) * grid.dziq[k - 1]

			- (W2_vw[k] - W[k] * W[k]) * (V[k] - V[k - 1]) * (grid.dziq[k] + grid.dziq[k - 1]);
	}

	// zero production at the walls
	w_dirichlet_bc_z(_vw_production_shearV, (T)0, (T)0, grid);
}
+
template< typename T >
void nse::vw_production_shearW(
	T* _vw_production_shearW,			// node: [W]

	const T* const VW_bottom_vw,		// node: [~C-W]
	const T* const VW_top_vw,			// node: [~C-W]
	const T* const V,					// node: [C]
	const T* const W,					// node: [W]
	const wstGrid3d< T >& grid)
	//
	// production [gradW] term of [v'w'] budget equation defined at [W] node
	//           _
	//   ____   dW
	// - v'w' * --
	//          dz
	//
	// [W] average has to be known at all [W] nodes, including walls
{
	int k;
#pragma omp parallel for private(k) shared(_vw_production_shearW)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {	// all [W] nodes, excluding walls
		// mirror of uw_production_shearW with V in place of U:
		// v'w' below/above node k times the one-sided difference of W
		_vw_production_shearW[k] = -(
			// discretization is based on ADV. form //
			(VW_bottom_vw[k] - (T)0.25 * (V[k] + V[k - 1]) * (W[k] + W[k - 1])) * (W[k] - W[k - 1]) * grid.dzmi[k] +
			(VW_top_vw[k] - (T)0.25 * (V[k] + V[k - 1]) * (W[k] + W[k + 1])) * (W[k + 1] - W[k]) * grid.dzmi[k]);
	}

	// zero production at the walls
	w_dirichlet_bc_z(_vw_production_shearW, (T)0, (T)0, grid);
}
+// -------------------------------------------------------------------------------------------- //
+
+// ui'uj' flux budget: transport
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::uv_transport(
+	T* _uv_transport,			// node: [C]
+
+	const T* const uvw_flux,	// node: [W]
+	const wstGrid3d< T >& grid)
+	//
+	// transport term of [u'v'] budget equation defined at [C] node
+	//       ______________
+	//      d[(u'v') * w'] 
+	//	-  ---------------- 
+	//			 dz
+	// [u'v'w'] flux has be known at all [W] nodes, including walls
+{
+	int k;
+#pragma omp parallel for private(k) shared(_uv_transport)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_uv_transport[k] = -(uvw_flux[k + 1] - uvw_flux[k]) * grid.dzi[k];
+	}
+}
+
+
+template< typename T >
+void nse::uw_transport(
+	T* _uw_transport,			// node: [W]
+
+	const T* const uww_flux,	// node: [C]
+	const wstGrid3d< T >& grid)
+	//
+	// transport term of [u'w'] budget equation defined at [W] node
+	//       ______________
+	//      d[(u'w') * w'] 
+	//	-  ---------------- 
+	//			 dz
+	// [u'w'w'] flux has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_uw_transport)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_uw_transport[k] = -(uww_flux[k] - uww_flux[k - 1]) * (T)2.0 * grid.dzmi[k];
+	}
+}
+
+
+template< typename T >
+void nse::vw_transport(
+	T* _vw_transport,			// node: [W]
+
+	const T* const vww_flux,	// node: [C]
+	const wstGrid3d< T >& grid)
+	//
+	// transport term of [v'w'] budget equation defined at [W] node
+	//       ______________
+	//      d[(v'w') * w'] 
+	//	-  ---------------- 
+	//			 dz
+	// [v'w'w'] flux has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_vw_transport)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_vw_transport[k] = -(vww_flux[k] - vww_flux[k - 1]) * (T)2.0 * grid.dzmi[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// pressure-strain correlations: 2 * p' * S'ij, i != j
+// -------------------------------------------------------------------------------------------- //
template< typename T >
void nse::uw_pressure_strain(
	T* P2Suw_turb,				// node: [W]
	T* P2Suw_turb_c,			// node: [C] (shifting [W] -> [C])
	
	const T* const P2Suw,		// node: [W]
	const T* const Pressure,	// node: [C]
	const T* const U,			// node: [C]
	const wstGrid3d< T >& grid)
	//
	// pressure-strain term of [u'w'] budget equation defined at [W] node
	//
	// Qij = 2 * p'S'ij, i != j
	//
	// [U, P] averages have be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
	// [Quw] average has to be known at all [W] nodes including walls
{
	int k;
#pragma omp parallel for private(k) shared(P2Suw_turb)
	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
		// remove the mean-field part of Quw at the [W] node; <P> is
		// interpolated [C] -> [W] via (P[k] + P[k-1]) -- the interpolation
		// 1/2 and the factor 2 of Qij presumably cancel; verify against the
		// P2Suw accumulation
		P2Suw_turb[k] = P2Suw[k] -
			(Pressure[k] + Pressure[k - 1]) * ((U[k] - U[k - 1]) * grid.dzmi[k]);
	}

	// interpolation [W] -> [C] (interior [C] nodes only):
#pragma omp parallel for private(k) shared(P2Suw_turb_c, P2Suw_turb)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		P2Suw_turb_c[k] = (T)0.5 * (P2Suw_turb[k] + P2Suw_turb[k + 1]);
	}
}
+
template< typename T >
void nse::vw_pressure_strain(
	T* P2Svw_turb,				// node: [W]
	T* P2Svw_turb_c,			// node: [C] (shifting [W] -> [C])

	const T* const P2Svw,		// node: [W]
	const T* const Pressure,	// node: [C]
	const T* const V,			// node: [C]
	const wstGrid3d< T >& grid)
	//
	// pressure-strain term of [v'w'] budget equation defined at [W] node
	//
	// Qij = 2 * p'S'ij, i != j
	//
	// [V, P] averages have be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
	// [Qvw] average has to be known at all [W] nodes including walls
{
	int k;
#pragma omp parallel for private(k) shared(P2Svw_turb)
	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
		// mirror of uw_pressure_strain with V in place of U:
		// remove the mean-field part of Qvw at the [W] node
		P2Svw_turb[k] = P2Svw[k] -
			(Pressure[k] + Pressure[k - 1]) * ((V[k] - V[k - 1]) * grid.dzmi[k]);
	}

	// interpolation [W] -> [C] (interior [C] nodes only):
#pragma omp parallel for private(k) shared(P2Svw_turb_c, P2Svw_turb)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		P2Svw_turb_c[k] = (T)0.5 * (P2Svw_turb[k] + P2Svw_turb[k + 1]);
	}
}
+// -------------------------------------------------------------------------------------------- //
+
+// ui'uj' flux budget: pressure work
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::uw_pressure_work(
+	T* _uw_pressure_work,		// node: [W]
+
+	const T* const pu_flux,		// node: [C]
+	const wstGrid3d< T >& grid)
+	//
+	// pressure work term of [u'w'] budget equation defined at [W] node
+	//    ____
+	//  d(p'u')
+	// --------
+	//    dz
+	//
+	// [p'u'] flux has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_uw_pressure_work)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+		_uw_pressure_work[k] = -(pu_flux[k] - pu_flux[k - 1]) * (T)2.0 * grid.dzmi[k];
+	}
+}
+
+
+template< typename T >
+void nse::vw_pressure_work(
+	T* _vw_pressure_work,		// node: [W]
+
+	const T* const pv_flux,		// node: [C]
+	const wstGrid3d< T >& grid)
+	//
+	// pressure work term of [v'w'] budget equation defined at [W] node
+	//    ____
+	//  d(p'v')
+	// --------
+	//    dz
+	//
+	// [p'v'] flux has to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_vw_pressure_work)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+		_vw_pressure_work[k] = -(pv_flux[k] - pv_flux[k - 1]) * (T)2.0 * grid.dzmi[k];
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+// ui'uj' flux budget: dissipation
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::uv_dissipation(
+	T* _uv_dissipation,					// node: [C]
+
+	const T* const UV_dissipation,		// node: [C]
+	const T* const U,					// node: [C]
+	const T* const V,					// node: [C]
+	const T c_kinematic_viscosity,
+	const wstGrid3d< T >& grid)
+	//
+	// dissipation term of [u'v'] budget equation defined at [C] node
+	//       ___________       ___________
+	//          d^2(u')           d^2(v')
+	//	nu * v' ------- + nu * u' ------- 
+	//			dx(j)^2           dx(j)^2
+	//
+	// [U, V] average have be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	T U_diffusion, V_diffusion;
+	int k;
+
+#pragma omp parallel for private(k, U_diffusion, V_diffusion) shared(_uv_dissipation)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+
+		U_diffusion = c_kinematic_viscosity *
+			((U[k + 1] - U[k]) * grid.dzp2i[k] - (U[k] - U[k - 1]) * grid.dzm2i[k]);
+		V_diffusion = c_kinematic_viscosity *
+			((V[k + 1] - V[k]) * grid.dzp2i[k] - (V[k] - V[k - 1]) * grid.dzm2i[k]);
+
+		_uv_dissipation[k] = UV_dissipation[k] -
+			U[k] * V_diffusion - V[k] * U_diffusion;
+	}
+}
+
+
template< typename T >
void nse::uw_dissipation(
	T* _uw_dissipation,					// node: [W]

	const T* const UW_dissipation,		// node: [W]
	const T* const U,					// node: [C]
	const T* const W,					// node: [W]
	const T c_kinematic_viscosity,
	const wstGrid3d< T >& grid)
	//
	// dissipation term of [u'w'] budget equation defined at [W] node
	//       ___________       ___________
	//          d^2(u')           d^2(w')
	//	nu * w' ------- + nu * u' ------- 
	//			dx(j)^2           dx(j)^2
	//
	// [UW-dissipation, W] averages have to be known at all [W] nodes, including walls
	// [U] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	// scratch buffers: nu * laplace(<U>) at [C] nodes, nu * laplace(<W>) at [W] nodes
	T *U_diffusion, *W_diffusion;
	int u_buf_id = memStx::get_buf(&U_diffusion, grid.nz);
	int w_buf_id = memStx::get_buf(&W_diffusion, grid.nz);

	null(U_diffusion, grid.nz);
	null(W_diffusion, grid.nz);

	int k;

#pragma omp parallel for private(k) shared(U_diffusion, W_diffusion)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		U_diffusion[k] = c_kinematic_viscosity *
			((U[k + 1] - U[k]) * grid.dzp2i[k] - (U[k] - U[k - 1]) * grid.dzm2i[k]);
		// NOTE(review): W laplacian uses dzm2i[k] / dzp2i[k - 1] — presumably the
		// staggered [W]-node metrics (mirrored vs. the [C]-node U stencil); confirm
		W_diffusion[k] = c_kinematic_viscosity * 
			((W[k + 1] - W[k]) * grid.dzm2i[k] - (W[k] - W[k - 1]) * grid.dzp2i[k - 1]);
	}

	// assuming W = 0 & ~linear[laplace(W)] = 0 at walls -> no need for laplace(U) b.c.
	//
	w_dirichlet_bc_z(W_diffusion, (T)0, (T)0, grid);

#pragma omp parallel for private(k) shared(_uw_dissipation,\
	U_diffusion, W_diffusion)
	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
		// subtract the mean-field part; U and laplace(U) are interpolated [C] -> [W]
		_uw_dissipation[k] = UW_dissipation[k] -
			(T)0.5 * (
			(U[k] + U[k - 1]) * W_diffusion[k] +
			W[k] * (U_diffusion[k] + U_diffusion[k - 1]));
	}

	memStx::free_buf(u_buf_id);
	memStx::free_buf(w_buf_id);
}
+
+
template< typename T >
void nse::vw_dissipation(
	T* _vw_dissipation,					// node: [W]

	const T* const VW_dissipation,		// node: [W]
	const T* const V,					// node: [C]
	const T* const W,					// node: [W]
	const T c_kinematic_viscosity,
	const wstGrid3d< T >& grid)
	//
	// dissipation term of [v'w'] budget equation defined at [W] node
	//       ___________       ___________
	//          d^2(v')           d^2(w')
	//	nu * w' ------- + nu * v' ------- 
	//			dx(j)^2           dx(j)^2
	//
	// [VW-dissipation, W] averages have to be known at all [W] nodes, including walls
	// [V] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	// scratch buffers: nu * laplace(<V>) at [C] nodes, nu * laplace(<W>) at [W] nodes
	T *V_diffusion, *W_diffusion;
	int v_buf_id = memStx::get_buf(&V_diffusion, grid.nz);
	int w_buf_id = memStx::get_buf(&W_diffusion, grid.nz);

	null(V_diffusion, grid.nz);
	null(W_diffusion, grid.nz);

	int k;

#pragma omp parallel for private(k) shared(V_diffusion, W_diffusion)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		V_diffusion[k] = c_kinematic_viscosity *
			((V[k + 1] - V[k]) * grid.dzp2i[k] - (V[k] - V[k - 1]) * grid.dzm2i[k]);
		// NOTE(review): W laplacian uses dzm2i[k] / dzp2i[k - 1] — presumably the
		// staggered [W]-node metrics (same stencil as in uw_dissipation); confirm
		W_diffusion[k] = c_kinematic_viscosity *
			((W[k + 1] - W[k]) * grid.dzm2i[k] - (W[k] - W[k - 1]) * grid.dzp2i[k - 1]);
	}

	// assuming W = 0 & ~linear[laplace(W)] = 0 at walls -> no need for laplace(U) b.c.
	//
	w_dirichlet_bc_z(W_diffusion, (T)0, (T)0, grid);

#pragma omp parallel for private(k) shared(_vw_dissipation,\
	V_diffusion, W_diffusion)
	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
		// subtract the mean-field part; V and laplace(V) are interpolated [C] -> [W]
		_vw_dissipation[k] = VW_dissipation[k] -
			(T)0.5 * (
			(V[k] + V[k - 1]) * W_diffusion[k] +
			W[k] * (V_diffusion[k] + V_diffusion[k - 1]));
	}

	memStx::free_buf(v_buf_id);
	memStx::free_buf(w_buf_id);
}
+// -------------------------------------------------------------------------------------------- //
+
+// ui'uj' flux budget: iso dissipation
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::uv_iso_dissipation(
+	T* _uv_iso_dissipation,				// node: [C]
+
+	const T* const UV_iso_dissipation,	// node: [C]
+	const T* const U,					// node: [C]
+	const T* const V,					// node: [C]
+	const T c_kinematic_viscosity,
+	const wstGrid3d< T >& grid)
+	//
+	// isotropic dissipation term of [u'v'] budget equation defined at [C] node
+	//       _____________
+	//   2   d(u')   d(v')
+	//-	-- * ----- * -----
+	//	Re	 dx(j)   dx(j)
+	//
+	// *Note:
+	//		computing isotropic dissipation of average at [W] nodes, including walls,
+	//		and interpolating to [C] node
+	//
+	// [U, V] averages have be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	T *avgz;
+	int buf_id = memStx::get_buf(&avgz, grid.nz);
+
+	null(avgz, grid.nz);
+
+	int k;
+
+	// computing at [W] nodes
+#pragma omp parallel for private(k) shared(avgz)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes, including walls
+		avgz[k] = (T)2.0 * c_kinematic_viscosity *
+			((U[k] - U[k - 1]) * (T)2.0 * grid.dzmi[k]) *
+			((V[k] - V[k - 1]) * (T)2.0 * grid.dzmi[k]);
+	}
+
+	// averaging: [W] -> [C]
+#pragma omp parallel for private(k) shared(_uv_iso_dissipation, avgz)
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+		_uv_iso_dissipation[k] = -(UV_iso_dissipation[k] - (T)0.5 * (avgz[k] + avgz[k + 1]));
+	}
+
+	memStx::free_buf(buf_id);
+}
+
+
template< typename T >
void nse::uw_iso_dissipation(
	T* _uw_iso_dissipation,				// node: [W]

	const T* const UW_iso_dissipation,	// node: [W]
	const T* const U,					// node: [C]
	const T* const W,					// node: [W]
	const T c_kinematic_viscosity,
	const wstGrid3d< T >& grid)
	//
	// isotropic dissipation term of [u'w'] budget equation defined at [W] node
	//       _____________
	//   2   d(u')   d(w')
	//-	-- * ----- * -----
	//	Re	 dx(j)   dx(j)
	//
	// *Note:
	//		computing isotropic dissipation of average at [C] nodes, setting 
	//		boundary conditions (assuming = 0 at walls)
	//		and interpolating to [W] node
	//
	// [UW-iso-dissipation, W] averages have to be known at all [W] nodes, including walls
	// [U] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	// scratch buffer: 2 * nu * d<U>/dz * d<W>/dz at [C] nodes
	T *avgz;
	int buf_id = memStx::get_buf(&avgz, grid.nz);

	null(avgz, grid.nz);

	int k;

#pragma omp parallel for private(k) shared(avgz)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// d<U>/dz via the two one-sided differences (dzpi/dzmi metrics — presumably
		// half-weighted on the stretched grid; confirm against wstGrid3d),
		// d<W>/dz via the central difference of the bracketing [W] values
		avgz[k] = (T)2.0 * c_kinematic_viscosity *
			((U[k + 1] - U[k]) * grid.dzpi[k] + (U[k] - U[k - 1]) * grid.dzmi[k]) *
			((W[k + 1] - W[k]) * grid.dzi[k]);
	}

	// assuming W = 0 & ~linear[laplace(W)] = 0 at walls -> dirichlet b.c.
	//
	c_dirichlet_bc_z(avgz, (T)0, (T)0, grid);

#pragma omp parallel for private(k) shared(_uw_iso_dissipation, avgz)
	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
		// subtract mean-gradient product, interpolated [C] -> [W]
		_uw_iso_dissipation[k] = -(UW_iso_dissipation[k] - (T)0.5 * (avgz[k] + avgz[k - 1]));
	}

	memStx::free_buf(buf_id);
}
+
template< typename T >
void nse::vw_iso_dissipation(
	T* _vw_iso_dissipation,				// node: [W]

	const T* const VW_iso_dissipation,	// node: [W]
	const T* const V,					// node: [C]
	const T* const W,					// node: [W]
	const T c_kinematic_viscosity,
	const wstGrid3d< T >& grid)
	//
	// isotropic dissipation term of [v'w'] budget equation defined at [W] node
	//       _____________
	//   2   d(v')   d(w')
	//-	-- * ----- * -----
	//	Re	 dx(j)   dx(j)
	//
	// *Note:
	//		computing isotropic dissipation of average at [C] nodes, setting 
	//		boundary conditions (assuming = 0 at walls)
	//		and interpolating to [W] node
	//
	// [VW-iso-dissipation, W] averages have to be known at all [W] nodes, including walls
	// [V] average has be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
{
	// scratch buffer: 2 * nu * d<V>/dz * d<W>/dz at [C] nodes
	T *avgz;
	int buf_id = memStx::get_buf(&avgz, grid.nz);

	null(avgz, grid.nz);

	int k;

#pragma omp parallel for private(k) shared(avgz)
	for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
		// same stencil as uw_iso_dissipation, with V in place of U
		avgz[k] = (T)2.0 * c_kinematic_viscosity *
			((V[k + 1] - V[k]) * grid.dzpi[k] + (V[k] - V[k - 1]) * grid.dzmi[k]) *
			((W[k + 1] - W[k]) * grid.dzi[k]);
	}

	// assuming W = 0 & ~linear[laplace(W)] = 0 at walls -> dirichlet b.c.
	//
	c_dirichlet_bc_z(avgz, (T)0, (T)0, grid);

#pragma omp parallel for private(k) shared(_vw_iso_dissipation, avgz)
	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
		// subtract mean-gradient product, interpolated [C] -> [W]
		_vw_iso_dissipation[k] = -(VW_iso_dissipation[k] - (T)0.5 * (avgz[k] + avgz[k - 1]));
	}

	memStx::free_buf(buf_id);
}
+// -------------------------------------------------------------------------------------------- //
+
+// ui'uj' flux budget: buoyancy
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::uw_buoyancy(
+	T* _uw_buoyancy,			// node: [W]
+
+	const T* const CU_uw,		// node: [W]
+	const T* const C,			// node: [C]
+	const T* const U,			// node: [C]
+	const T c_Richardson, const wstGrid3d< T >& grid)
+	//
+	// buoyancy term of [u'w'] budget equation defined at [W] node
+	//      ____
+	// Ri * u'c'
+	//
+	// [CU] average has to be known at all [W] nodes, including walls
+	// [C, U] averages have to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_uw_buoyancy)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+		_uw_buoyancy[k] = c_Richardson * (
+			CU_uw[k] - (T)0.25 * (C[k] + C[k - 1]) * (U[k] + U[k - 1]));
+	}
+}
+
+
+template< typename T >
+void nse::vw_buoyancy(
+	T* _vw_buoyancy,			// node: [W]
+
+	const T* const CV_vw,		// node: [W]
+	const T* const C,			// node: [C]
+	const T* const V,			// node: [C]
+	const T c_Richardson, const wstGrid3d< T >& grid)
+	//
+	// buoyancy term of [v'w'] budget equation defined at [W] node
+	//      ____
+	// Ri * v'c'
+	//
+	// [CV] average has to be known at all [W] nodes, including walls
+	// [C, V] averages have to be known at all [C] nodes and ghost nodes: (k + 1/2), (k - 1/2)
+{
+	int k;
+#pragma omp parallel for private(k) shared(_vw_buoyancy)
+	for (k = grid.gcz; k <= grid.nz - grid.gcz; k++) {	// all [W] nodes
+		_vw_buoyancy[k] = c_Richardson * (
+			CV_vw[k] - (T)0.25 * (C[k] + C[k - 1]) * (V[k] + V[k - 1]));
+	}
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/cart-sys3d.h b/cart-sys3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e2b8c592183ae878967124063083a93684460bc
--- /dev/null
+++ b/cart-sys3d.h
@@ -0,0 +1,98 @@
+#pragma once
+
+// [cart-sys3d.h]: 3D cartesian axis type
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include <string.h>
+
namespace nse
{
	namespace nse_const3d {
		// axis selector: single directions, coordinate planes and full 3D
		enum axisType {
			axisX = 0, axisY = 1, axisZ = 2,
			axisXY = 3, axisXZ = 4, axisYZ = 5, axisXYZ = 6
		};
	}

	// string tag ("X", "XY", ...) for an axis value; "UNDEF" for unknown values
	const char* axis_name(const nse_const3d::axisType axis);

	// inverse of axis_name(): parse (*axis) from a tag; false on unknown name
	bool axis_value(nse_const3d::axisType *axis, const char* name);

	bool is_axis_1d(const nse_const3d::axisType axis);	// X, Y or Z
	bool is_axis_2d(const nse_const3d::axisType axis);	// XY, XZ or YZ
	bool is_axis_3d(const nse_const3d::axisType axis);	// XYZ

	namespace nse_const3d {
		// domain boundary faces: west/east = -x/+x, south/north = -y/+y,
		// bottom/top = -z/+z
		enum domainSideType {
			westSide = 0, eastSide = 1,
			southSide = 2, northSide = 3,
			bottomSide = 4, topSide = 5
		};
	}
}
+
+// Implementation:
+// -------------------------------------------------------------------------------------------- //
+inline const char* nse::axis_name(const nse_const3d::axisType axis) {
+	if (axis == nse_const3d::axisX) return "X";
+	if (axis == nse_const3d::axisY) return "Y";
+	if (axis == nse_const3d::axisZ) return "Z";
+	if (axis == nse_const3d::axisXY) return "XY";
+	if (axis == nse_const3d::axisXZ) return "XZ";
+	if (axis == nse_const3d::axisYZ) return "YZ";
+	if (axis == nse_const3d::axisXYZ) return "XYZ";
+	return "UNDEF";
+}
+
+inline bool nse::axis_value(nse_const3d::axisType *axis, const char* name) 
+{
+	if (!strcmp(name, axis_name(nse_const3d::axisX))) {
+		(*axis) = nse_const3d::axisX;
+		return true;
+	}
+	if (!strcmp(name, axis_name(nse_const3d::axisY))) {
+		(*axis) = nse_const3d::axisY;
+		return true;
+	}
+	if (!strcmp(name, axis_name(nse_const3d::axisZ))) {
+		(*axis) = nse_const3d::axisZ;
+		return true;
+	}
+	if (!strcmp(name, axis_name(nse_const3d::axisXY))) {
+		(*axis) = nse_const3d::axisXY;
+		return true;
+	}
+	if (!strcmp(name, axis_name(nse_const3d::axisXZ))) {
+		(*axis) = nse_const3d::axisXZ;
+		return true;
+	}
+	if (!strcmp(name, axis_name(nse_const3d::axisYZ))) {
+		(*axis) = nse_const3d::axisYZ;
+		return true;
+	}
+	if (!strcmp(name, axis_name(nse_const3d::axisXYZ))) {
+		(*axis) = nse_const3d::axisXYZ;
+		return true;
+	}
+
+	return false;
+}
+
+inline bool nse::is_axis_1d(const nse_const3d::axisType axis)
+{
+	return ((axis == nse_const3d::axisX) ||
+		(axis == nse_const3d::axisY) || (axis == nse_const3d::axisZ));
+}
+
+inline bool nse::is_axis_2d(const nse_const3d::axisType axis)
+{
+	return ((axis == nse_const3d::axisXY) ||
+		(axis == nse_const3d::axisXZ) || (axis == nse_const3d::axisYZ));
+}
+
+inline bool nse::is_axis_3d(const nse_const3d::axisType axis)
+{
+	return (axis == nse_const3d::axisXYZ);
+}
+
diff --git a/cfg-var.cpp b/cfg-var.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7c48cac2804f4494b6beb6587f2a201021533913
--- /dev/null
+++ b/cfg-var.cpp
@@ -0,0 +1,464 @@
+#include "cfg-var.h"
+#include "str-com.h"
+
+#define _CRT_SECURE_NO_DEPRECATE
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+
+// CPP 
namespace nse {

	// a = b, b = a
	template< typename T >
	void swap_vars(T& a, T& b) {
		T c = a; a = b; b = c;
	}

	// = x^p (integer power, exponentiation by squaring)
	//
	// negative exponents now yield the truncated integer result
	// (|x| > 1 -> 0, x = 1 -> 1, x = -1 -> +-1); the previous recursive
	// version returned garbage for p < 0 (e.g. ipow(2, -1) == 2)
	int ipow(const int x, const int p)
	{
		if (p < 0) {
			if (x == 1) return 1;
			if (x == -1) return (p % 2 == 0) ? 1 : -1;
			return 0;	// truncated 1 / x^|p| for |x| > 1 (x = 0: undefined, keep 0)
		}

		int base = x, e = p, result = 1;
		while (e > 0) {
			if (e & 1) result *= base;
			e >>= 1;
			if (e > 0) base *= base;	// avoid one overflow-prone square at the end
		}
		return result;
	}
}
+
+
+// 
+// Implementation: cfgVariable
+//
// default: an unnamed, undefined variable; the name is always a valid string
nse::cfgVariable::cfgVariable(
) :

	type(IS_UNDEF),
	eint(0), edouble((double)0), estring(NULL), ebool(false)
{
	name_memsize = 1;
	name = new char[name_memsize];
	*name = '\0';
}

// named variable: type and payload are deduced from the textual value (see set())
nse::cfgVariable::cfgVariable(
	const char* ex_name,
	const char* ex_value) :

	type(IS_UNDEF),
	eint(0), edouble((double)0), estring(NULL), ebool(false)
{
	name_memsize = 1;
	name = new char[name_memsize];
	*name = '\0';

	set(ex_name, ex_value);
}

// unnamed variable: value only, the name stays the empty string
nse::cfgVariable::cfgVariable(
	const char* ex_value) :

	type(IS_UNDEF),
	eint(0), edouble((double)0), estring(NULL), ebool(false)
{
	name_memsize = 1;
	name = new char[name_memsize];
	*name = '\0';

	set(NULL, ex_value);
}
+
+
nse::cfgVariable::~cfgVariable()
{
	clear();		// releases the string payload (if any)
	delete[] name;
}
+
+nse::cfgVariable::cfgVariable(const cfgVariable& var)
+{
+	type = var.type;
+
+	name_memsize = strlen(var.name) + 1;
+	name = new char[name_memsize];
+	strcpy(name, var.name);
+
+	if (type == IS_INT) eint = var.eint;
+	else
+		if (type == IS_DOUBLE) edouble = var.edouble;
+		else
+			if (type == IS_STRING) {
+				estring = new char[strlen(var.estring) + 1];
+				strcpy(estring, var.estring);
+			}
+			else
+				if (type == IS_BOOLEAN) ebool = var.ebool;
+}
+
// copy-and-swap assignment: 'var' is a by-value copy, so swapping with it
// is exception-safe and handles self-assignment for free
const nse::cfgVariable&
nse::cfgVariable::operator=(cfgVariable var)
{
	swap(var);
	return (*this);
}

// member-wise swap; note: all payload members are swapped regardless of type
void nse::cfgVariable::swap(
	cfgVariable& var)
{
	nse::swap_vars(type, var.type);
	nse::swap_vars(name, var.name);
	nse::swap_vars(eint, var.eint);
	nse::swap_vars(edouble, var.edouble);
	nse::swap_vars(estring, var.estring);
	nse::swap_vars(ebool, var.ebool);

	nse::swap_vars(name_memsize, var.name_memsize);
}
+
+nse::cfgVariable&
+nse::cfgVariable::operator+=(const cfgVariable& var)
+{
+	// (int) += (int)
+	// (double) += (double)
+	// (string) += (string)
+	if ((type == IS_INT) && (var.type == IS_INT)) {
+		eint += var.eint;
+	}
+	else
+		if ((type == IS_DOUBLE) && (var.type == IS_DOUBLE))
+		{
+			edouble += (double)var.edouble;
+		}
+		else
+			if ((type == IS_STRING) && (var.type == IS_STRING))
+			{
+				const size_t length = strlen(estring) + strlen(var.estring) + 1;
+				char *buf = new char[length];
+
+				strcpy(buf, estring);
+				strcat(buf, var.estring);
+
+				delete[] estring;
+				estring = buf;
+			}
+			else
+			{
+				clear();	// type:= IS_UNDEF
+			}
+
+	return (*this);
+}
+
+nse::cfgVariable&
+nse::cfgVariable::operator-=(const cfgVariable& var)
+{
+	// (int) -= (int)
+	// (double) -= (double)
+	if ((type == IS_INT) && (var.type == IS_INT)) {
+		eint -= var.eint;
+	}
+	else
+		if ((type == IS_DOUBLE) && (var.type == IS_DOUBLE)) {
+			edouble -= var.edouble;
+		}
+		else
+		{
+			clear();	// type:= IS_UNDEF
+		}
+
+	return (*this);
+}
+
+nse::cfgVariable&
+nse::cfgVariable::operator*=(const cfgVariable& var)
+{
+	// (int) *= (int)
+	// (double) *= (double)
+	if ((type == IS_INT) && (var.type == IS_INT)) {
+		eint *= var.eint;
+	}
+	else
+		if ((type == IS_DOUBLE) && (var.type == IS_DOUBLE)) {
+			edouble *= var.edouble;
+		}
+		else
+		{
+			clear();	// type:= IS_UNDEF
+		}
+
+	return (*this);
+}
+
+nse::cfgVariable&
+nse::cfgVariable::operator/=(const cfgVariable& var)
+{
+	// (int) /= (int)
+	// (double) /= (double)
+	if ((type == IS_INT) && (var.type == IS_INT)) {
+		eint /= var.eint;
+	}
+	else
+		if ((type == IS_DOUBLE) && (var.type == IS_DOUBLE)) {
+			edouble /= var.edouble;
+		}
+		else
+		{
+			clear();	// type:= IS_UNDEF
+		}
+
+	return (*this);
+}
+
+nse::cfgVariable&
+nse::cfgVariable::operator%=(const cfgVariable& var)
+{
+	// int %= (int)
+	if ((type == IS_INT) && (var.type == IS_INT)) {
+		eint %= var.eint;
+	}
+	else
+	{
+		clear();	// type:= IS_UNDEF
+	}
+
+	return (*this);
+}
+
+nse::cfgVariable&
+nse::cfgVariable::operator^=(const cfgVariable& var)
+{
+	// (int) ^= (int)
+	// (double) ^= (double)
+	if ((type == IS_INT) && (var.type == IS_INT)) {
+		eint = ipow(eint, var.eint);
+	}
+	else
+		if ((type == IS_DOUBLE) && (var.type == IS_DOUBLE)) {
+			edouble = pow(edouble, var.edouble);
+		}
+		else
+		{
+			clear();	// type:= IS_UNDEF
+		}
+
+	return (*this);
+}
+
+// check if we should really use const on return (all operators) ->
+//
+const nse::cfgVariable
+nse::cfgVariable::operator+(const cfgVariable& var) const
+{
+	return cfgVariable(*this) += var;
+}
+
+const nse::cfgVariable
+nse::cfgVariable::operator-(const cfgVariable& var) const
+{
+	return cfgVariable(*this) -= var;
+}
+
+const nse::cfgVariable
+nse::cfgVariable::operator*(const cfgVariable& var) const
+{
+	return cfgVariable(*this) *= var;
+}
+
+const nse::cfgVariable
+nse::cfgVariable::operator/(const cfgVariable& var) const
+{
+	return cfgVariable(*this) /= var;
+}
+
+const nse::cfgVariable
+nse::cfgVariable::operator%(const cfgVariable& var) const
+{
+	return cfgVariable(*this) %= var;
+}
+
+const nse::cfgVariable
+nse::cfgVariable::operator^(const cfgVariable& var) const
+{
+	return cfgVariable(*this) ^= var;
+}
+
+const nse::cfgVariable
+nse::cfgVariable::operator-() const
+{
+	// -(int)
+	// -(double)
+	cfgVariable var;	// default: type:= IS_UNDEF
+
+	if (type == IS_INT) {
+		var = (*this);
+		var.eint = -var.eint;
+	}
+	else
+		if (type == IS_DOUBLE) {
+			var = (*this);
+			var.edouble = -var.edouble;
+		}
+	return var;
+}
+
+const nse::cfgVariable
+nse::cfgVariable::operator+() const
+{
+	// +(int)
+	// +(double)
+	cfgVariable var;	// default: type:= IS_UNDEF
+
+	if (type == IS_INT) {
+		var = (*this);
+	}
+	else
+		if (type == IS_DOUBLE) {
+			var = (*this);
+		}
+	return var;
+}
+// <-
+//
+
// (re)initialize from a textual value, deducing the type in order:
// integer -> double -> quoted string -> boolean;
// input matching none of these leaves the variable IS_UNDEF
void nse::cfgVariable::set(const char* ex_name,
	const char* ex_value)
{
	clear();		// default: type = IS_UNDEF
	change_name(ex_name);	// ex_name == NULL keeps the current name

	// is_integer/is_double/is_valid_string/is_boolean are str-com.h parsers
	if (is_integer(ex_value, &eint)) type = IS_INT;
	else
		if (is_double(ex_value, &edouble)) type = IS_DOUBLE;
		else
			if (is_valid_string(ex_value))
			{
				type = IS_STRING;
				estring = new char[strlen(ex_value) + 1];
				// copy while removing '"' — presumably strips the quoting; see strcpyrm()
				strcpyrm(estring, ex_value, '"');
				return;
			}
			else
				if (is_boolean(ex_value, &ebool)) type = IS_BOOLEAN;
}
+
void nse::cfgVariable::change_name(const char* ex_name)
{
	if (ex_name == NULL) return;	// keeping old name

	// grow the name buffer only when the new name does not fit;
	// name_memsize tracks the allocated capacity, not the string length
	const size_t ex_length = strlen(ex_name) + 1;
	if (ex_length > name_memsize) {
		delete[] name;

		name_memsize = ex_length;
		name = new char[name_memsize];
	}

	strcpy(name, ex_name);
}
+
+
// release the only heap-owned payload (the string) and mark the variable
// undefined; the variable name is intentionally preserved
void nse::cfgVariable::clear()
{
	if (type == IS_STRING) {
		delete[] estring;
		estring = NULL;
	}

	type = IS_UNDEF;
}
+
// get_value(): strict typed accessors; return 1 on success, 0 on type mismatch.
// No implicit conversions are performed — e.g. an IS_INT value is NOT returned
// through the float/double overloads (NOTE(review): possibly intentional; confirm)
int nse::cfgVariable::get_value(int* value) const
{
	if (type != IS_INT) return 0;

	(*value) = eint;
	return 1;
}

int nse::cfgVariable::get_value(float* value) const
{
	if (type != IS_DOUBLE) return 0;

	(*value) = (float)edouble;
	return 1;
}

int nse::cfgVariable::get_value(double* value) const
{
	if (type != IS_DOUBLE) return 0;

	(*value) = edouble;
	return 1;
}

int nse::cfgVariable::get_value(char** value) const
{
	if (type != IS_STRING) return 0;

	// the caller owns the returned copy and must delete[] it
	(*value) = new char[strlen(estring) + 1];
	strcpy(*value, estring);
	return 1;
}

int nse::cfgVariable::get_value(std::string& value) const
{
	if (type != IS_STRING) return 0;

	value = std::string(estring);
	return 1;
}

int nse::cfgVariable::get_value(bool* value) const
{
	if (type != IS_BOOLEAN) return 0;

	(*value) = ebool;
	return 1;
}
+
+
nse::cfgVariable::VAR_TYPE
nse::cfgVariable::get_type() const {
	return type;
}

// exact (case-sensitive) match against the variable name; NULL never matches
bool nse::cfgVariable::is_varname(const char* ex_name) const
{
	if (ex_name == NULL) return false;
	return (!strcmp(ex_name, name));
}

// a valid name is any non-empty string
bool nse::cfgVariable::is_valid_name() const
{
	return (strlen(name) > 0);
}

// a valid type is anything that was successfully parsed (not IS_UNDEF)
bool nse::cfgVariable::is_valid_type() const
{
	return (type != IS_UNDEF);
}

// true if both variables carry the same (case-sensitive) name
bool nse::cfgVariable::is_eq_name(const cfgVariable& var) const
{
	return (!strcmp(name, var.name));
}
+
+void nse::cfgVariable::print() const
+{
+	if (type == IS_INT)
+		printf(" > INT '%s' = %i\n", name, eint);
+	else
+		if (type == IS_DOUBLE)
+			printf(" > DOUBLE '%s' = %f\n", name, edouble);
+		else
+			if (type == IS_STRING)
+				printf(" > STRING '%s' = %s\n", name, estring);
+			else
+				if (type == IS_BOOLEAN)
+					printf(" > BOOLEAN '%s' = %s\n", name, ebool ? "true" : "false");
+				else
+					printf(" > UNDEFINED '%s'\n", name);
+}
diff --git a/cfg-var.h b/cfg-var.h
new file mode 100644
index 0000000000000000000000000000000000000000..81f49b7728c3b3a2c56217caf6e40ac7aca09fea
--- /dev/null
+++ b/cfg-var.h
@@ -0,0 +1,79 @@
+#pragma once
+
+// [cfg-var.h]: configurable variable class 'cfgVariable'
+//
+// -------------------------------------------------------------------------------------------- //
+
+#define _CRT_SECURE_NO_WARNINGS
+#include <string>
+
namespace nse
{
	// dynamically-typed configuration value: a named tagged union of
	// int / double / string / bool, parsed from text (see set());
	// arithmetic is defined per matching type pair, any mismatch
	// degrades the result to IS_UNDEF
	class cfgVariable {
	public:

		enum VAR_TYPE { IS_UNDEF, IS_INT, IS_DOUBLE, IS_STRING, IS_BOOLEAN };

		cfgVariable();
		cfgVariable(const char* value);
		cfgVariable(const char* name,
			const char* value);
		~cfgVariable();

		// deep copy; assignment uses copy-and-swap (argument taken by value)
		cfgVariable(const cfgVariable& var);
		const cfgVariable& operator=(cfgVariable var);
		void swap(cfgVariable& a);

		// compound arithmetic: valid only for matching type pairs
		// ('+' also concatenates strings, '%'/'^' are int/power ops)
		cfgVariable& operator+=(const cfgVariable& var);
		cfgVariable& operator-=(const cfgVariable& var);
		cfgVariable& operator*=(const cfgVariable& var);
		cfgVariable& operator/=(const cfgVariable& var);
		cfgVariable& operator%=(const cfgVariable& var);
		cfgVariable& operator^=(const cfgVariable& var);

		const cfgVariable operator+(const cfgVariable& var) const;
		const cfgVariable operator-(const cfgVariable& var) const;
		const cfgVariable operator*(const cfgVariable& var) const;
		const cfgVariable operator/(const cfgVariable& var) const;
		const cfgVariable operator%(const cfgVariable& var) const;
		const cfgVariable operator^(const cfgVariable& var) const;

		const cfgVariable operator-() const;
		const cfgVariable operator+() const;

		// (re)parse from text: int -> double -> quoted string -> boolean
		void set(const char* name, const char* value);
		void change_name(const char* name);
		// name == NULL - not changing variable name

		// releases the string payload and resets type to IS_UNDEF (name kept)
		void clear();

		// strict typed accessors: 1 on success, 0 on type mismatch
		int get_value(int* value) const;
		int get_value(float* value) const;
		int get_value(double* value) const;
		int get_value(char** value) const;
		int get_value(std::string& value) const;
		int get_value(bool* value) const;

		VAR_TYPE get_type() const;

		bool is_varname(const char* ex_name) const;
		bool is_valid_name() const;	// name not empty string
		bool is_valid_type() const;	// type != IS_UNDEF
		bool is_eq_name(const cfgVariable& var) const;

		void print() const;

	private:

		char* name;		// default name = empty string = "\0";
						// always keeping name as string != NULL
		VAR_TYPE type;

		// payload members; only the one matching 'type' is meaningful
		int eint;
		double edouble;
		char *estring;
		bool ebool;

		size_t name_memsize;	// memory handler for name string
	};
}
diff --git a/config-parser.cpp b/config-parser.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5200eab8869c44757e9947ef108cb05e66be641d
--- /dev/null
+++ b/config-parser.cpp
@@ -0,0 +1,985 @@
+#define _CRT_SECURE_NO_DEPRECATE
+#include "config-parser.h"
+#include "str-com.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#include <stack>
+
+//
+// Implementation: helper classes
+//
+// ConfigParser::parserState
// parser cursor: current token index, brace-nesting level and the accumulated
// '.'-separated name-space prefix (built from enclosing '{ }' sections)
nse::ConfigParser::parserState::parserState()
{
	idx = 0;
	level = 0;

	nalloc = c_alloc_init;
	name_space = new char[c_alloc_init];
	*name_space = '\0';
}
// deep copy of the name-space string
// NOTE(review): copy ctor + dtor but no copy-assignment operator (rule of
// three) — presumably parserState is never assigned; confirm
nse::ConfigParser::parserState::parserState(
	const parserState& state)
{
	idx = state.idx;
	level = state.level;
	nalloc = state.nalloc;

	name_space = new char[nalloc];
	strcpy(name_space, state.name_space);
}
nse::ConfigParser::parserState::~parserState()
{
	delete[] name_space;
}

// drop the last '.'-separated component of the name space
// (presumably on leaving a '{ }' section; truncate_name() is in str-com.h)
void nse::ConfigParser::parserState::truncate_name_space()
{
	truncate_name(name_space, '.');
}
// append a '.'-separated component to the name space
// (presumably on entering a '{ }' section; append_name() may grow the buffer)
void nse::ConfigParser::parserState::append_name_space(
	const char* name)
{
	append_name(&name_space, &nalloc, name, '.');
}
+
+//
+// ConfigParser::rpnList
// expression storage: token indices in reverse-polish-notation order
nse::ConfigParser::rpnList::rpnList()
{
	nexpr = 0;

	nalloc = c_alloc_init;
	expr = new int[nalloc];
}
nse::ConfigParser::rpnList::~rpnList()
{
	delete[] expr;
}

// reset to an empty expression, keeping the allocated storage
void nse::ConfigParser::rpnList::init()
{
	nexpr = 0;
}
+
+void nse::ConfigParser::rpnList::add(const int idx)
+{
+	if (nexpr >= nalloc) {
+		int *buf = new int[nalloc + c_alloc_init];
+		memcpy(buf, expr, nalloc * sizeof(int));
+
+		delete[] expr;
+		expr = buf;
+	}
+
+	expr[nexpr] = idx;
+	nexpr++;
+}
+
// true if no lexemes have been emitted to the RPN expression
bool nse::ConfigParser::rpnList::empty()
{
	return nexpr == 0;
}
+
// converts one ';'-terminated expression from the token stream into RPN
// (stored as token indices) using the shunting-yard algorithm;
// advances state.idx past the terminating ';'. Returns false on malformed
// input (unbalanced brackets, empty '()', unexpected lexemes, empty result).
bool nse::ConfigParser::rpnList::convert(
	parserState& state,
	const LEXEME_TYPE *lexeme_type, const FileParser& parser)
{
	const int nlexeme = parser.get_ntokens();
	std::stack<int> hstack;		// operator/bracket stack (token indices)

	init();

	LEXEME_TYPE p_lex = IS_INVALID, lex;	// p_lex:= previous lexeme
	bool status = true;
	while ((state.idx < nlexeme) && (p_lex != IS_SEMICOLON))
	{

		lex = lexeme_type[state.idx];

		if ((lex == IS_VALUE) || (lex == IS_NAME)) {
			// adding value | variable reference
			//	- postpone existence check to evaluation step
			add(state.idx);
		}
		else
			if (lex == IS_BRACKET_OPEN) {
				hstack.push(state.idx);	// pushing '(' to stack
			}
			else
				if (lex == IS_BRACKET_CLOSE) {
					// pop till '(' found & remove '(' from stack

					int idx;
					bool flag = false;	// flag:= matching '(' was found
					while (!hstack.empty()) {
						idx = hstack.top();
						hstack.pop();
						if (lexeme_type[idx] == IS_BRACKET_OPEN) {
							flag = true;
							break;
						}
						else
						{
							add(idx);
						}
					}

					if (!flag) {
						printf(" CONFIG:> missing bracket '(' (line, %i)\n",
							parser.get_line_num(state.idx));
					}
					else
						if (p_lex == IS_BRACKET_OPEN) {
							printf(" CONFIG:> null sub-expression found '()' (line, %i)\n",
								parser.get_line_num(state.idx));
							flag = false;
						}

					status = status && flag;
				}
				else
					if (is_op(lex))	// handling operators
					{
						// pop operators that bind at least as tightly
						// (per priority and associativity), then push this one
						int idx;
						while (!hstack.empty()) {
							idx = hstack.top();
							if (!is_op(lexeme_type[idx])) break;

							if (
								((op_associativity(lex) == IS_OP_LEFT) &&
								(!op_lt(lexeme_type[idx], lex))) ||

								((op_associativity(lex) == IS_OP_RIGHT) &&
								(op_lt(lex, lexeme_type[idx])))
								)
							{
								hstack.pop();
								add(idx);
							}
							else
								break;
						}
						hstack.push(state.idx);
					}
					else
						if (lex == IS_SEMICOLON) {
							// removing elements until stack is empty or getting '('

							int idx;
							bool flag = true;	// flag:= no stray '(' left on the stack
							while (!hstack.empty()) {
								idx = hstack.top();
								hstack.pop();
								if (lexeme_type[idx] == IS_BRACKET_OPEN) {
									flag = false;
									break;
								}
								else
								{
									add(idx);
								}
							}

							if (!flag) {
								printf(" CONFIG:> missing bracket ')' (line, %i)\n",
									parser.get_line_num(state.idx));
							}

							status = status && flag;
						}
						else
						{
							printf(" CONFIG:> unexpected lexeme in expression: '%s' (line, %i)\n",
								parser.get_token(state.idx), parser.get_line_num(state.idx));
							status = false;
						}

		p_lex = lex;
		state.idx++;
	}

	// an empty expression (e.g. a bare ';') is an error as well
	status = status && (!empty());
	return status;
}
+
+
+//
+// Implementation: ConfigureParser
+//
// starts with an empty variable table of default capacity
nse::ConfigParser::ConfigParser()
{
	nvars = 0;
	nalloc_vars = c_alloc_init;

	var = new cfgVariable[nalloc_vars];
}

nse::ConfigParser::~ConfigParser()
{
	nvars = 0;
	nalloc_vars = 0;

	delete[] var;
}
+
+bool nse::ConfigParser::is_op(const LEXEME_TYPE op)
+{
+	return (
+		(op == IS_OP_ADD) || (op == IS_OP_SUB) ||
+		(op == IS_OP_MUL) || (op == IS_OP_DIV) ||
+		(op == IS_OP_MOD) ||
+		(op == IS_OP_PLUS) || (op == IS_OP_MINUS) ||
+		(op == IS_OP_EXP));
+}
+
+bool nse::ConfigParser::is_op_binary(const LEXEME_TYPE op)
+{
+	return (
+		(op == IS_OP_ADD) || (op == IS_OP_SUB) ||
+		(op == IS_OP_MUL) || (op == IS_OP_DIV) ||
+		(op == IS_OP_MOD) ||
+		(op == IS_OP_EXP));
+}
+
+bool nse::ConfigParser::is_op_unary(const LEXEME_TYPE op)
+{
+	return ((op == IS_OP_PLUS) || (op == IS_OP_MINUS));
+}
+
+int nse::ConfigParser::op_priority(
+	const LEXEME_TYPE op)
+{
+	if ((op == IS_OP_ADD) || (op == IS_OP_SUB)) return 1;
+	if ((op == IS_OP_MUL) || (op == IS_OP_DIV) ||
+		(op == IS_OP_MOD)) return 2;
+	if ((op == IS_OP_PLUS) || (op == IS_OP_MINUS)) return 3;
+	if ((op == IS_OP_EXP)) return 4;
+
+	return 0;
+}
+
+nse::ConfigParser::OP_ASSOCIATIVITY
+nse::ConfigParser::op_associativity(
+	const LEXEME_TYPE op)
+{
+	if (op == IS_OP_EXP) return IS_OP_RIGHT;
+	else
+		return IS_OP_LEFT;
+}
+
+bool nse::ConfigParser::op_lt(
+	const LEXEME_TYPE opA, const LEXEME_TYPE opB)
+{
+	return (op_priority(opA) < op_priority(opB));
+}
+
+bool nse::ConfigParser::is_valid_name(const char* token)
+{
+	int token_length = strlen(token);
+
+	if (token_length == 0) return false;
+	if ((!isalpha(token[0])) && (token[0] != '_')) return false;
+
+	for (int i = 1; i < token_length - 1; i++) {
+		if ((!isalnum(token[i])) &&
+			(token[i] != '_') && (token[i] != '.')) return false;
+	}
+	if ((!isalnum(token[token_length - 1])) &&
+		(token[token_length - 1] != '_')) return false;
+
+	return true;
+}
+
+
// adds a named variable to the table; an existing variable with the same
// name is overwritten in place. Returns false only for unnamed variables.
bool nse::ConfigParser::add(const cfgVariable& ex)
{
	if (!ex.is_valid_name()) return false;

	// just overwriting variable if already exists
	for (int k = 0; k < nvars; k++) {
		if (var[k].is_eq_name(ex)) {
			var[k] = ex;
			return true;
		}
	}

	// grow the table by a fixed step when full
	// (element-wise copy: cfgVariable owns heap data, memcpy would be wrong)
	if (nvars >= nalloc_vars)
	{
		cfgVariable *hvar = new cfgVariable[nalloc_vars + c_alloc_init];
		for (int i = 0; i < nalloc_vars; i++) {
			hvar[i] = var[i];
		}

		delete[] var;
		var = hvar;

		nalloc_vars += c_alloc_init;
	}

	var[nvars] = ex;
	nvars++;
	return true;
}
+
+// Classifies every token of the parsed file into a lexeme type.
+// Returns false if any token is unrecognized (all of them are reported).
+bool nse::ConfigParser::run_lexical_analysis(
+	LEXEME_TYPE *lexeme_type, const FileParser& parser)
+{
+	const int nlexeme = parser.get_ntokens();
+	if (nlexeme == 0) return true;
+
+	bool status = true;
+	for (int i = 0; i < nlexeme; i++) {
+
+		const char* lexeme = parser.get_token(i);
+
+		if (!strcmp(lexeme, "=")) lexeme_type[i] = IS_ASSIGNMENT;
+		else if (!strcmp(lexeme, "{")) lexeme_type[i] = IS_BRACE_OPEN;
+		else if (!strcmp(lexeme, "}")) lexeme_type[i] = IS_BRACE_CLOSE;
+		else if (!strcmp(lexeme, "(")) lexeme_type[i] = IS_BRACKET_OPEN;
+		else if (!strcmp(lexeme, ")")) lexeme_type[i] = IS_BRACKET_CLOSE;
+		else if (!strcmp(lexeme, ";")) lexeme_type[i] = IS_SEMICOLON;
+		else if ((!strcmp(lexeme, "+")) || (!strcmp(lexeme, "-")))
+		{
+			// "+"/"-" is binary only when it follows an operand:
+			//   a name, a value or a closing bracket; otherwise unary
+			const bool is_binary = (i > 0) &&
+				((lexeme_type[i - 1] == IS_NAME) ||
+				(lexeme_type[i - 1] == IS_VALUE) ||
+				(lexeme_type[i - 1] == IS_BRACKET_CLOSE));
+
+			if (!strcmp(lexeme, "+"))
+				lexeme_type[i] = (is_binary) ? IS_OP_ADD : IS_OP_PLUS;
+			else
+				lexeme_type[i] = (is_binary) ? IS_OP_SUB : IS_OP_MINUS;
+		}
+		else if (!strcmp(lexeme, "*")) lexeme_type[i] = IS_OP_MUL;
+		else if (!strcmp(lexeme, "/")) lexeme_type[i] = IS_OP_DIV;
+		else if (!strcmp(lexeme, "%")) lexeme_type[i] = IS_OP_MOD;
+		else if (!strcmp(lexeme, "^")) lexeme_type[i] = IS_OP_EXP;
+		else if (is_integer(lexeme) || is_double(lexeme) ||
+			is_valid_string(lexeme) || is_boolean(lexeme))
+		{
+			lexeme_type[i] = IS_VALUE;
+		}
+		else if (is_valid_name(lexeme)) lexeme_type[i] = IS_NAME;
+		else
+		{
+			lexeme_type[i] = IS_INVALID;
+			printf(" CONFIG:> invalid lexeme: '%s' (line, %i)\n",
+				lexeme, parser.get_line_num(i));
+			status = false;
+		}
+	}
+
+	return status;
+}
+
+// Evaluates an expression given in reverse polish notation (RPN).
+// Each RPN element refers back to a lexeme index; operands are pushed onto a
+// stack, operations pop their arguments and push the result.
+// Returns an empty (invalid-type) cfgVariable on any evaluation error.
+// note: 'state' is passed by value so namespace edits stay local to the call.
+const nse::cfgVariable
+nse::ConfigParser::evaluate_rpn(
+	const rpnList& rpn,
+	parserState state,
+	const LEXEME_TYPE *lexeme_type, const FileParser& parser) const
+{
+	LEXEME_TYPE lex;
+	std::stack<cfgVariable> dyn_expr;	// evaluation stack
+	int eidx;
+
+	for (int i = 0; i < rpn.nexpr; i++)
+	{
+		eidx = rpn.expr[i];	// lexeme index of the current RPN element
+		lex = lexeme_type[eidx];
+
+		if (is_op_binary(lex)) {
+			if (dyn_expr.size() >= 2) {
+				// 'a' is the right-hand operand (stack top), 'b' the left-hand one
+				cfgVariable a = dyn_expr.top(); dyn_expr.pop();
+				cfgVariable b = dyn_expr.top(); dyn_expr.pop();
+
+				if (lex == IS_OP_ADD) b += a;
+				else
+					if (lex == IS_OP_SUB) b -= a;
+					else
+						if (lex == IS_OP_MUL) b *= a;
+						else
+							if (lex == IS_OP_DIV) b /= a;
+							else
+								if (lex == IS_OP_MOD) b %= a;
+								else
+									if (lex == IS_OP_EXP) b ^= a;	// '^' lexeme: exponentiation operator
+									else
+									{
+										printf(" CONFIG:> unknown binary operation: '%s' (line, %i)\n",
+											parser.get_token(eidx), parser.get_line_num(eidx));
+										return cfgVariable();
+									}
+
+				// type mismatch is signalled by an invalid result type
+				if (!b.is_valid_type()) {
+					printf(" CONFIG:> incorrect types of operands: '%s' (line, %i)\n",
+						parser.get_token(eidx), parser.get_line_num(eidx));
+					return b;
+				}
+				dyn_expr.push(b);
+			}
+			else
+			{
+				printf(" CONFIG:> insufficient number of arguments: '%s' (line, %i)\n",
+					parser.get_token(eidx),
+					parser.get_line_num(eidx));
+				return cfgVariable();
+			}
+		}
+		else
+			if (is_op_unary(lex)) {
+				if (dyn_expr.size() >= 1) {
+					cfgVariable op = dyn_expr.top(); dyn_expr.pop();
+					cfgVariable res;
+
+					if (lex == IS_OP_PLUS) res = +op;
+					else
+						if (lex == IS_OP_MINUS) res = -op;
+						else
+						{
+							printf(" CONFIG:> unknown unary operation: '%s' (line, %i)\n",
+								parser.get_token(eidx), parser.get_line_num(eidx));
+							return cfgVariable();
+						}
+
+					if (!res.is_valid_type()) {
+						printf(" CONFIG:> incorrect type of operand: '%s' (line, %i)\n",
+							parser.get_token(eidx), parser.get_line_num(eidx));
+						return res;
+					}
+					dyn_expr.push(res);
+				}
+				else
+				{
+					printf(" CONFIG:> insufficient number of arguments: '%s' (line, %i)\n",
+						parser.get_token(eidx), parser.get_line_num(eidx));
+					return cfgVariable();
+				}
+			}
+			else
+				if (lex == IS_NAME) {
+
+					cfgVariable arg;
+					int pend = strlen(state.name_space);	// remember length to restore the namespace below
+
+					// resolve the name inside the current namespace first,
+					//   then fall back to the name exactly as written
+					state.append_name_space(parser.get_token(eidx));
+					if (is_varname(state.name_space))
+						arg = get_variable(state.name_space);
+					else
+					{
+						if (is_varname(parser.get_token(eidx))) {
+							arg = get_variable(parser.get_token(eidx));
+						}
+						else
+						{
+							printf(" CONFIG:> reference to undefined variable: '%s' (line, %i)\n",
+								parser.get_token(eidx), parser.get_line_num(eidx));
+							return cfgVariable();
+						}
+					}
+					state.name_space[pend] = '\0';	// removing added name resolution
+
+					dyn_expr.push(arg);
+				}
+				else
+				{
+					// value lexeme: construct a variable directly from the token
+					dyn_expr.push(
+						cfgVariable(parser.get_token(eidx)));
+				}
+	}
+
+	// a well-formed expression reduces to exactly one stack element
+	if (dyn_expr.size() != 1) return cfgVariable();
+
+	return cfgVariable(dyn_expr.top());	// delay res type checking [in add]
+}
+
+// Runs syntax analysis over the classified lexemes:
+//   'name { ... }' opens a nested namespace, 'name = expression ;' defines
+//   a variable. Expressions are converted to RPN and evaluated immediately,
+//   so later definitions may reference earlier ones.
+// All errors are reported; returns false if any occurred.
+bool nse::ConfigParser::run_syntax_analysis(
+	const LEXEME_TYPE *lexeme_type, const FileParser& parser)
+{
+	const int nlexeme = parser.get_ntokens();
+	if (nlexeme == 0) return true;
+
+	parserState state;
+	rpnList rpn;
+
+	bool status = true;
+	while (state.idx < nlexeme) {
+
+		// - closing namespace [}]
+		if ((lexeme_type[state.idx] == IS_BRACE_CLOSE) && (state.level > 0)) {
+			state.truncate_name_space();
+			state.level--; state.idx++;
+			continue;
+		}
+
+		if (lexeme_type[state.idx] == IS_NAME) {
+			if (state.idx + 1 < nlexeme) {
+				// - open namespace [name {]
+				if (lexeme_type[state.idx + 1] == IS_BRACE_OPEN) {
+
+					// no dots allowed in namespace name at least for now
+					if (strchr(parser.get_token(state.idx), '.') == NULL) {
+						state.append_name_space(parser.get_token(state.idx));
+					}
+					else
+					{
+						status = false;
+						printf(" CONFIG:> '.' not allowed in namespaces: '%s' (line, %i)\n",
+							parser.get_token(state.idx), parser.get_line_num(state.idx));
+					}
+
+					state.level++; state.idx += 2;	// skip [name {]
+					continue;
+				}
+
+				// - assignment [name = value|string ;]
+				if (lexeme_type[state.idx + 1] == IS_ASSIGNMENT) {
+
+					int varidx = state.idx;	// lexeme index of the variable name
+					state.idx += 2;		// setting state at expression //
+
+					int pend = strlen(state.name_space);	// for namespace restoration below
+
+					// converting to postfix notation
+					if (!rpn.convert(state, lexeme_type, parser)) {
+
+						state.append_name_space(parser.get_token(varidx));
+
+						status = false;
+						printf(" CONFIG:> failed to process expression for variable: '%s' (line, %i)\n",
+							state.name_space, parser.get_line_num(varidx));
+
+						state.name_space[pend] = '\0';	// removing added name resolution
+						continue;
+					}
+
+					// evaluating postfix expression (need namespace - not full variable name)
+					cfgVariable value = evaluate_rpn(rpn,
+						state, lexeme_type, parser);
+					if (!value.is_valid_type()) {
+
+						state.append_name_space(parser.get_token(varidx));
+
+						status = false;
+						printf(" CONFIG:> failed to evaluate expression for variable: '%s' (line, %i)\n",
+							state.name_space, parser.get_line_num(varidx));
+
+						state.name_space[pend] = '\0';	// removing added name resolution
+						continue;
+					}
+
+					// register the variable under its fully qualified name
+					state.append_name_space(parser.get_token(varidx));
+					value.change_name(state.name_space);
+
+					if (!add(value)) {
+						status = false;
+						printf(" CONFIG:> failed to add variable: '%s' (line, %i)\n",
+							state.name_space, parser.get_line_num(varidx));
+					}
+
+					state.name_space[pend] = '\0';	// removing added name resolution
+					continue;
+				}
+			}
+
+			// error recovery: skip the name and the unexpected lexeme after it
+			status = false;
+			printf(" CONFIG:> expecting '=' or '{' for name: '%s' (line, %i)\n",
+				parser.get_token(state.idx), parser.get_line_num(state.idx));
+			state.idx += 2;
+			continue;
+		}
+
+		status = false;
+		printf(" CONFIG:> unexpected lexeme: '%s' (line, %i)\n",
+			parser.get_token(state.idx), parser.get_line_num(state.idx));
+		state.idx++;
+	}
+
+	if (state.level > 0) {
+		status = false;
+		printf(" CONFIG:> unclosed namespaces, missing '}'\n");
+	}
+
+	return status;
+}
+
+// Parses a configuration file: tokenize, classify lexemes, then run syntax
+// analysis which evaluates and registers the variables.
+// Both analysis passes always execute so that all errors are reported at once.
+bool nse::ConfigParser::run(const char* filename)
+{
+	// removing elements in config -- but keeping memory
+	nvars = 0;
+
+	FileParser parser;
+	if (!parser.run(filename, '#', "={};()+-*/%^", true)) {
+		printf(" CONFIG:> parsing failed for file: '%s'\n", filename);
+		return false;
+	}
+
+	const int nlexeme = parser.get_ntokens();
+	if (nlexeme == 0) return true;	// an empty config is valid
+
+	LEXEME_TYPE *lexeme_type = new LEXEME_TYPE[nlexeme];
+
+	bool status_lexical = run_lexical_analysis(lexeme_type, parser);
+	bool status_syntax = run_syntax_analysis(lexeme_type, parser);
+
+	delete[] lexeme_type;
+	return status_lexical && status_syntax;	// logical (not bitwise) AND of bools
+}
+
+// Parses the config file on the host rank only and broadcasts the success
+// flag to every rank of the supplied communicator.
+bool nse::ConfigParser::mpi_run(const char* filename, const MPI_Comm comm)
+{
+	const int host_rank = 0;
+
+	int rank, status = 0;
+	MPI_Comm_rank(comm, &rank);
+
+	if (rank == host_rank) {
+		if (run(filename)) status = 1;
+	}
+	// broadcast on the supplied communicator (the original call omitted 'comm',
+	//   which is wrong whenever comm != the default communicator)
+	mpi_broadcast(&status, 1, host_rank, comm);
+	return (status == 1);
+}
+
+// Convenience overload: parse using the global communicator.
+bool nse::ConfigParser::mpi_run(const char* filename)
+{
+	return mpi_run(filename, MPI_COMM_WORLD);
+}
+
+
+// True if a variable with this (fully qualified) name is registered.
+bool nse::ConfigParser::is_varname(const char* name) const
+{
+	int k = 0;
+	while (k < nvars) {	// linear scan over the registered variables
+		if (var[k].is_varname(name)) return true;
+		k++;
+	}
+	return false;
+}
+
+// Returns the variable at index 'idx'; an empty variable if out of range.
+const nse::cfgVariable
+nse::ConfigParser::get_variable(const int idx) const
+{
+	const bool in_range = (idx >= 0) && (idx < nvars);
+	return in_range ? var[idx] : cfgVariable();
+}
+// Returns the variable with the given name; an empty variable if not found.
+const nse::cfgVariable
+nse::ConfigParser::get_variable(const char* name) const
+{
+	int k = 0;
+	while (k < nvars) {
+		if (var[k].is_varname(name)) return var[k];
+		k++;
+	}
+	return cfgVariable();
+}
+
+// Prints every registered variable, pausing for input on stdin after each
+// one (interactive, pager-style inspection).
+void nse::ConfigParser::print() const
+{
+	for (int i = 0; i < nvars; i++) {
+		var[i].print();
+		getc(stdin);	// wait for a key press between variables
+	}
+}
+
+// Reads an (int) configuration value; any other declared type is reported
+// as a failure (no implicit conversions for int targets).
+bool nse::ConfigParser::get_value(const char* name, int* value) const
+{
+	cfgVariable var = get_variable(name);
+	cfgVariable::VAR_TYPE type = var.get_type();
+
+	if (type == cfgVariable::IS_INT) return (var.get_value(value) > 0);
+
+	if (type == cfgVariable::IS_DOUBLE)
+	{
+		printf(" CONFIG:> failed to set (int) variable: '%s' - declared as double\n",
+			name);
+		return false;
+	}
+	if (type == cfgVariable::IS_STRING) {
+		printf(" CONFIG:> failed to set (int) variable: '%s' - declared as string\n",
+			name);
+		return false;
+	}
+	if (type == cfgVariable::IS_BOOLEAN) {
+		printf(" CONFIG:> failed to set (int) variable: '%s' - declared as boolean\n",
+			name);
+		return false;
+	}
+
+	// undefined variable: message made consistent with the other overloads
+	printf(" CONFIG:> failed to set (int) variable: '%s' - undefined\n", name);
+	return false;
+}
+
+// Reads a (float) configuration value; int and double values are converted.
+bool nse::ConfigParser::get_value(const char* name, float* value) const
+{
+	cfgVariable var = get_variable(name);
+	cfgVariable::VAR_TYPE type = var.get_type();
+
+	if (type == cfgVariable::IS_INT)
+	{
+		int ival;
+		const int status = var.get_value(&ival);
+		(*value) = (float)ival;
+		return (status > 0);
+	}
+	if (type == cfgVariable::IS_DOUBLE)
+	{
+		double dval;
+		const int status = var.get_value(&dval);
+		(*value) = (float)dval;
+		return (status > 0);
+	}
+	if (type == cfgVariable::IS_STRING) {
+		printf(" CONFIG:> failed to set (float) variable: '%s' - declared as string\n",
+			name);
+		return false;
+	}
+	if (type == cfgVariable::IS_BOOLEAN) {
+		printf(" CONFIG:> failed to set (float) variable: '%s' - declared as boolean\n",
+			name);
+		return false;
+	}
+
+	printf(" CONFIG:> failed to set (float) variable: '%s' - undefined\n",
+		name);
+	return false;
+}
+
+// Reads a (double) configuration value; int values are promoted.
+bool nse::ConfigParser::get_value(const char* name, double* value) const
+{
+	cfgVariable var = get_variable(name);
+	cfgVariable::VAR_TYPE type = var.get_type();
+
+	if (type == cfgVariable::IS_INT)
+	{
+		int ival;
+		const int status = var.get_value(&ival);
+		(*value) = (double)ival;
+		return (status > 0);
+	}
+	if (type == cfgVariable::IS_DOUBLE) return (var.get_value(value) > 0);
+	if (type == cfgVariable::IS_STRING) {
+		printf(" CONFIG:> failed to set (double) variable: '%s' - declared as string\n",
+			name);
+		return false;
+	}
+	if (type == cfgVariable::IS_BOOLEAN) {
+		printf(" CONFIG:> failed to set (double) variable: '%s' - declared as boolean\n",
+			name);
+		return false;
+	}
+
+	printf(" CONFIG:> failed to set (double) variable: '%s' - undefined\n",
+		name);
+	return false;
+}
+
+// Reads a C-string configuration value; only string-typed variables match.
+bool nse::ConfigParser::get_value(const char* name, char** value) const
+{
+	cfgVariable var = get_variable(name);
+	cfgVariable::VAR_TYPE type = var.get_type();
+
+	if (type == cfgVariable::IS_STRING) return (var.get_value(value) > 0);
+
+	if (type == cfgVariable::IS_INT) {
+		printf(" CONFIG:> failed to set (string) variable: '%s' - declared as int\n",
+			name);
+		return false;
+	}
+	if (type == cfgVariable::IS_DOUBLE)
+	{
+		printf(" CONFIG:> failed to set (string) variable: '%s' - declared as double\n",
+			name);
+		return false;
+	}
+	if (type == cfgVariable::IS_BOOLEAN)
+	{
+		printf(" CONFIG:> failed to set (string) variable: '%s' - declared as boolean\n",
+			name);
+		return false;
+	}
+
+	printf(" CONFIG:> failed to set (string) variable: '%s' - undefined\n",
+		name);
+	return false;
+}
+
+// Reads a std::string configuration value; only string-typed variables match.
+bool nse::ConfigParser::get_value(const char* name, std::string& value) const
+{
+	cfgVariable var = get_variable(name);
+	cfgVariable::VAR_TYPE type = var.get_type();
+
+	if (type == cfgVariable::IS_STRING) return (var.get_value(value) > 0);
+
+	if (type == cfgVariable::IS_INT) {
+		printf(" CONFIG:> failed to set (string) variable: '%s' - declared as int\n",
+			name);
+		return false;
+	}
+	if (type == cfgVariable::IS_DOUBLE)
+	{
+		printf(" CONFIG:> failed to set (string) variable: '%s' - declared as double\n",
+			name);
+		return false;
+	}
+	if (type == cfgVariable::IS_BOOLEAN)
+	{
+		printf(" CONFIG:> failed to set (string) variable: '%s' - declared as boolean\n",
+			name);
+		return false;
+	}
+
+	printf(" CONFIG:> failed to set (string) variable: '%s' - undefined\n",
+		name);
+	return false;
+}
+
+// Reads a (bool) configuration value; no implicit conversions are allowed.
+bool nse::ConfigParser::get_value(const char* name, bool* value) const
+{
+	cfgVariable var = get_variable(name);
+	cfgVariable::VAR_TYPE type = var.get_type();
+
+	if (type == cfgVariable::IS_BOOLEAN) return (var.get_value(value) > 0);
+
+	if (type == cfgVariable::IS_INT)
+	{
+		printf(" CONFIG:> failed to set (bool) variable: '%s' - declared as int\n",
+			name);
+		return false;
+	}
+	if (type == cfgVariable::IS_DOUBLE)
+	{
+		printf(" CONFIG:> failed to set (bool) variable: '%s' - declared as double\n",
+			name);
+		return false;
+	}
+	if (type == cfgVariable::IS_STRING) {
+		printf(" CONFIG:> failed to set (bool) variable: '%s' - declared as string\n",
+			name);
+		return false;
+	}
+
+	// undefined variable: message made consistent with the other overloads
+	printf(" CONFIG:> failed to set (bool) variable: '%s' - undefined\n", name);
+	return false;
+}
+
+// Checks variable existence on the host rank and broadcasts the answer
+// to every rank of the communicator.
+bool nse::ConfigParser::mpi_is_varname(const char* name, const MPI_Comm comm) const
+{
+	const int host_rank = 0;
+
+	int rank;
+	bool status = false;	// defined on all ranks before the broadcast
+	MPI_Comm_rank(comm, &rank);
+
+	if (rank == host_rank) {
+		status = is_varname(name);
+	}
+	mpi_broadcast(&status, 1, host_rank, comm);
+	return status;
+}
+
+
+// Reads a scalar value on the host rank and broadcasts both the success
+// flag and (on success) the value itself to all ranks of the communicator.
+template< typename T >
+bool nse::ConfigParser::mpi_get_value(const char* name,
+	T* value, const MPI_Comm comm) const
+{
+	const int host_rank = 0;
+
+	int rank, status = 0;
+	MPI_Comm_rank(comm, &rank);
+
+	if (rank == host_rank) {
+		if (get_value(name, value)) status = 1;
+	}
+	mpi_broadcast(&status, 1, host_rank, comm);
+	if (status) mpi_broadcast(value, 1, host_rank, comm);	// value only when the host read succeeded
+
+	return (status == 1);
+}
+
+// Reads a C-string value: the host rank queries the config, then broadcasts
+// the string length (reused as the status flag) followed by the characters.
+// NOTE(review): non-host ranks allocate (*value) with new[]; ownership of the
+//   host-side pointer depends on what get_value(char**) returns - confirm
+//   that callers may delete[] it on every rank (the std::string overload does).
+bool nse::ConfigParser::mpi_get_value(const char* name,
+	char** value, const MPI_Comm comm) const
+{
+	const int host_rank = 0;
+
+	int rank, status = 0;
+	MPI_Comm_rank(comm, &rank);
+
+	if (rank == host_rank) {
+		if (get_value(name, value)) status = strlen(*value) + 1;	// length incl. null terminator
+	}
+	mpi_broadcast(&status, 1, host_rank, comm);
+	if (status) {
+		if (rank != host_rank) (*value) = new char[status];
+		mpi_broadcast(*value, status, host_rank, comm);
+	}
+
+	return (status > 0);
+}
+
+// std::string convenience wrapper over the C-string broadcast overload.
+// NOTE(review): delete[] runs on every rank, including the host - this
+//   assumes get_value(char**) hands out an owned heap copy on the host;
+//   confirm against cfgVariable::get_value.
+bool nse::ConfigParser::mpi_get_value(const char* name,
+	std::string& value, const MPI_Comm comm) const
+{
+	char* c_value;
+	if (!mpi_get_value(name, &c_value, comm)) return false;
+
+	value = std::string(c_value);
+
+	delete[] c_value;
+	return true;
+}
+
+
+// explicit instantiations of mpi_get_value for the supported scalar types //
+template bool nse::ConfigParser::mpi_get_value(const char* name,
+	int* value, const MPI_Comm comm) const;
+template bool nse::ConfigParser::mpi_get_value(const char* name,
+	float* value, const MPI_Comm comm) const;
+template bool nse::ConfigParser::mpi_get_value(const char* name,
+	double* value, const MPI_Comm comm) const;
+template bool nse::ConfigParser::mpi_get_value(const char* name,
+	bool* value, const MPI_Comm comm) const;
diff --git a/config-parser.h b/config-parser.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccea8184d09126a1f8ccf5d54dfad9f07eebc254
--- /dev/null
+++ b/config-parser.h
@@ -0,0 +1,142 @@
+#pragma once
+
+// [config-parser.h]: configuration file parser
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "mpi-com.h"
+#include "str-com.h"
+#include "cfg-var.h"
+
+namespace nse
+{
+	// configuration file parser:
+	//   reads 'name = expression;' assignments grouped into '{}' namespaces,
+	//   evaluates arithmetic expressions and stores typed variables
+	class ConfigParser {
+	public:
+		ConfigParser();
+		~ConfigParser();
+
+		// parse a config file (single process / MPI collective variants)
+		bool run(const char* filename);
+		bool mpi_run(const char* filename, const MPI_Comm comm);
+		bool mpi_run(const char* filename);
+
+
+		// variable lookup by fully qualified name or index
+		bool is_varname(const char* name) const;
+		const cfgVariable get_variable(const int idx) const;
+		const cfgVariable get_variable(const char* name) const;
+
+		// typed accessors: report and fail on type mismatch or undefined name
+		bool get_value(const char* name, int* value) const;
+		bool get_value(const char* name, float* value) const;
+		bool get_value(const char* name, double* value) const;
+
+		bool get_value(const char* name, char** value) const;
+		bool get_value(const char* name, std::string& value) const;
+
+		bool get_value(const char* name, bool* value) const;
+
+
+		// collective accessors: host rank reads, result broadcast to all ranks
+		bool mpi_is_varname(const char* name, const MPI_Comm comm) const;
+
+		template< typename T >
+		bool mpi_get_value(const char* name, T* value, const MPI_Comm comm) const;
+
+		bool mpi_get_value(const char* name, char** value, const MPI_Comm comm) const;
+		bool mpi_get_value(const char* name, std::string& value, const MPI_Comm comm) const;
+
+
+		void print() const;
+
+	private:	// datatypes //
+
+		enum LEXEME_TYPE {	// lexeme types //
+			IS_INVALID,
+			IS_NAME,
+			IS_ASSIGNMENT,
+			IS_BRACE_OPEN, IS_BRACE_CLOSE,			// {}
+			IS_BRACKET_OPEN, IS_BRACKET_CLOSE,		// ()
+			IS_SEMICOLON,
+			IS_VALUE,
+			IS_OP_ADD, IS_OP_SUB,				// +,-		[priority=1, assoc.=left]
+			IS_OP_MUL, IS_OP_DIV, IS_OP_MOD,	// *,/,%	[priority=2, assoc.=left]
+			IS_OP_PLUS, IS_OP_MINUS,			// +,-		[priority=3, assoc.=left]
+			IS_OP_EXP							// ^		[priority=4, assoc.=right]
+		};
+
+		enum OP_ASSOCIATIVITY { IS_OP_LEFT, IS_OP_RIGHT };	// operation specifiers //
+
+		struct parserState {	// helper struct to control parsing state //
+
+			parserState();
+			parserState(const parserState& state);
+			~parserState();
+
+			void truncate_name_space();
+			void append_name_space(const char* name);
+
+			int idx, level;		// lexeme index and namespace level 
+			char *name_space;	// current namespace
+
+		private:	// allocation data
+			int nalloc;
+			static const int c_alloc_init = 64;
+		};
+
+		struct rpnList {		// helper struct for RPN expressions evaluation //
+
+			rpnList();
+			~rpnList();
+
+			bool convert(
+				parserState& state,		// advancing state after delimiter [';']
+				const LEXEME_TYPE *lexeme_type, const FileParser& parser);
+
+		private:	// interface //
+			void init();
+			void add(const int idx);
+			bool empty();
+
+		public:
+			int *expr;		// lexeme index corresponding to element
+			int nexpr;		// number of elements in expression
+
+		private:	// allocation data
+			int nalloc;
+			static const int c_alloc_init = 64;
+		};
+
+	private:	// static //
+
+		static bool is_valid_name(const char* lexeme);	// valid variable name check //
+
+														// op - priority interface //
+		static bool is_op(const LEXEME_TYPE op);
+		static bool is_op_binary(const LEXEME_TYPE op);
+		static bool is_op_unary(const LEXEME_TYPE op);
+
+		static int op_priority(const LEXEME_TYPE op);
+		static OP_ASSOCIATIVITY op_associativity(const LEXEME_TYPE op);
+
+		static bool op_lt(const LEXEME_TYPE opA, const LEXEME_TYPE opB);
+
+	private:	// processing //
+
+		bool add(const cfgVariable& rvalue);	// adding variable to list //
+
+		const cfgVariable evaluate_rpn(
+			const rpnList& rpn,
+			parserState state,	// using copy due to namespace operations
+			const LEXEME_TYPE *lexeme_type, const FileParser& parser) const;
+
+		bool run_lexical_analysis(LEXEME_TYPE *lexeme_type,
+			const FileParser& parser);
+		bool run_syntax_analysis(const LEXEME_TYPE *lexeme_type,
+			const FileParser& parser);
+
+	private:	// data //	
+
+		int nalloc_vars;						// allocated capacity of var[]
+		static const int c_alloc_init = 64;		// initial allocation step
+
+		int nvars;								// number of registered variables
+		cfgVariable *var;						// variable storage
+	};
+}
diff --git a/config.txt b/config.txt
new file mode 100644
index 0000000000000000000000000000000000000000..758ab6985301435e90e0cdd31e26a6f4db1b2e3b
--- /dev/null
+++ b/config.txt
@@ -0,0 +1,165 @@
+domain
+{
+#
+# physical domain setup in H units, H - height
+#   x - streamwise, y - spanwise, z - wall-normal directions
+#
+	x = 0.0; y = 0.0; z = 0.0;			# point of origin
+	length = 6.0; width = 4.0; height = 1.0;	# domain length(x), width(y) and height(z)
+}
+# -----------------------------------------------------------------------------
+
+grid
+{
+	cx = 192; cy = 128; cz = 128;			# number of cells in each direction	
+	ksi_z = 1.5;					# near-wall grid stretching parameter
+}
+# -----------------------------------------------------------------------------
+
+mpi_setup
+{
+#
+# MPI-process distribution
+#   in 'mpirun -np [N]' [N] overrides config specs if differs
+#
+	dimx = 1; dimy = 1; dimz = 1;
+}
+# -----------------------------------------------------------------------------
+
+time
+{
+	begin = 0.0; end = 50.0;		# start and end time of integration
+						#   in H/Umax units, Umax - wall speed difference
+
+	CFL = 0.1;				# Courant-Friedrichs-Lewy number
+}
+# -----------------------------------------------------------------------------
+
+fluid
+{
+	Umax = 1.0;				# wall-speed difference in dimensionless units (do NOT change!)
+	disturbance_amp = 0.025;		# initial disturbance relative to Umax
+
+	density = 1.0;				# fictitious density value (do NOT change!)
+	viscosity = 1.0 / 5200.0;		# fictitious viscosity value <=> inverse of Reynolds number
+						#   Re = (Umax * H) / nu
+
+	T0 = 1.0;				# bottom wall temperature
+	TH = 2.0;				# top wall temperature
+
+	Prandtl = 0.7;				# molecular Prandtl number
+						#   Pr = nu / kappa
+
+	Richardson = 0.01;			# final Richardson number
+	Richardson_init = 0.00;			# initial Richardson number
+						#   Ri = g * (TH - T0) / T0 * (H / Umax^2)
+
+	T_gravity_init = 200.0;			# start time for increasing Ri
+	T_gravity_period = 100.0;		# time step for increasing Ri
+}
+# -----------------------------------------------------------------------------
+
+# particle simulation setup
+#  used only if INCLUDE_PARTICLES is defined
+ptcl_opt
+{
+	n = 262144;#3145728;#786432;	# number of particles to release
+	begin = 0.0;				# release time
+}
+# ----------------------------------------------------------------------
+
+# particle tracking setup
+#  used only if INCLUDE_PARTICLES_TRACKING is defined
+ptcl_track_opt
+{
+	n = 128;		# number of particles to release
+	begin = 1600.0;		# release time
+
+	group_max_size = 256;			# max number of particles per group
+	max_memory = 10 * 1024 * 1024;		# max memory in bytes for keeping trajectories in memory	
+}
+# ----------------------------------------------------------------------
+
+poisson
+{
+#
+# Poisson equation solver setup
+#
+	retol = 0.0001; abstol = 0.001;		# relative and absolute tolerance
+	miniters = 1; maxiters = 500;		# minimum and maximum number of iterations
+
+	piters = 1;				# number of preconditioner (multigrid) iterations
+
+	multigrid 
+	{
+		ngrid = 7;			# number of grids in multigrid sequence (= [0] - auto definition)
+
+		down_iters = 2;			# number of smoother iterations on fine->coarse traverse
+		up_iters = 3;			# number of smoother iterations on coarse->fine traverse
+		direct_iters = 5;		# number of smoother iterations on coarsest grid
+
+		smooth_up_omega = 1.84;		# relaxation value on coarse->fine traverse
+		smooth_up_omega_fine = 1.64;	# relaxation value on reaching finest grid
+	}
+}
+# -----------------------------------------------------------------------------
+
+output 
+{
+	DIR = "output/";		# output directory
+
+	begin = 1800.0;			# start time for writing output 
+	dt = 200.0;			# time step for writing output
+
+	xmin = domain.x; xmax = domain.x + domain.length;	# -x output domain setup
+	ymin = domain.y; ymax = domain.y + domain.width;	# -y output domain setup
+	zmin = domain.z; zmax = domain.z + domain.height;	# -z output domain setup
+
+	regular_plt3d_cntrl = false;			# flag for regular output of 3D .plt fields
+	regular_bin3d_cntrl = false;			# flag for regular output of 3D .nsx fields
+	final_plt3d_cntrl = false;			# flag for final output of 3D .plt fields
+
+	nscreen = 400;					# onscreen output period in time steps
+
+	profiles 
+	{
+		DIR = "output-rapid/";			# rapid profile output directory
+
+		begin = 2000.0; end = 2100.0;		# start and end time for writing profiles
+		dt = 2.0;				# time step for writing profiles
+
+		plt_cntrl = true;			# flag for output of .plt profiles
+	}
+}
+# -----------------------------------------------------------------------------
+
+dump 
+{
+	DIR = "dump/";			# dump directory
+
+	begin = 400.0;			# start time for writing model dump
+	dt = 400.0;			# time step for writing model dump
+}
+# -----------------------------------------------------------------------------
+
+startup
+{
+	DIR = "init/";			# initial conditions directory
+}
+# -----------------------------------------------------------------------------
+
+stats
+{
+	begin = 1600.0;			# start time for statistics gathering
+	time_mod = 10;			# statistics gathering period in time steps
+
+	output {
+		DIR = output.DIR + "stat/";		# statistics output directory
+	}
+
+	dump {
+		DIR = dump.DIR + "stat/";		# statistics dump directory
+	}
+}
+# -----------------------------------------------------------------------------
+
diff --git a/grid-common2d.h b/grid-common2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..b996cc103a24cbbd5071f350c48fa58cbf04bc1a
--- /dev/null
+++ b/grid-common2d.h
@@ -0,0 +1,956 @@
+#pragma once
+
+#include <string.h>
+#include "nse-sys.h"
+#include "nse-alloc.h"
+
+#ifndef EXCLUDE_GPU_BRANCH
+#include "grid-common2d.cuh"
+#include "cuda-stx.cuh"
+#endif
+
+namespace nse
+{
+	// * null halo cells * //
+	// ------------------- //
+	template< memType mem = memCPU, typename T >
+	void null_ghost_halo(T* _RESTRICT x,	// zero all ghost-layer cells of the (nx x ny) array
+		const int nx, const int ny,
+		const int gcx, const int gcy);	// ghost layer widths in -x, -y
+
+	template< memType mem = memCPU, typename T >
+	void null_halo(T* _RESTRICT x,	// zero everything outside the [ib..ie] x [jb..je] box
+		const int nx, const int ny,
+		const int ib, const int ie, const int jb, const int je);
+
+	// * apply -x, -y periodicity * //
+	// ---------------------------- //
+	template< memType mem = memCPU, typename T >
+	void apply_periodic_x(T* _RESTRICT x, const int nx, const int ny,	// fill -x halo of width hx from periodic image
+		const int gcx, const int gcy, const int hx, const int hy);
+	template< memType mem = memCPU, typename T >
+	void apply_periodic_y(T* _RESTRICT x, const int nx, const int ny,	// fill -y halo of width hy from periodic image
+		const int gcx, const int gcy, const int hx, const int hy);
+
+	// * apply -x, -y periodicity - colored * //
+	// -------------------------------------- //
+	template< memType mem = memCPU, typename T >
+	void apply_periodic_x(T* _RESTRICT x, const int color,	// checkerboard variant: touch cells of given color only
+		const int nx, const int ny,
+		const int gcx, const int gcy, const int hx, const int hy);
+	template< memType mem = memCPU, typename T >
+	void apply_periodic_y(T* _RESTRICT x, const int color,
+		const int nx, const int ny,
+		const int gcx, const int gcy, const int hx, const int hy);
+
+	// * get 2d sub array //
+	// ------------------ //
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void get_sub_array(const T* _RESTRICT const in,	// gather box [ib..ie] x [jb..je] of in into dense out
+		const int nx, const int ny,
+		const int ib, const int ie, const int jb, const int je,
+		T* _RESTRICT out);
+
+	// * put 2d sub array //
+	// ------------------ //
+	template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+	void put_sub_array(T* _RESTRICT out,	// scatter dense in back into box [ib..ie] x [jb..je] of out
+		const int nx, const int ny,
+		const int ib, const int ie, const int jb, const int je,
+		const T* _RESTRICT const in);
+
+	// * get 2d sub array - colored //
+	// ---------------------------- //
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void get_sub_array(const T* _RESTRICT const in, const int color,	// gather only cells of given color
+		const int nx, const int ny,
+		const int ib, const int ie, const int jb, const int je,
+		T* _RESTRICT out);
+
+	// * put 2d sub array - colored //
+	// ---------------------------- //
+	template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+	void put_sub_array(T* _RESTRICT out, const int color,	// scatter only cells of given color
+		const int nx, const int ny,
+		const int ib, const int ie, const int jb, const int je,
+		const T* _RESTRICT const in);
+
+	// * copy 2d sub array //
+	// ------------------- //
+	template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+	void copy_sub_array(T* _RESTRICT out,	// copy sub-box of in (subnx x subny) to position (posx, posy) in out
+		const int nx, const int ny,
+		const int posx, const int posy,
+		const T* _RESTRICT const in,
+		const int subnx, const int subny,
+		const int ib, const int ie,
+		const int jb, const int je);
+
+
+	// * get number of colored elements //
+	// -------------------------------- //
+	int get_num_colored(const int color,	// count cells of given color inside the box
+		const int ib, const int ie, const int jb, const int je);
+}
+
+namespace nse	// OpenMP worker variants: designed to run inside an active parallel region (the dispatchers above wrap them in #pragma omp parallel)
+{
+	// * null halo cells * //
+	// ------------------- //
+	template< typename T >
+	void null_ghost_halo_omp(T* _RESTRICT x,
+		const int nx, const int ny,
+		const int gcx, const int gcy);
+
+	template< typename T >
+	void null_halo_omp(T* _RESTRICT x,
+		const int nx, const int ny,
+		const int ib, const int ie, const int jb, const int je);
+
+	// * apply -x, -y periodicity * //
+	// ---------------------------- //
+	template< typename T >
+	void apply_periodic_x_omp(T* _RESTRICT x, const int nx, const int ny,
+		const int gcx, const int gcy, const int hx, const int hy);
+	template< typename T >
+	void apply_periodic_y_omp(T* _RESTRICT x, const int nx, const int ny,
+		const int gcx, const int gcy, const int hx, const int hy);
+
+	// * apply -x, -y periodicity - colored * //
+	// -------------------------------------- //
+	template< typename T >
+	void apply_periodic_x_omp(T* _RESTRICT x, const int color,
+		const int nx, const int ny,
+		const int gcx, const int gcy, const int hx, const int hy);
+	template< typename T >
+	void apply_periodic_y_omp(T* _RESTRICT x, const int color,
+		const int nx, const int ny,
+		const int gcx, const int gcy, const int hx, const int hy);
+
+
+	// * get 2d sub array //
+	// ------------------ //
+	template< typename T >
+	void get_sub_array_omp(const T* _RESTRICT const in,
+		const int nx, const int ny,
+		const int ib, const int ie, const int jb, const int je,
+		T* _RESTRICT out);
+
+	// * put 2d sub array //
+	// ------------------ //
+	template< typename T >
+	void put_sub_array_omp(T* _RESTRICT out,
+		const int nx, const int ny,
+		const int ib, const int ie, const int jb, const int je,
+		const T* _RESTRICT const in);
+
+	// * get 2d sub array - colored //
+	// ---------------------------- //
+	template< typename T >
+	void get_sub_array_omp(const T* _RESTRICT const in, const int color,
+		const int nx, const int ny,
+		const int ib, const int ie, const int jb, const int je,
+		T* _RESTRICT out);
+
+	// * put 2d sub array - colored //
+	// ---------------------------- //
+	template< typename T >
+	void put_sub_array_omp(T* _RESTRICT out, const int color,
+		const int nx, const int ny,
+		const int ib, const int ie, const int jb, const int je,
+		const T* _RESTRICT const in);
+
+	// * copy 2d sub array //
+	// ------------------- //
+	template< typename T >
+	void copy_sub_array_omp(T* _RESTRICT out,
+		const int nx, const int ny,
+		const int posx, const int posy,
+		const T* _RESTRICT const in,
+		const int subnx, const int subny,
+		const int ib, const int ie,
+		const int jb, const int je);
+}
+
+
+// * implementation: null halo cells * //
+// ----------------------------------- //
+template< nse::memType mem, typename T >
+inline void nse::null_ghost_halo(
+	T* _RESTRICT x,
+	const int nx, const int ny,
+	const int gcx, const int gcy)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::null_ghost_halo(x, nx, ny, gcx, gcy);	// device-side implementation
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {	// already inside a parallel region: run worker directly
+			null_ghost_halo_omp(x, nx, ny, gcx, gcy);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				null_ghost_halo_omp(x, nx, ny, gcx, gcy);
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::null_ghost_halo_omp(
+	T* _RESTRICT x,
+	const int nx, const int ny,
+	const int gcx, const int gcy)
+{
+	int i, j, idx;	// x is row-major: cell (i,j) at x[i * ny + j]
+
+	// null column: west //
+#pragma omp for nowait
+	for (i = 0; i < gcx; i++) {
+		idx = i * ny;
+		for (j = 0; j < ny; j++)
+			x[idx + j] = (T)0;
+	}
+
+	// null column: east //
+#pragma omp for nowait
+	for (i = 0; i < gcx; i++) {
+		idx = (nx - gcx + i) * ny;
+		for (j = 0; j < ny; j++) {
+			x[idx + j] = (T)0;
+		}
+	}
+
+	// null rows: south //
+#pragma omp for nowait
+	for (i = gcx; i < nx - gcx; i++) {	// interior columns only: west/east loops already cover i < gcx and i >= nx - gcx
+		idx = i * ny;
+		for (j = 0; j < gcy; j++)
+			x[idx + j] = (T)0;
+	}
+
+	// null rows: north //
+#pragma omp for nowait
+	for (i = gcx; i < nx - gcx; i++) {
+		idx = i * ny;
+		for (j = ny - gcy; j < ny; j++)
+			x[idx + j] = (T)0;
+	}
+}
+
+template< nse::memType mem, typename T >
+inline void nse::null_halo(
+	T* _RESTRICT x,
+	const int nx, const int ny,
+	const int ib, const int ie, const int jb, const int je)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) nse_gpu::null_halo(x, nx, ny, ib, ie, jb, je);
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {	// reuse the caller's parallel region when one is active
+			null_halo_omp(x, nx, ny, ib, ie, jb, je);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				null_halo_omp(x, nx, ny, ib, ie, jb, je);
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::null_halo_omp(
+	T* _RESTRICT x,
+	const int nx, const int ny,
+	const int ib, const int ie, const int jb, const int je)
+{
+	int i, j, idx;	// zero every cell outside the inclusive box [ib..ie] x [jb..je]
+
+	// null column: west //
+#pragma omp for nowait
+	for (i = 0; i < ib; i++) {
+		idx = i * ny;
+		for (j = 0; j < ny; j++)
+			x[idx + j] = (T)0;
+	}
+
+	// null column: east //
+#pragma omp for nowait
+	for (i = ie + 1; i < nx; i++) {
+		idx = i * ny;
+		for (j = 0; j < ny; j++) {
+			x[idx + j] = (T)0;
+		}
+	}
+
+	// null rows: south //
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++) {	// box columns only: full columns outside [ib..ie] handled above
+		idx = i * ny;
+		for (j = 0; j < jb; j++)
+			x[idx + j] = (T)0;
+	}
+
+	// null rows: north //
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++) {
+		idx = i * ny;
+		for (j = je + 1; j < ny; j++)
+			x[idx + j] = (T)0;
+	}
+}
+
+// * apply -x, -y periodicity * //
+// ---------------------------- //
+template< nse::memType mem, typename T >
+inline void nse::apply_periodic_x(T* _RESTRICT x, const int nx, const int ny,
+	const int gcx, const int gcy,
+	const int hx, const int hy)	// hx, hy: halo widths to fill (hx <= gcx presumably -- confirm at call sites)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::apply_periodic_x(x, nx, ny, gcx, gcy, hx, hy);
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {
+			apply_periodic_x_omp(x, nx, ny, gcx, gcy, hx, hy);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				apply_periodic_x_omp(x, nx, ny, gcx, gcy, hx, hy);
+			}
+		}
+	}
+}
+
+template< nse::memType mem, typename T >
+inline void nse::apply_periodic_y(T* _RESTRICT x, const int nx, const int ny,
+	const int gcx, const int gcy,
+	const int hx, const int hy)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::apply_periodic_y(x, nx, ny, gcx, gcy, hx, hy);
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {
+			apply_periodic_y_omp(x, nx, ny, gcx, gcy, hx, hy);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				apply_periodic_y_omp(x, nx, ny, gcx, gcy, hx, hy);
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::apply_periodic_x_omp(T* _RESTRICT x, const int nx, const int ny,
+	const int gcx, const int gcy,
+	const int hx, const int hy)
+{
+	const int stride = (nx - 2 * gcx) * ny;	// offset between a ghost cell and its periodic image in -x
+	const int shx = hx * ny;
+	const int jb = gcy - hy, je = ny - gcy + hy - 1;	// -y range includes hy halo rows on each side
+	const int block_size = (je - jb + 1) * sizeof(T);
+
+	int i, j, idx;
+
+	if (block_size < MIN_MEMCPY_BLOCK) {     // magic number of inefficient memcpy()
+#pragma omp for nowait
+		for (i = gcx - hx; i < gcx; i++) {	// west periodicity //
+			idx = i * ny;
+			for (j = jb; j <= je; j++)
+				x[idx + j] = x[idx + stride + j];	// west ghost <- east interior image
+		}
+
+
+#pragma omp for nowait
+		for (i = gcx - hx; i < gcx; i++) {	// east periodicity //
+			idx = i * ny + stride + shx;
+			for (j = jb; j <= je; j++)
+				x[idx + j] = x[idx - stride + j];	// east ghost <- west interior image
+		}
+	}
+	else
+	{
+
+#pragma omp for nowait
+		for (i = gcx - hx; i < gcx; i++) {	// west periodicity //
+			idx = i * ny + jb;
+			memcpy(&x[idx], &x[idx + stride], block_size);	// same copy as above, one memcpy per column
+		}
+
+#pragma omp for nowait
+		for (i = gcx - hx; i < gcx; i++) {	// east periodicity //
+			idx = i * ny + jb + shx;
+			memcpy(&x[idx + stride], &x[idx], block_size);
+		}
+	}
+}
+template< typename T >
+void nse::apply_periodic_y_omp(T* _RESTRICT x, const int nx, const int ny,
+	const int gcx, const int gcy,
+	const int hx, const int hy)
+{
+	const int stride = (ny - 2 * gcy);	// offset between a ghost cell and its periodic image in -y
+	const int ib = gcx - hx, ie = nx - gcx + hx - 1;	// -x range includes hx halo columns on each side
+
+	int i, j, idx;
+
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++) {	// south periodicity //
+		idx = i * ny;
+		for (j = gcy - hy; j < gcy; j++)
+			x[idx + j] = x[idx + stride + j];	// south ghost <- north interior image
+	}
+
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++) {	// north periodicity //
+		idx = i * ny;
+		for (j = ny - gcy; j < ny - gcy + hy; j++)
+			x[idx + j] = x[idx - stride + j];	// north ghost <- south interior image
+	}
+}
+
+// * apply -x, -y periodicity - colored * //
+// -------------------------------------- //
+template< nse::memType mem, typename T >
+inline void nse::apply_periodic_x(T* _RESTRICT x, const int color,	// colored variant: updates only checkerboard cells of given color
+	const int nx, const int ny,
+	const int gcx, const int gcy,
+	const int hx, const int hy)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::apply_periodic_x(x, color, nx, ny, gcx, gcy, hx, hy);
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {
+			apply_periodic_x_omp(x, color, nx, ny, gcx, gcy, hx, hy);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				apply_periodic_x_omp(x, color, nx, ny, gcx, gcy, hx, hy);
+			}
+		}
+	}
+}
+template< nse::memType mem, typename T >
+inline void nse::apply_periodic_y(T* _RESTRICT x, const int color,
+	const int nx, const int ny,
+	const int gcx, const int gcy,
+	const int hx, const int hy)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::apply_periodic_y(x, color, nx, ny, gcx, gcy, hx, hy);
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {
+			apply_periodic_y_omp(x, color, nx, ny, gcx, gcy, hx, hy);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				apply_periodic_y_omp(x, color, nx, ny, gcx, gcy, hx, hy);
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::apply_periodic_x_omp(T* _RESTRICT x, const int color,
+	const int nx, const int ny,
+	const int gcx, const int gcy,
+	const int hx, const int hy)
+{
+	const int stride = (nx - 2 * gcx) * ny;	// offset to the periodic -x image
+	const int ish = hx + nx - 2 * gcx;	// -x index shift of the east ghost column relative to i
+	const int shx = hx * ny;
+	const int jb = gcy - hy, je = ny - gcy + hy - 1;
+
+	int i, j, idx, csh;
+
+#pragma omp for nowait
+	for (i = gcx - hx; i < gcx; i++) {	// west periodicity //
+		csh = (i + jb + color) & 1;	// parity shift: first -y index of requested color in this column
+		idx = i * ny;
+		for (j = jb + csh; j <= je; j += 2)	// stride-2 walk over same-color cells
+			x[idx + j] = x[idx + stride + j];
+	}
+
+#pragma omp for nowait
+	for (i = gcx - hx; i < gcx; i++) {	// east periodicity //
+		csh = (i + ish + jb + color) & 1;	// parity taken at the destination column index (i + ish)
+		idx = i * ny + stride + shx;
+		for (j = jb + csh; j <= je; j += 2)
+			x[idx + j] = x[idx - stride + j];
+	}
+}
+template< typename T >
+void nse::apply_periodic_y_omp(T* _RESTRICT x, const int color,
+	const int nx, const int ny,
+	const int gcx, const int gcy,
+	const int hx, const int hy)
+{
+	const int stride = (ny - 2 * gcy);	// offset to the periodic -y image
+	const int ib = gcx - hx, ie = nx - gcx + hx - 1;
+
+	int i, j, idx, csh;
+
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++) {	// south periodicity //
+		idx = i * ny;
+		csh = (i + gcy - hy + color) & 1;	// parity shift for the first south ghost row
+		for (j = gcy - hy + csh; j < gcy; j += 2)
+			x[idx + j] = x[idx + stride + j];
+
+	}
+
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++) {	// north periodicity //
+		idx = i * ny;
+		csh = (i + ny - gcy + color) & 1;	// parity shift for the first north ghost row
+		for (j = ny - gcy + csh; j < ny - gcy + hy; j += 2)
+			x[idx + j] = x[idx - stride + j];
+
+	}
+}
+
+// * implementation: get 2d sub array * //
+// ------------------------------------ //
+template< nse::memType memIN, nse::memType memOUT, typename T >
+inline void nse::get_sub_array(const T* _RESTRICT const in,
+	const int nx, const int ny,
+	const int ib, const int ie, const int jb, const int je,
+	T* _RESTRICT out)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (((memIN == memCPU) && (memOUT == memGPU)) ||
+		((memIN == memGPU) && (memOUT == memCPU)))
+	{	// cross-memory copy: gather on the input side into a staging buffer, then transfer
+		const int c_size = (ie - ib + 1) * (je - jb + 1);
+
+		T* buf;
+		int buf_id;
+		if (memIN == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
+		if (memIN == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
+
+		get_sub_array<memIN, memIN>(in, nx, ny, ib, ie, jb, je, buf);	// same-memory gather
+		mcopy<memOUT, memIN>(out, buf, c_size);	// host <-> device transfer
+
+		if (memIN == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
+		if (memIN == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
+	}
+	else
+		if ((memIN == memGPU) && (memOUT == memGPU)) {
+			nse_gpu::get_sub_array(in, nx, ny, ib, ie, jb, je, out);
+		}
+		else
+#endif
+		{	// memCPU -> memCPU //
+			if (omp_in_parallel()) {
+				get_sub_array_omp(in, nx, ny, ib, ie, jb, je, out);
+			}
+			else
+			{
+#pragma omp parallel shared( out ) 
+				{
+					get_sub_array_omp(in, nx, ny, ib, ie, jb, je, out);
+				}
+			}
+		}
+}
+
+template< typename T >
+void nse::get_sub_array_omp(const T* _RESTRICT const in,
+	const int nx, const int ny,
+	const int ib, const int ie, const int jb, const int je,
+	T* _RESTRICT out)
+{
+	const int cy = je - jb + 1;	// packed column length in out
+	const int block_size = cy * sizeof(T);
+
+	int i, j, idx, odx;
+
+	if (block_size < MIN_MEMCPY_BLOCK) {     // magic number of inefficient memcpy()
+
+#pragma omp for nowait
+		for (i = ib; i <= ie; i++)
+		{
+			idx = i * ny;
+			odx = (i - ib) * cy - jb;	// -jb so that out[odx + j] is indexed by the same j as in
+			for (j = jb; j <= je; j++)
+				out[odx + j] = in[idx + j];
+		}
+	}
+	else
+	{
+#pragma omp for nowait
+		for (i = ib; i <= ie; i++) {
+			idx = i * ny + jb;
+			odx = (i - ib) * cy;
+			memcpy(&out[odx], &in[idx], block_size);	// one contiguous column per memcpy
+		}
+	}
+}
+
+// * implementation: put 2d sub array * //
+// ------------------------------------ //
+template< nse::memType memOUT, nse::memType memIN, typename T >
+inline void nse::put_sub_array(T* _RESTRICT out,
+	const int nx, const int ny,
+	const int ib, const int ie, const int jb, const int je,
+	const T* _RESTRICT const in)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (((memIN == memCPU) && (memOUT == memGPU)) ||
+		((memIN == memGPU) && (memOUT == memCPU)))
+	{	// cross-memory copy: transfer into a staging buffer on the output side, then scatter
+		const int c_size = (ie - ib + 1) * (je - jb + 1);
+
+		T* buf;
+		int buf_id;
+		if (memIN == memCPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
+		if (memIN == memGPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
+
+		mcopy<memOUT, memIN>(buf, in, c_size);	// host <-> device transfer
+		put_sub_array<memOUT, memOUT>(out, nx, ny, ib, ie, jb, je, buf);	// same-memory scatter
+
+		if (memIN == memCPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
+		if (memIN == memGPU) nse_gpu::cudaStx::free_host_buf(buf_id);
+	}
+	else
+		if ((memIN == memGPU) && (memOUT == memGPU)) {
+			nse_gpu::put_sub_array(out, nx, ny, ib, ie, jb, je, in);
+		}
+		else
+#endif
+		{	// memCPU <- memCPU //
+			if (omp_in_parallel()) {
+				put_sub_array_omp(out, nx, ny, ib, ie, jb, je, in);
+			}
+			else
+			{
+#pragma omp parallel shared( out ) 
+				{
+					put_sub_array_omp(out, nx, ny, ib, ie, jb, je, in);
+				}
+			}
+		}
+}
+
+template< typename T >
+void nse::put_sub_array_omp(T* _RESTRICT out,
+	const int nx, const int ny,
+	const int ib, const int ie, const int jb, const int je,
+	const T* _RESTRICT const in)
+{
+	const int cy = je - jb + 1;	// packed column length in in
+	const int block_size = cy * sizeof(T);
+
+	int i, j, idx, odx;
+
+	if (block_size < MIN_MEMCPY_BLOCK) {     // magic number of inefficient memcpy()
+
+#pragma omp for nowait
+		for (i = ib; i <= ie; i++)
+		{
+			idx = (i - ib) * cy - jb;	// mirror of get_sub_array_omp: roles of idx/odx swapped
+			odx = i * ny;
+			for (j = jb; j <= je; j++)
+				out[odx + j] = in[idx + j];
+		}
+	}
+	else
+	{
+#pragma omp for nowait
+		for (i = ib; i <= ie; i++) {
+			idx = (i - ib) * cy;
+			odx = i * ny + jb;
+			memcpy(&out[odx], &in[idx], block_size);
+		}
+	}
+}
+
+// * implementation: get 2d sub array - colored * //
+// ---------------------------------------------- //
+template< nse::memType memIN, nse::memType memOUT, typename T >
+void nse::get_sub_array(const T* _RESTRICT const in, const int color,
+	const int nx, const int ny,
+	const int ib, const int ie, const int jb, const int je,
+	T* _RESTRICT out)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (((memIN == memCPU) && (memOUT == memGPU)) ||
+		((memIN == memGPU) && (memOUT == memCPU)))
+	{	// cross-memory copy via staging buffer, sized by the colored-cell count
+		const int c_size = get_num_colored(color, ib, ie, jb, je);
+
+		T* buf;
+		int buf_id;
+		if (memIN == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
+		if (memIN == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
+
+		get_sub_array<memIN, memIN>(in, color, nx, ny, ib, ie, jb, je, buf);
+		mcopy<memOUT, memIN>(out, buf, c_size);
+
+		if (memIN == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
+		if (memIN == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
+	}
+	else
+		if ((memIN == memGPU) && (memOUT == memGPU)) {
+			nse_gpu::get_sub_array(in, color, nx, ny, ib, ie, jb, je, out);
+		}
+		else
+#endif
+		{	// memCPU -> memCPU //
+#ifdef USE_DEPRECATED_COLOR_CP
+			int i, j;	// serial packing: out order is column-by-column, colored cells only
+			int idx, odx = 0;
+			int sh = (ib + jb + color) & 1;
+
+			for (i = ib; i <= ie; i++) {
+
+				idx = i * ny + jb + sh;
+				for (j = jb + sh; j <= je; j += 2, odx++, idx += 2) {
+					out[odx] = in[idx];
+				}
+				sh = !sh;       // change shift for next column //
+			}
+#else
+
+			if (omp_in_parallel()) {
+				get_sub_array_omp(in, color, nx, ny, ib, ie, jb, je, out);
+			}
+			else
+			{
+#pragma omp parallel shared( out ) 
+				{
+					get_sub_array_omp(in, color, nx, ny, ib, ie, jb, je, out);
+				}
+			}
+#endif
+		}
+}
+
+template< typename T >
+void nse::get_sub_array_omp(const T* _RESTRICT const in, const int color,
+	const int nx, const int ny,
+	const int ib, const int ie, const int jb, const int je,
+	T* _RESTRICT out)
+{
+	int i, j, idx, odx, sh;
+
+	// colored column size //
+	const int cy = (je - jb + 1);
+	const int cyh = get_num_colored(color, // half-colored-column size
+		ib, ib, jb, je);
+
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++) {
+		sh = (i + jb + color) & 1;	// parity shift to the first cell of this color in column i
+		idx = i * ny;
+		odx = ((i - ib) >> 1) * cy + ((i - ib) & 1) * cyh;	// column pairs pack to cy entries: cyh for even column, cy - cyh for odd
+		for (j = jb + sh; j <= je; j += 2, odx++) {
+			out[odx] = in[idx + j];
+		}
+	}
+}
+
+// * implementation: put 2d sub array - colored * //
+// ---------------------------------------------- //
+template< nse::memType memOUT, nse::memType memIN, typename T >
+void nse::put_sub_array(T* _RESTRICT out, const int color,
+	const int nx, const int ny,
+	const int ib, const int ie, const int jb, const int je,
+	const T* _RESTRICT const in)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (((memIN == memCPU) && (memOUT == memGPU)) ||
+		((memIN == memGPU) && (memOUT == memCPU)))
+	{	// cross-memory copy via staging buffer, sized by the colored-cell count
+		const int c_size = get_num_colored(color, ib, ie, jb, je);
+
+		T* buf;
+		int buf_id;
+		if (memIN == memCPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
+		if (memIN == memGPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
+
+		mcopy<memOUT, memIN>(buf, in, c_size);
+		put_sub_array<memOUT, memOUT>(out, color, nx, ny, ib, ie, jb, je, buf);
+
+		if (memIN == memCPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
+		if (memIN == memGPU) nse_gpu::cudaStx::free_host_buf(buf_id);
+	}
+	else
+		if ((memIN == memGPU) && (memOUT == memGPU)) {
+			nse_gpu::put_sub_array(out, color, nx, ny, ib, ie, jb, je, in);
+		}
+		else
+#endif
+		{	// memCPU <- memCPU //
+#ifdef USE_DEPRECATED_COLOR_CP
+			int i, j;	// serial unpacking: inverse of the deprecated colored gather
+			int idx, odx = 0;
+			int sh = (ib + jb + color) & 1;
+
+			for (i = ib; i <= ie; i++) {
+
+				idx = i * ny + jb + sh;
+				for (j = jb + sh; j <= je; j += 2, odx++, idx += 2) {
+					out[idx] = in[odx];
+				}
+				sh = !sh;       // change shift for next column //
+			}
+#else
+
+			if (omp_in_parallel()) {
+				put_sub_array_omp(out, color, nx, ny, ib, ie, jb, je, in);
+			}
+			else
+			{
+#pragma omp parallel shared( out ) 
+				{
+					put_sub_array_omp(out, color, nx, ny, ib, ie, jb, je, in);
+				}
+			}
+#endif
+		}
+}
+
+template< typename T >
+void nse::put_sub_array_omp(T* _RESTRICT out, const int color,
+	const int nx, const int ny,
+	const int ib, const int ie, const int jb, const int je,
+	const T* _RESTRICT const in)
+{
+	int i, j, idx, odx, sh;
+
+	// colored column size //
+	const int cy = (je - jb + 1);
+	const int cyh = get_num_colored(color, // half-colored-column size
+		ib, ib, jb, je);
+
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++) {
+		sh = (i + jb + color) & 1;	// same packing layout as get_sub_array_omp (colored), read side
+		idx = ((i - ib) >> 1) * cy + ((i - ib) & 1) * cyh;
+		odx = i * ny;
+		for (j = jb + sh; j <= je; j += 2, idx++) {
+			out[odx + j] = in[idx];
+		}
+	}
+}
+
+// * implementation: copy 2d sub array * //
+// ------------------------------------ //
+template< nse::memType memOUT, nse::memType memIN, typename T >
+inline void nse::copy_sub_array(T* _RESTRICT out,
+	const int nx, const int ny,
+	const int posx, const int posy,
+	const T* _RESTRICT const in,
+	const int subnx, const int subny,
+	const int ib, const int ie,
+	const int jb, const int je)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (((memIN == memCPU) && (memOUT == memGPU)) ||
+		((memIN == memGPU) && (memOUT == memCPU)))
+	{	// cross-memory copy: gather across memories, then scatter on the output side
+		const int c_size = (ie - ib + 1) * (je - jb + 1);
+
+		T* buf;
+		int buf_id;
+		if (memOUT == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
+		if (memOUT == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
+
+		get_sub_array<memIN, memOUT>(in, subnx, subny, ib, ie, jb, je, buf);	// gather + transfer in one call
+		put_sub_array<memOUT, memOUT>(out, nx, ny,
+			posx, posx + (ie - ib), posy, posy + (je - jb), buf);	// destination box anchored at (posx, posy)
+
+		if (memOUT == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
+		if (memOUT == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
+	}
+	else
+		if ((memIN == memGPU) && (memOUT == memGPU)) {
+			nse_gpu::copy_sub_array(out, nx, ny, posx, posy,
+				in, subnx, subny, ib, ie, jb, je);
+		}
+		else
+#endif
+		{	// memCPU <- memCPU //
+			if (omp_in_parallel()) {
+				copy_sub_array_omp(out, nx, ny, posx, posy,
+					in, subnx, subny, ib, ie, jb, je);
+			}
+			else
+			{
+#pragma omp parallel shared( out ) 
+				{
+					copy_sub_array_omp(out, nx, ny, posx, posy,
+						in, subnx, subny, ib, ie, jb, je);
+				}
+			}
+		}
+}
+
+template< typename T >
+void nse::copy_sub_array_omp(T* _RESTRICT out,
+	const int nx, const int ny,
+	const int posx, const int posy,
+	const T* _RESTRICT const in,
+	const int subnx, const int subny,
+	const int ib, const int ie,
+	const int jb, const int je)
+{
+	const int block_size = (je - jb + 1) * sizeof(T);	// NOTE(review): no small-block loop fallback here, unlike get/put_sub_array_omp
+	const int shx = posx - ib;	// column shift mapping source index i to destination column
+
+	int i, idx, odx;
+
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++) {
+		odx = (i + shx) * ny + posy;
+		idx = i * subny + jb;
+
+		memcpy(&out[odx], &in[idx], block_size);
+	}
+}
+
+// * implementation: get number of colored elements //
+// ------------------------------------------------ //
+inline int nse::get_num_colored(const int color,
+	const int ib, const int ie, const int jb, const int je)
+{
+	const int length = ie - ib + 1;	// box extent in -x
+	const int width = je - jb + 1;	// box extent in -y
+
+	if ((length & 1) && (width & 1)) {	// odd x odd box: colors split unevenly
+
+		const int sh = !((ib + jb + color) & 1);	// 1 when the (ib, jb) corner cell has the requested color
+		return ((length - 1) >> 1) * width + ((width + sh) >> 1);
+	}
+	else
+		return ((length * width) >> 1);	// even total cell count: exactly half have each color
+}
diff --git a/grid-common3d.h b/grid-common3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..d809bef5d52910c00beae6bbd6ddebe3020e45ea
--- /dev/null
+++ b/grid-common3d.h
@@ -0,0 +1,1521 @@
+#pragma once
+
+#include <string.h>
+#include "nse-sys.h"
+#include "nse-alloc.h"
+
+#ifndef EXCLUDE_GPU_BRANCH
+#include "grid-common3d.cuh"
+#include "cuda-stx.cuh"
+#endif
+
+namespace nse
+{
+	// public dispatch interface for 3d grids stored as flat row-major
+	// arrays (index = i * ny * nz + j * nz + k); the memType template
+	// parameters select the CPU or GPU execution path (default: memCPU)
+
+	// * null halo cells * //
+	// ------------------- //
+	template< memType mem = memCPU, typename T >
+	void null_ghost_halo(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz);
+
+	template< memType mem = memCPU, typename T >
+	void null_halo(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke);
+
+	// * apply -x, -y, -z periodicity * //
+	// -------------------------------- //
+	template< memType mem = memCPU, typename T >
+	void apply_periodic_x(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+		const int hx, const int hy, const int hz);
+	template< memType mem = memCPU, typename T >
+	void apply_periodic_y(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+		const int hx, const int hy, const int hz);
+	template< memType mem = memCPU, typename T >
+	void apply_periodic_z(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+		const int hx, const int hy, const int hz);
+
+	// * apply -x, -y, -z periodicity - colored * //
+	// ------------------------------------------ //
+	// "colored" variants touch only cells of one checkerboard color
+	template< memType mem = memCPU, typename T >
+	void apply_periodic_x(T* _RESTRICT x, const int color,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+		const int hx, const int hy, const int hz);
+	template< memType mem = memCPU, typename T >
+	void apply_periodic_y(T* _RESTRICT x, const int color,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+		const int hx, const int hy, const int hz);
+	template< memType mem = memCPU, typename T >
+	void apply_periodic_z(T* _RESTRICT x, const int color,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+		const int hx, const int hy, const int hz);
+
+	// * get 3d sub array //
+	// ------------------ //
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void get_sub_array(const T* _RESTRICT const in,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+		T* _RESTRICT out);
+
+	// * put 3d sub array //
+	// ------------------ //
+	template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+	void put_sub_array(T* _RESTRICT out,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+		const T* _RESTRICT const in);
+
+	// * get 3d sub array - colored //
+	// ---------------------------- //
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void get_sub_array(const T* _RESTRICT const in, const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+		T* _RESTRICT out);
+
+	// * put 3d sub array - colored //
+	// ---------------------------- //
+	template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+	void put_sub_array(T* _RESTRICT out, const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+		const T* _RESTRICT const in);
+
+	// * copy 3d sub array //
+	// ------------------- //
+	template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+	void copy_sub_array(T* _RESTRICT out,
+		const int nx, const int ny, const int nz,
+		const int posx, const int posy, const int posz,
+		const T* _RESTRICT const in,
+		const int subnx, const int subny, const int subnz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke);
+
+
+	// * get number of colored elements //
+	// -------------------------------- //
+	int get_num_colored(const int color,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke);
+}
+
+
+namespace nse
+{
+	// internal OpenMP worker kernels: each uses a bare '#pragma omp for'
+	// and must be called from inside an existing parallel region
+	// (the public dispatchers above open one when needed)
+
+	// * null halo cells * //
+	// ------------------- //
+	template< typename T >
+	void null_ghost_halo_omp(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz);
+
+	template< typename T >
+	void null_halo_omp(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke);
+
+	// * apply -x, -y, -z periodicity * //
+	// -------------------------------- //
+	template< typename T >
+	void apply_periodic_x_omp(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+		const int hx, const int hy, const int hz);
+	template< typename T >
+	void apply_periodic_y_omp(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+		const int hx, const int hy, const int hz);
+	template< typename T >
+	void apply_periodic_z_omp(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+		const int hx, const int hy, const int hz);
+
+	// * apply -x, -y, -z periodicity - colored * //
+	// ------------------------------------------ //
+	template< typename T >
+	void apply_periodic_x_omp(T* _RESTRICT x, const int color,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+		const int hx, const int hy, const int hz);
+	template< typename T >
+	void apply_periodic_y_omp(T* _RESTRICT x, const int color,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+		const int hx, const int hy, const int hz);
+	template< typename T >
+	void apply_periodic_z_omp(T* _RESTRICT x, const int color,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+		const int hx, const int hy, const int hz);
+
+	// * get 3d sub array //
+	// ------------------ //
+	template< typename T >
+	void get_sub_array_omp(const T* _RESTRICT const in,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+		T* _RESTRICT out);
+
+	// * put 3d sub array //
+	// ------------------ //
+	template< typename T >
+	void put_sub_array_omp(T* _RESTRICT out,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+		const T* _RESTRICT const in);
+
+	// * get 3d sub array - colored //
+	// ---------------------------- //
+	template< typename T >
+	void get_sub_array_omp(const T* _RESTRICT const in, const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+		T* _RESTRICT out);
+
+	// * put 3d sub array - colored //
+	// ---------------------------- //
+	template< typename T >
+	void put_sub_array_omp(T* _RESTRICT out, const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+		const T* _RESTRICT const in);
+
+	// * copy 3d sub array //
+	// ------------------- //
+	template< typename T >
+	void copy_sub_array_omp(T* _RESTRICT out,
+		const int nx, const int ny, const int nz,
+		const int posx, const int posy, const int posz,
+		const T* _RESTRICT const in,
+		const int subnx, const int subny, const int subnz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke);
+}
+
+
+
+// * implementation: null halo cells * //
+// ----------------------------------- //
+// zeroes all ghost cells (the gcx/gcy/gcz-wide borders) of x;
+// dispatches to the GPU kernel or to the OpenMP worker, opening a
+// parallel region only when not already inside one
+template< nse::memType mem, typename T >
+inline void nse::null_ghost_halo(
+	T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::null_ghost_halo(x, nx, ny, nz, gcx, gcy, gcz);
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {
+			null_ghost_halo_omp(x, nx, ny, nz, gcx, gcy, gcz);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				null_ghost_halo_omp(x, nx, ny, nz, gcx, gcy, gcz);
+			}
+		}
+	}
+}
+
+// OpenMP worker: zeroes the six ghost-cell faces; the -x faces span
+// full (ny, nz) planes, the -y faces exclude the -x ghosts and the
+// -z faces exclude both, so each ghost cell is written exactly once
+template< typename T >
+void nse::null_ghost_halo_omp(
+	T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz)
+{
+	const int nyz = ny * nz;
+	int i, j, k, idx;
+
+	// -x null halo: west
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = 0; i < gcx; i++) {
+		for (j = 0; j < ny; j++) {
+			idx = i * nyz + j * nz;
+			for (k = 0; k < nz; k++)
+				x[idx + k] = (T)0;
+		}
+	}
+
+	// -x null halo: east
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = 0; i < gcx; i++) {
+		for (j = 0; j < ny; j++) {
+			idx = (nx - gcx + i) * nyz + j * nz;
+			for (k = 0; k < nz; k++)
+				x[idx + k] = (T)0;
+		}
+	}
+
+	// -y null halo: south //
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = gcx; i < nx - gcx; i++) {
+		for (j = 0; j < gcy; j++) {
+			idx = i * nyz + j * nz;
+			for (k = 0; k < nz; k++)
+				x[idx + k] = (T)0;
+		}
+	}
+
+	// -y null halo: north //
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = gcx; i < nx - gcx; i++) {
+		for (j = ny - gcy; j < ny; j++) {
+			idx = i * nyz + j * nz;
+			for (k = 0; k < nz; k++)
+				x[idx + k] = (T)0;
+		}
+
+	}
+
+	// -z null halo: bottom //
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = gcx; i < nx - gcx; i++) {
+		for (j = gcy; j < ny - gcy; j++) {
+			idx = i * nyz + j * nz;
+			for (k = 0; k < gcz; k++)
+				x[idx + k] = (T)0;
+		}
+	}
+
+	// -z null halo: top //
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = gcx; i < nx - gcx; i++) {
+		for (j = gcy; j < ny - gcy; j++) {
+			idx = i * nyz + j * nz;
+			for (k = nz - gcz; k < nz; k++)
+				x[idx + k] = (T)0;
+		}
+	}
+}
+
+// * implementation: null halo cells * //
+// ----------------------------------- //
+// zeroes every cell OUTSIDE the inclusive box [ib..ie] x [jb..je] x [kb..ke];
+// dispatches to the GPU kernel or to the OpenMP worker, opening a
+// parallel region only when not already inside one
+template< nse::memType mem, typename T >
+inline void nse::null_halo(
+	T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::null_halo(x, nx, ny, nz, ib, ie, jb, je, kb, ke);
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {
+			null_halo_omp(x, nx, ny, nz,
+				ib, ie, jb, je, kb, ke);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				null_halo_omp(x, nx, ny, nz,
+					ib, ie, jb, je, kb, ke);
+			}
+		}
+	}
+}
+
+// OpenMP worker: zeroes the six halo faces around the interior box
+// [ib..ie] x [jb..je] x [kb..ke]; the -x faces span full (ny, nz)
+// planes, the -y faces are restricted to i in [ib..ie] and the -z
+// faces to (i, j) in the box, so each halo cell is written exactly once
+template< typename T >
+void nse::null_halo_omp(
+	T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke)
+{
+	const int nyz = ny * nz;
+	int i, j, k, idx;
+
+	// -x null halo: west
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = 0; i < ib; i++) {
+		for (j = 0; j < ny; j++) {
+			idx = i * nyz + j * nz;
+			for (k = 0; k < nz; k++)
+				x[idx + k] = (T)0;
+		}
+	}
+
+	// -x null halo: east
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = ie + 1; i < nx; i++) {
+		for (j = 0; j < ny; j++) {
+			idx = i * nyz + j * nz;
+			for (k = 0; k < nz; k++)
+				x[idx + k] = (T)0;
+		}
+	}
+
+	// -y null halo: south //
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = ib; i <= ie; i++) {
+		for (j = 0; j < jb; j++) {
+			idx = i * nyz + j * nz;
+			for (k = 0; k < nz; k++)
+				x[idx + k] = (T)0;
+		}
+	}
+
+	// -y null halo: north //
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = ib; i <= ie; i++) {
+		for (j = je + 1; j < ny; j++) {
+			idx = i * nyz + j * nz;
+			for (k = 0; k < nz; k++)
+				x[idx + k] = (T)0;
+		}
+	}
+
+	// -z null halo: bottom //
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = ib; i <= ie; i++) {
+		for (j = jb; j <= je; j++) {
+			idx = i * nyz + j * nz;
+			for (k = 0; k < kb; k++)
+				x[idx + k] = (T)0;
+		}
+	}
+
+	// -z null halo: top //
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = ib; i <= ie; i++) {
+		for (j = jb; j <= je; j++) {
+			idx = i * nyz + j * nz;
+			for (k = ke + 1; k < nz; k++)
+				x[idx + k] = (T)0;
+		}
+	}
+}
+
+// * apply -x, -y, -z periodicity * //
+// -------------------------------- //
+// fills hx ghost layers in -x from the opposite interior side
+// (hy/hz widen the copied -y/-z extent into the ghost region);
+// dispatch: GPU kernel, or OpenMP worker with a parallel region
+// opened only when not already inside one
+template< nse::memType mem, typename T >
+inline void nse::apply_periodic_x(T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::apply_periodic_x(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz);
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {
+			apply_periodic_x_omp(x, nx, ny, nz,
+				gcx, gcy, gcz, hx, hy, hz);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				apply_periodic_x_omp(x, nx, ny, nz,
+					gcx, gcy, gcz, hx, hy, hz);
+			}
+		}
+	}
+}
+
+// fills hy ghost layers in -y from the opposite interior side
+// (hx/hz widen the copied -x/-z extent); dispatch as in apply_periodic_x
+template< nse::memType mem, typename T >
+inline void nse::apply_periodic_y(T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::apply_periodic_y(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz);
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {
+			apply_periodic_y_omp(x, nx, ny, nz,
+				gcx, gcy, gcz, hx, hy, hz);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				apply_periodic_y_omp(x, nx, ny, nz,
+					gcx, gcy, gcz, hx, hy, hz);
+			}
+		}
+	}
+}
+
+// fills hz ghost layers in -z from the opposite interior side
+// (hx/hy widen the copied -x/-y extent); dispatch as in apply_periodic_x
+template< nse::memType mem, typename T >
+inline void nse::apply_periodic_z(T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::apply_periodic_z(x, nx, ny, nz, gcx, gcy, gcz, hx, hy, hz);
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {
+			apply_periodic_z_omp(x, nx, ny, nz,
+				gcx, gcy, gcz, hx, hy, hz);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				apply_periodic_z_omp(x, nx, ny, nz,
+					gcx, gcy, gcz, hx, hy, hz);
+			}
+		}
+	}
+}
+
+
+// OpenMP worker for -x periodicity: west ghost layers [gcx-hx..gcx-1]
+// are copied from the east interior (offset +stride), east ghost layers
+// from the west interior; row segments along -z are moved either
+// element-wise or via memcpy depending on segment size
+template< typename T >
+void nse::apply_periodic_x_omp(T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz)
+{
+	const int nyz = ny * nz;
+	const int stride = (nx - 2 * gcx) * nyz;	// offset of the periodic image in -x
+	const int shx = hx * nyz;
+	const int jb = gcy - hy, je = ny - gcy + hy - 1;
+	const int kb = gcz - hz, ke = nz - gcz + hz - 1;
+	const int block_size = (ke - kb + 1) * sizeof(T);
+
+	int i, j, k, idx;
+
+	if (block_size < MIN_MEMCPY_BLOCK) {     // short segments: explicit loop avoids memcpy() call overhead
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+		for (i = gcx - hx; i < gcx; i++) {	// west periodicity //
+			for (j = jb; j <= je; j++) {
+				idx = i * nyz + j * nz;
+				for (k = kb; k <= ke; k++)
+					x[idx + k] = x[idx + stride + k];
+			}
+		}
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+		for (i = gcx - hx; i < gcx; i++) {	// east periodicity //
+			for (j = jb; j <= je; j++) {
+				idx = i * nyz + j * nz + stride + shx;
+				for (k = kb; k <= ke; k++)
+					x[idx + k] = x[idx - stride + k];
+			}
+		}
+	}
+	else
+	{
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+		for (i = gcx - hx; i < gcx; i++) {	// west periodicity //
+			for (j = jb; j <= je; j++) {
+				idx = i * nyz + j * nz + kb;
+				memcpy(&x[idx], &x[idx + stride], block_size);
+			}
+		}
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+		for (i = gcx - hx; i < gcx; i++) {	// east periodicity //
+			for (j = jb; j <= je; j++) {
+				idx = i * nyz + j * nz + kb + shx;
+				memcpy(&x[idx + stride], &x[idx], block_size);
+			}
+		}
+	}
+}
+// OpenMP worker for -y periodicity: south ghost layers [gcy-hy..gcy-1]
+// are copied from the north interior (offset +stride) and vice versa;
+// structure mirrors apply_periodic_x_omp
+template< typename T >
+void nse::apply_periodic_y_omp(T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz)
+{
+	const int nyz = ny * nz;
+	const int stride = (ny - 2 * gcy) * nz;	// offset of the periodic image in -y
+	const int shy = hy * nz;
+	const int ib = gcx - hx, ie = nx - gcx + hx - 1;
+	const int kb = gcz - hz, ke = nz - gcz + hz - 1;
+	const int block_size = (ke - kb + 1) * sizeof(T);
+
+	int i, j, k, idx;
+
+	if (block_size < MIN_MEMCPY_BLOCK) {     // short segments: explicit loop avoids memcpy() call overhead
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+		for (i = ib; i <= ie; i++) {	// south periodicity //
+			for (j = gcy - hy; j < gcy; j++) {
+				idx = i * nyz + j * nz;
+				for (k = kb; k <= ke; k++)
+					x[idx + k] = x[idx + stride + k];
+			}
+		}
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+		for (i = ib; i <= ie; i++) {	// north periodicity //
+			for (j = gcy - hy; j < gcy; j++) {
+				idx = i * nyz + j * nz + stride + shy;
+				for (k = kb; k <= ke; k++)
+					x[idx + k] = x[idx - stride + k];
+			}
+		}
+	}
+	else
+	{
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+		for (i = ib; i <= ie; i++) {	// south periodicity //
+			for (j = gcy - hy; j < gcy; j++) {
+				idx = i * nyz + j * nz + kb;
+				memcpy(&x[idx], &x[idx + stride], block_size);
+			}
+		}
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+		for (i = ib; i <= ie; i++) {	// north periodicity //
+			for (j = gcy - hy; j < gcy; j++) {
+				idx = i * nyz + j * nz + kb + shy;
+				memcpy(&x[idx + stride], &x[idx], block_size);
+			}
+		}
+	}
+}
+// OpenMP worker for -z periodicity: bottom ghost cells [gcz-hz..gcz-1]
+// are copied from the top interior (offset +stride within the same
+// column) and vice versa; segments in -z are short and non-contiguous
+// across columns, so no memcpy branch is used here
+template< typename T >
+void nse::apply_periodic_z_omp(T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz)
+{
+	const int nyz = ny * nz;
+	const int stride = (nz - 2 * gcz);	// offset of the periodic image in -z
+	const int ib = gcx - hx, ie = nx - gcx + hx - 1;
+	const int jb = gcy - hy, je = ny - gcy + hy - 1;
+
+	int i, j, k, idx;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = ib; i <= ie; i++) {	// bottom periodicity //
+		for (j = jb; j <= je; j++) {
+			idx = i * nyz + j * nz;
+			for (k = gcz - hz; k < gcz; k++)
+				x[idx + k] = x[idx + stride + k];
+		}
+	}
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = ib; i <= ie; i++) {	// top periodicity //
+		for (j = jb; j <= je; j++) {
+			idx = i * nyz + j * nz;
+			for (k = nz - gcz; k < nz - gcz + hz; k++)
+				x[idx + k] = x[idx - stride + k];
+		}
+	}
+}
+
+// * apply -x, -y, -z periodicity - colored * //
+// ------------------------------------------ //
+// colored variant of apply_periodic_x: updates only cells of one
+// checkerboard color; same GPU/OpenMP dispatch pattern as above
+template< nse::memType mem, typename T >
+inline void nse::apply_periodic_x(T* _RESTRICT x, const int color,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		nse_gpu::apply_periodic_x(x, color,
+			nx, ny, nz, gcx, gcy, gcz, hx, hy, hz);
+	}
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {
+			apply_periodic_x_omp(x, color, nx, ny, nz,
+				gcx, gcy, gcz, hx, hy, hz);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				apply_periodic_x_omp(x, color, nx, ny, nz,
+					gcx, gcy, gcz, hx, hy, hz);
+			}
+		}
+	}
+}
+
+// colored variant of apply_periodic_y: updates only cells of one
+// checkerboard color; same GPU/OpenMP dispatch pattern as above
+template< nse::memType mem, typename T >
+inline void nse::apply_periodic_y(T* _RESTRICT x, const int color,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		nse_gpu::apply_periodic_y(x, color,
+			nx, ny, nz, gcx, gcy, gcz, hx, hy, hz);
+	}
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {
+			apply_periodic_y_omp(x, color, nx, ny, nz,
+				gcx, gcy, gcz, hx, hy, hz);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				apply_periodic_y_omp(x, color, nx, ny, nz,
+					gcx, gcy, gcz, hx, hy, hz);
+			}
+		}
+	}
+}
+
+// colored variant of apply_periodic_z: updates only cells of one
+// checkerboard color; same GPU/OpenMP dispatch pattern as above
+template< nse::memType mem, typename T >
+inline void nse::apply_periodic_z(T* _RESTRICT x, const int color,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		nse_gpu::apply_periodic_z(x, color,
+			nx, ny, nz, gcx, gcy, gcz, hx, hy, hz);
+	}
+	else
+#endif
+	{	// memCPU //
+		if (omp_in_parallel()) {
+			apply_periodic_z_omp(x, color, nx, ny, nz,
+				gcx, gcy, gcz, hx, hy, hz);
+		}
+		else
+		{
+#pragma omp parallel shared( x ) 
+			{
+				apply_periodic_z_omp(x, color, nx, ny, nz,
+					gcx, gcy, gcz, hx, hy, hz);
+			}
+		}
+	}
+}
+
+// colored OpenMP worker for -x periodicity: copies only cells whose
+// checkerboard parity (i + j + k + color) is even, stepping k by 2;
+// on the east side the parity is taken at the ghost cell's true
+// -x index (i + ish)
+template< typename T >
+void nse::apply_periodic_x_omp(T* _RESTRICT x, const int color,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz)
+{
+	const int nyz = ny * nz;
+	const int stride = (nx - 2 * gcx) * nyz;
+	const int ish = hx + nx - 2 * gcx;	// -x index shift of the east ghost layers
+	const int shx = hx * nyz;
+	const int jb = gcy - hy, je = ny - gcy + hy - 1;
+	const int kb = gcz - hz, ke = nz - gcz + hz - 1;
+
+	int i, j, k, idx, csh;
+
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = gcx - hx; i < gcx; i++) {			// west periodicity //
+		for (j = jb; j <= je; j++) {
+			csh = (i + j + kb + color) & 1;	// first k of the requested color in this column
+			idx = i * nyz + j * nz;
+			for (k = kb + csh; k <= ke; k += 2)
+				x[idx + k] = x[idx + stride + k];
+		}
+	}
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = gcx - hx; i < gcx; i++) {			// east periodicity //
+		for (j = jb; j <= je; j++) {
+			csh = (i + ish + j + kb + color) & 1;	// parity at the shifted east -x index
+			idx = i * nyz + j * nz + stride + shx;
+			for (k = kb + csh; k <= ke; k += 2)
+				x[idx + k] = x[idx - stride + k];
+		}
+	}
+}
+// colored OpenMP worker for -y periodicity: copies only cells of one
+// checkerboard color (k stepped by 2); on the north side the parity is
+// taken at the ghost cell's true -y index (j + jsh)
+template< typename T >
+void nse::apply_periodic_y_omp(T* _RESTRICT x, const int color,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz)
+{
+	const int nyz = ny * nz;
+	const int stride = (ny - 2 * gcy) * nz;
+	const int jsh = hy + ny - 2 * gcy;	// -y index shift of the north ghost layers
+	const int shy = hy * nz;
+	const int ib = gcx - hx, ie = nx - gcx + hx - 1;
+	const int kb = gcz - hz, ke = nz - gcz + hz - 1;
+
+	int i, j, k, idx, csh;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = ib; i <= ie; i++) {	// south periodicity //
+		for (j = gcy - hy; j < gcy; j++) {
+			csh = (i + j + kb + color) & 1;	// first k of the requested color in this column
+			idx = i * nyz + j * nz;
+			for (k = kb + csh; k <= ke; k += 2)
+				x[idx + k] = x[idx + stride + k];
+		}
+	}
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = ib; i <= ie; i++) {	// north periodicity //
+		for (j = gcy - hy; j < gcy; j++) {
+			csh = (i + j + jsh + kb + color) & 1;	// parity at the shifted north -y index
+			idx = i * nyz + j * nz + stride + shy;
+			for (k = kb + csh; k <= ke; k += 2)
+				x[idx + k] = x[idx - stride + k];
+		}
+	}
+}
+// colored OpenMP worker for -z periodicity: copies only cells of one
+// checkerboard color; the parity shift csh is computed from the first
+// k of each ghost segment, so the loop visits matching-color cells
+// in steps of 2
+template< typename T >
+void nse::apply_periodic_z_omp(T* _RESTRICT x, const int color,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz)
+{
+	const int nyz = ny * nz;
+	const int stride = (nz - 2 * gcz);
+	const int ib = gcx - hx, ie = nx - gcx + hx - 1;
+	const int jb = gcy - hy, je = ny - gcy + hy - 1;
+
+	int i, j, k, idx, csh;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = ib; i <= ie; i++) {	// bottom periodicity //
+		for (j = jb; j <= je; j++) {
+			idx = i * nyz + j * nz;
+			csh = (i + j + gcz - hz + color) & 1;
+			for (k = gcz - hz + csh; k < gcz; k += 2)
+				x[idx + k] = x[idx + stride + k];
+		}
+	}
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = ib; i <= ie; i++) {	// top periodicity //
+		for (j = jb; j <= je; j++) {
+			idx = i * nyz + j * nz;
+			csh = (i + j + nz - gcz + color) & 1;
+			for (k = nz - gcz + csh; k < nz - gcz + hz; k += 2)
+				x[idx + k] = x[idx - stride + k];
+		}
+	}
+}
+
+// * implementation: get 3d sub array * //
+// ------------------------------------ //
+// packs the box [ib..ie] x [jb..je] x [kb..ke] of 'in' into the
+// contiguous array 'out'; mixed CPU/GPU transfers stage through a
+// buffer on the input side, then mcopy across devices
+template< nse::memType memIN, nse::memType memOUT, typename T >
+inline void nse::get_sub_array(const T* _RESTRICT const in,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+	T* _RESTRICT out)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (((memIN == memCPU) && (memOUT == memGPU)) ||
+		((memIN == memGPU) && (memOUT == memCPU)))
+	{
+		const int c_size = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1);
+
+		T* buf;
+		int buf_id;
+		if (memIN == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
+		if (memIN == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
+
+		// pack on the input side, then copy the packed block across devices
+		get_sub_array<memIN, memIN>(in, nx, ny, nz,
+			ib, ie, jb, je, kb, ke, buf);
+		mcopy<memOUT, memIN>(out, buf, c_size);
+
+		if (memIN == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
+		if (memIN == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
+	}
+	else
+		if ((memIN == memGPU) && (memOUT == memGPU)) {
+			nse_gpu::get_sub_array(in, nx, ny, nz,
+				ib, ie, jb, je, kb, ke, out);
+		}
+		else
+#endif
+		{	// memCPU -> memCPU //
+			if (omp_in_parallel()) {
+				get_sub_array_omp(in, nx, ny, nz,
+					ib, ie, jb, je, kb, ke, out);
+			}
+			else
+			{
+#pragma omp parallel shared( out ) 
+				{
+					get_sub_array_omp(in, nx, ny, nz,
+						ib, ie, jb, je, kb, ke, out);
+				}
+			}
+		}
+}
+
+// OpenMP worker: packs the box [ib..ie] x [jb..je] x [kb..ke] of the
+// (nx, ny, nz) row-major array 'in' into contiguous 'out' of shape
+// (ie-ib+1, cy, cz); contiguous -z segments are copied element-wise or
+// via memcpy depending on segment size
+template< typename T >
+void nse::get_sub_array_omp(const T* _RESTRICT const in,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+	T* _RESTRICT out)
+{
+	const int nyz = ny * nz;
+	const int cy = je - jb + 1, cz = ke - kb + 1;
+	const int cyz = cy * cz;
+	const int block_size = cz * sizeof(T);
+
+	int i, j, k, idx, odx;
+
+	if (block_size < MIN_MEMCPY_BLOCK) {     // short segments: explicit loop avoids memcpy() call overhead
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+		for (i = ib; i <= ie; i++) {
+			for (j = jb; j <= je; j++) {
+				idx = i * nyz + j * nz;
+				odx = (i - ib) * cyz + (j - jb) * cz - kb;	// -kb so out index starts at 0 for k == kb
+				for (k = kb; k <= ke; k++)
+					out[odx + k] = in[idx + k];
+			}
+		}
+	}
+	else
+	{
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+		for (i = ib; i <= ie; i++) {
+			for (j = jb; j <= je; j++) {
+				idx = i * nyz + j * nz + kb;
+				odx = (i - ib) * cyz + (j - jb) * cz;
+				memcpy(&out[odx], &in[idx], block_size);
+			}
+		}
+	}
+}
+
+// * implementation: put 3d sub array * //
+// ------------------------------------ //
+// unpacks contiguous 'in' into the box [ib..ie] x [jb..je] x [kb..ke]
+// of 'out'; mixed CPU/GPU transfers stage through a buffer on the
+// output side (mcopy across devices first, then unpack locally)
+template< nse::memType memOUT, nse::memType memIN, typename T >
+inline void nse::put_sub_array(T* _RESTRICT out,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+	const T* _RESTRICT const in)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (((memIN == memCPU) && (memOUT == memGPU)) ||
+		((memIN == memGPU) && (memOUT == memCPU)))
+	{
+		const int c_size = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1);
+
+		T* buf;
+		int buf_id;
+		if (memIN == memCPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
+		if (memIN == memGPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
+
+		mcopy<memOUT, memIN>(buf, in, c_size);
+		put_sub_array<memOUT, memOUT>(out, nx, ny, nz,
+			ib, ie, jb, je, kb, ke, buf);
+
+		if (memIN == memCPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
+		if (memIN == memGPU) nse_gpu::cudaStx::free_host_buf(buf_id);
+	}
+	else
+		if ((memIN == memGPU) && (memOUT == memGPU)) {
+			nse_gpu::put_sub_array(out, nx, ny, nz,
+				ib, ie, jb, je, kb, ke, in);
+		}
+		else
+#endif
+		{	// memCPU <- memCPU 
+			if (omp_in_parallel()) {
+				put_sub_array_omp(out, nx, ny, nz,
+					ib, ie, jb, je, kb, ke, in);
+			}
+			else
+			{
+#pragma omp parallel shared( out ) 
+				{
+					put_sub_array_omp(out, nx, ny, nz,
+						ib, ie, jb, je, kb, ke, in);
+				}
+			}
+		}
+}
+
+
+// OpenMP worker: inverse of get_sub_array_omp — unpacks contiguous 'in'
+// of shape (ie-ib+1, cy, cz) into the box [ib..ie] x [jb..je] x [kb..ke]
+// of the (nx, ny, nz) row-major array 'out'
+template< typename T >
+void nse::put_sub_array_omp(T* _RESTRICT out,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+	const T* _RESTRICT const in)
+{
+	const int nyz = ny * nz;
+	const int cy = je - jb + 1, cz = ke - kb + 1;
+	const int cyz = cy * cz;
+	const int block_size = cz * sizeof(T);
+
+	int i, j, k, idx, odx;
+
+	if (block_size < MIN_MEMCPY_BLOCK) {     // short segments: explicit loop avoids memcpy() call overhead
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+		for (i = ib; i <= ie; i++) {
+			for (j = jb; j <= je; j++) {
+				odx = i * nyz + j * nz;
+				idx = (i - ib) * cyz + (j - jb) * cz - kb;	// -kb so in index starts at 0 for k == kb
+				for (k = kb; k <= ke; k++)
+					out[odx + k] = in[idx + k];
+			}
+		}
+	}
+	else
+	{
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+#else
+#pragma omp for nowait
+#endif
+		for (i = ib; i <= ie; i++) {
+			for (j = jb; j <= je; j++) {
+				odx = i * nyz + j * nz + kb;
+				idx = (i - ib) * cyz + (j - jb) * cz;
+				memcpy(&out[odx], &in[idx], block_size);
+			}
+		}
+	}
+}
+
+// * implementation: get 3d sub array - colored * //
+// ---------------------------------------------- //
+// packs only the cells of one checkerboard color from the box
+// [ib..ie] x [jb..je] x [kb..ke] into contiguous 'out' (size from
+// get_num_colored); mixed CPU/GPU transfers stage through a buffer
+// on the input side
+template< nse::memType memIN, nse::memType memOUT, typename T >
+void nse::get_sub_array(const T* _RESTRICT const in, const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+	T* _RESTRICT out)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (((memIN == memCPU) && (memOUT == memGPU)) ||
+		((memIN == memGPU) && (memOUT == memCPU)))
+	{
+		const int c_size = get_num_colored(color, ib, ie, jb, je, kb, ke);
+
+		T* buf;
+		int buf_id;
+		if (memIN == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
+		if (memIN == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
+
+		get_sub_array<memIN, memIN>(in, color, nx, ny, nz,
+			ib, ie, jb, je, kb, ke, buf);
+		mcopy<memOUT, memIN>(out, buf, c_size);
+
+		if (memIN == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
+		if (memIN == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
+	}
+	else
+		if ((memIN == memGPU) && (memOUT == memGPU)) {
+			nse_gpu::get_sub_array(in, color, nx, ny, nz,
+				ib, ie, jb, je, kb, ke, out);
+		}
+		else
+#endif
+		{	// memCPU -> memCPU //
+#ifdef USE_DEPRECATED_COLOR_CP
+			// legacy serial path: packs colored cells in (i, j, k) order
+			const int nyz = ny * nz;
+
+			int i, j, k;
+			int idx, odx = 0;
+			int sh;
+
+			for (i = ib; i <= ie; i++)
+			{
+				idx = i * nyz + jb * nz;
+				sh = (i + jb + kb + color) & 1;
+
+				for (j = jb; j <= je; j++, idx += nz) {
+					for (k = kb + sh; k <= ke; k += 2, odx++) {
+						out[odx] = in[idx + k];
+					}
+					sh = !sh;	// change shift for next column //
+				}
+			}
+#else
+
+			if (omp_in_parallel()) {
+				get_sub_array_omp(in, color, nx, ny, nz,
+					ib, ie, jb, je, kb, ke, out);
+			}
+			else
+			{
+#pragma omp parallel shared( out ) 
+				{
+					get_sub_array_omp(in, color, nx, ny, nz,
+						ib, ie, jb, je, kb, ke, out);
+				}
+			}
+#endif
+		}
+}
+
+
// worker for colored get_sub_array(), expected to run inside an OpenMP
// parallel region; odx is computed independently per (i[,j]) so iterations
// can be divided between threads without a shared output counter
template< typename T >
void nse::get_sub_array_omp(const T* _RESTRICT const in, const int color,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,
	T* _RESTRICT out)
{
	const int nyz = ny * nz;
	int i, j, k, idx, odx, sh;

	// colored plane and column sizes //
	// note: each pair of consecutive planes (columns) of one color packs
	// exactly cyz (cz) output elements; cyzh (czh) is the colored count
	// of the first plane (column) in the pair
	const int cz = (ke - kb + 1);
	const int cyz = (je - jb + 1) * cz;
	const int cyzh = get_num_colored(color,	// half-colored-plane size
		ib, ib, jb, je, kb, ke);
#ifdef USE_OPENMP_2D_CYCLE
	const int czh = get_num_colored(color,	// half-colored-column size
		ib, ib, jb, jb, kb, ke);
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
	for (i = ib; i <= ie; i++)
	{
		for (j = jb; j <= je; j++)
		{
			sh = (i + j + kb + color) & 1;
			idx = i * nyz + j * nz;
			odx = ((i - ib) >> 1) * cyz + ((i - ib) & 1) * cyzh +
				((j - jb) >> 1) * cz + ((j - jb) & 1) * czh;
#else
#pragma omp for nowait
	for (i = ib; i <= ie; i++)
	{
		sh = (i + jb + kb + color) & 1;
		idx = i * nyz + jb * nz;
		odx = ((i - ib) >> 1) * cyz + ((i - ib) & 1) * cyzh;
		for (j = jb; j <= je; j++, idx += nz)
		{
#endif
			for (k = kb + sh; k <= ke; k += 2, odx++) {
				out[odx] = in[idx + k];
			}

#ifndef USE_OPENMP_2D_CYCLE
			sh = !sh;	// change shift for next column //
#endif
		}
	}
}
+
// * implementation: put 3d sub array - colored * //
// ---------------------------------------------- //
// scatter the compact colored array -in- back into the cells of the given
// checkerboard color of the [ib..ie]x[jb..je]x[kb..ke] box of -out-
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::put_sub_array(T* _RESTRICT out, const int color,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,
	const T* _RESTRICT const in)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (((memIN == memCPU) && (memOUT == memGPU)) ||
		((memIN == memGPU) && (memOUT == memCPU)))
	{
		// mixed host/device case: move the compact data to the output side
		// first, then scatter locally on that side
		const int c_size = get_num_colored(color, ib, ie, jb, je, kb, ke);

		T* buf;
		int buf_id;
		if (memIN == memCPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);
		if (memIN == memGPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);

		mcopy<memOUT, memIN>(buf, in, c_size);
		put_sub_array<memOUT, memOUT>(out, color, nx, ny, nz,
			ib, ie, jb, je, kb, ke, buf);

		if (memIN == memCPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
		if (memIN == memGPU) nse_gpu::cudaStx::free_host_buf(buf_id);
	}
	else
		if ((memIN == memGPU) && (memOUT == memGPU)) {
			nse_gpu::put_sub_array(out, color, nx, ny, nz,
				ib, ie, jb, je, kb, ke, in);
		}
		else
#endif
		{	// memCPU <- memCPU //
#ifdef USE_DEPRECATED_COLOR_CP
			// reference serial implementation //
			const int nyz = ny * nz;

			int i, j, k;
			int odx, idx = 0;
			int sh;

			for (i = ib; i <= ie; i++)
			{
				odx = i * nyz + jb * nz;
				sh = (i + jb + kb + color) & 1;	// offset of first colored cell in column //

				for (j = jb; j <= je; j++, odx += nz) {
					for (k = kb + sh; k <= ke; k += 2, idx++) {
						out[odx + k] = in[idx];
					}
					sh = !sh;	// change shift for next column //
				}
			}
#else

			// OpenMP implementation: join an enclosing parallel region if present //
			if (omp_in_parallel()) {
				put_sub_array_omp(out, color, nx, ny, nz,
					ib, ie, jb, je, kb, ke, in);
			}
			else
			{
#pragma omp parallel shared( out ) 
				{
					put_sub_array_omp(out, color, nx, ny, nz,
						ib, ie, jb, je, kb, ke, in);
				}
			}
#endif
		}
}
+
+
// worker for colored put_sub_array(), expected to run inside an OpenMP
// parallel region; idx is computed independently per (i[,j]) so iterations
// can be divided between threads without a shared input counter
// (mirror of get_sub_array_omp with in/out roles swapped)
template< typename T >
void nse::put_sub_array_omp(T* _RESTRICT out, const int color,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,
	const T* _RESTRICT const in)
{
	const int nyz = ny * nz;
	int i, j, k, idx, odx, sh;

	// colored plane and column sizes //
	const int cz = (ke - kb + 1);
	const int cyz = (je - jb + 1) * cz;
	const int cyzh = get_num_colored(color,	// half-colored-plane size
		ib, ib, jb, je, kb, ke);
#ifdef USE_OPENMP_2D_CYCLE
	const int czh = get_num_colored(color,	// half-colored-column size
		ib, ib, jb, jb, kb, ke);
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
	for (i = ib; i <= ie; i++)
	{
		for (j = jb; j <= je; j++)
		{
			sh = (i + j + kb + color) & 1;
			odx = i * nyz + j * nz;
			idx = ((i - ib) >> 1) * cyz + ((i - ib) & 1) * cyzh +
				((j - jb) >> 1) * cz + ((j - jb) & 1) * czh;
#else
#pragma omp for nowait
	for (i = ib; i <= ie; i++)
	{
		sh = (i + jb + kb + color) & 1;
		odx = i * nyz + jb * nz;
		idx = ((i - ib) >> 1) * cyz + ((i - ib) & 1) * cyzh;
		for (j = jb; j <= je; j++, odx += nz)
		{
#endif
			for (k = kb + sh; k <= ke; k += 2, idx++) {
				out[odx + k] = in[idx];
			}

#ifndef USE_OPENMP_2D_CYCLE
			sh = !sh;	// change shift for next column //
#endif
		}
	}
}
+
// * implementation: copy 3d sub array * //
// ------------------------------------ //
// copy the [ib..ie]x[jb..je]x[kb..ke] box of -in- (dims subnx,subny,subnz)
// into -out- (dims nx,ny,nz) at position (posx,posy,posz)
template< nse::memType memOUT, nse::memType memIN, typename T >
inline void nse::copy_sub_array(T* _RESTRICT out,
	const int nx, const int ny, const int nz,
	const int posx, const int posy, const int posz,
	const T* _RESTRICT const in,
	const int subnx, const int subny, const int subnz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke)
{
#ifndef EXCLUDE_GPU_BRANCH
	if (((memIN == memCPU) && (memOUT == memGPU)) ||
		((memIN == memGPU) && (memOUT == memCPU)))
	{
		// mixed host/device case: gather the box into a dense buffer on the
		// output side (get_sub_array handles the cross-memory copy), then
		// place it locally
		const int c_size = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1);

		T* buf;
		int buf_id;
		if (memOUT == memCPU) nse_gpu::cudaStx::get_host_buf(&buf, &buf_id, c_size);
		if (memOUT == memGPU) nse_gpu::cudaStx::get_dev_buf(&buf, &buf_id, c_size);

		get_sub_array<memIN, memOUT>(in, subnx, subny, subnz,
			ib, ie, jb, je, kb, ke, buf);
		put_sub_array<memOUT, memOUT>(out, nx, ny, nz,
			posx, posx + (ie - ib),
			posy, posy + (je - jb),
			posz, posz + (ke - kb), buf);

		if (memOUT == memCPU) nse_gpu::cudaStx::free_host_buf(buf_id);
		if (memOUT == memGPU) nse_gpu::cudaStx::free_dev_buf(buf_id);
	}
	else
		if ((memIN == memGPU) && (memOUT == memGPU)) {
			nse_gpu::copy_sub_array(out, nx, ny, nz, posx, posy, posz,
				in, subnx, subny, subnz, ib, ie, jb, je, kb, ke);
		}
		else
#endif
		{	// memCPU <- memCPU //
			// join an enclosing OpenMP parallel region if present //
			if (omp_in_parallel()) {
				copy_sub_array_omp(out, nx, ny, nz, posx, posy, posz,
					in, subnx, subny, subnz, ib, ie, jb, je, kb, ke);
			}
			else
			{
#pragma omp parallel shared( out ) 
				{
					copy_sub_array_omp(out, nx, ny, nz, posx, posy, posz,
						in, subnx, subny, subnz, ib, ie, jb, je, kb, ke);
				}
			}
		}
}
+
+template< typename T >
+void nse::copy_sub_array_omp(T* _RESTRICT out,
+	const int nx, const int ny, const int nz,
+	const int posx, const int posy, const int posz,
+	const T* _RESTRICT const in,
+	const int subnx, const int subny, const int subnz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke)
+{
+	const int nyz = ny * nz;
+	const int subnyz = subny * subnz;
+	const int block_size = (ke - kb + 1) * sizeof(T);
+	const int shx = posx - ib, shy = posy - jb;
+
+	int i, j, idx, odx;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+#else
+#pragma omp for nowait
+#endif
+	for (i = ib; i <= ie; i++) {
+		for (j = jb; j <= je; j++)
+		{
+			odx = (i + shx) * nyz + (j + shy) * nz + posz;
+			idx = i * subnyz + j * subnz + kb;
+
+			memcpy(&out[odx], &in[idx], block_size);
+		}
+	}
+}
+
+// * implementation: get number of colored elements //
+// ------------------------------------------------ //
+inline int nse::get_num_colored(const int color,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke)
+{
+	const int length = ie - ib + 1;
+	const int width = je - jb + 1;
+	const int height = ke - kb + 1;
+
+	if ((length & 1) && (width & 1) && (height & 1)) {
+
+		const int sh = !((ib + jb + kb + color) & 1);
+		return ((length - 1) >> 1) * width * height +
+			((width - 1) >> 1) * height +
+			((height + sh) >> 1);
+
+	}
+	else
+		return ((length * width * height) >> 1);
+}
diff --git a/grid-id.h b/grid-id.h
new file mode 100644
index 0000000000000000000000000000000000000000..c76974136ce59f21798849834ca08315a4487a9b
--- /dev/null
+++ b/grid-id.h
@@ -0,0 +1,288 @@
+#pragma once
+
+// [grid-id.h]: grid data identifier
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "mpi-com.h"
+
+#include <string.h>
+#ifdef USE_CXX_11
+#include <initializer_list>
+#endif
+
+//#define _USE_DEPRECATED_WST_FORMAT
+
+namespace nse
+{
	// cell classification for masked (immersed-boundary style) grids:
	// solid, fluid or outside of the computational domain
	enum maskType {
		solidCell = 0, fluidCell = 1, externalCell = 2
	};
+
	// grid data identifier written as a fixed-size binary stamp at the head
	// of grid data files; holds a [header, domain, grid] triple of
	// fixed-length arrays so the on-disk layout is independent of content
	template< typename T >
	class GridId {
	public:

		GridId();
		GridId(const GridId& id);
		~GridId();

		void init();	// initialize id 

						// set:
		void set_grid_type(const int grid_type);
		void set_dim_num(const int ndim);
		// dim is 1-based, limited by the value set via set_dim_num() //
		void set_domain_dim(const int dim, const T x, const T length);
		void set_grid_dim(const int dim, const int nx, const int gcx);

		void reset_data_type_size();	// to: [sizeof(T)]

										// set specs:
		// extra parameters stored after the per-dimension slots //
#ifdef USE_CXX_11
		void set_domain_specs(const std::initializer_list<T> specs);
		void set_grid_specs(const std::initializer_list<int> specs);
#else
		void set_domain_specs(int nspecs, const T* specs);
		void set_grid_specs(int nspecs, const int* specs);
#endif

		// header:
		int key() const;
		int grid_type() const;
		int dim_num() const;
		int data_type_size() const;

		// get grid, domain by dim:
		void domain_dim(const int dim, T* x, T* length) const;
		void grid_dim(const int dim, int* nx, int* gcx) const;

		// get specs:
		T domain_spec(int idx) const;
		int grid_spec(int idx) const;

		// check id based on header values and input num dims
		bool check(const int ndim) const;

		// broadcast GridId object on communicator
		void mpi_broadcast(const int host, const MPI_Comm comm);
		// ------------------------------------------------------------------------------------ //


		// static constants
		// ------------------------------------------------------------------------------------ //
		static const int max_dim = 3;	// using <3> to comply with WST-format

		static const int hsize = 4;
		static const int dsize = 24;
		static const int gsize = 24;

		static const int id_byte_size =
			hsize * sizeof(int) + dsize * sizeof(T) + gsize * sizeof(int);

		// static constants (read 3D data only), switch: comply with WST-format
		// ------------------------------------------------------------------------------------ //
#ifndef _USE_DEPRECATED_WST_FORMAT
		static const int hsize_r3d = hsize;
		static const int dsize_r3d = dsize;
		static const int gsize_r3d = gsize;
#else
		static const int hsize_r3d = 4;
		static const int dsize_r3d = 7;
		static const int gsize_r3d = 6;
#endif

		static const int id_byte_size_r3d =
			hsize_r3d * sizeof(int) + dsize_r3d * sizeof(T) + gsize_r3d * sizeof(int);


	public:
		// data triple [header,domain,grid]
		// header: [0] file key, [1] grid type, [2] dim number, [3] data type size
		// ------------------------------------------------------------------------------------ //

		int header[hsize];
		T domain[dsize];
		int grid[gsize];
	};
+}
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+nse::GridId<T>::GridId()
+{
+	memset(header, 0, GridId<T>::hsize * sizeof(int));
+	memset(domain, 0, GridId<T>::dsize * sizeof(T));
+	memset(grid, 0, GridId<T>::gsize * sizeof(int));
+}
+
+template< typename T >
+nse::GridId<T>::GridId(const GridId& id)
+{
+	memcpy(header, id.header, GridId<T>::hsize * sizeof(int));
+	memcpy(domain, id.domain, GridId<T>::dsize * sizeof(T));
+	memcpy(grid, id.grid, GridId<T>::gsize * sizeof(int));
+}
+
// trivial destructor: id owns fixed-size member arrays only //
template< typename T >
nse::GridId<T>::~GridId() { }
+
+template< typename T >
+void nse::GridId<T>::init()
+{
+	memset(header, 0, GridId<T>::hsize * sizeof(int));
+	memset(domain, 0, GridId<T>::dsize * sizeof(T));
+	memset(grid, 0, GridId<T>::gsize * sizeof(int));
+
+	header[0] = 'n' + 's' + 'e';	// file identifier //
+	header[3] = sizeof(T);			// data type size //
+}
+
+template< typename T >
+void nse::GridId<T>::set_grid_type(const int grid_type)
+{
+	if (grid_type >= 0)
+		header[1] = grid_type;
+}
+
+template< typename T >
+void nse::GridId<T>::set_dim_num(const int ndim)
+{
+	if ((ndim <= 0) || (ndim > max_dim)) return;
+
+	header[2] = ndim;
+}
+
+template< typename T >
+void nse::GridId<T>::set_domain_dim(const int dim, const T x, const T length)
+{
+	if ((dim <= 0) || (dim > dim_num())) return;
+
+	domain[dim - 1] = x;
+	domain[max_dim + dim - 1] = length;
+}
+
+template< typename T >
+void nse::GridId<T>::set_grid_dim(const int dim, const int nx, const int gcx)
+{
+	if ((dim <= 0) || (dim > dim_num())) return;
+
+	grid[dim - 1] = nx;
+	grid[max_dim + dim - 1] = gcx;
+}
+
// re-stamp the stored data type size to sizeof(T) //
template< typename T >
void nse::GridId<T>::reset_data_type_size()
{
	header[3] = sizeof(T);
}
+
+#ifdef USE_CXX_11
+template< typename T >
+void nse::GridId<T>::set_domain_specs(const std::initializer_list<T> specs)
+{
+	int ptr = 2 * max_dim;
+	for (auto sp : specs) {
+		domain[ptr] = sp;
+		ptr++;
+
+		if (ptr >= GridId<T>::dsize) break;
+	}
+}
+
+template< typename T >
+void nse::GridId<T>::set_grid_specs(const std::initializer_list<int> specs)
+{
+	int ptr = 2 * max_dim;
+	for (auto sp : specs) {
+		grid[ptr] = sp;
+		ptr++;
+
+		if (ptr >= GridId<T>::gsize) break;
+	}
+}
+#else
+template< typename T >
+void nse::GridId<T>::set_domain_specs(int nspecs, const T* specs)
+{
+	const int ptr = 2 * max_dim;
+	for (int k = 0; k < nspecs; k++) {
+		if (ptr + k >= GridId<T>::dsize) break;
+
+		domain[ptr + k] = specs[k];
+	}
+}
+
+template< typename T >
+void nse::GridId<T>::set_grid_specs(int nspecs, const int* specs)
+{
+	const int ptr = 2 * max_dim;
+	for (int k = 0; k < nspecs; k++) {
+		if (ptr + k >= GridId<T>::gsize) break;
+
+		grid[ptr + k] = specs[k];
+	}
+}
+#endif
+
// header accessors: [0] file key, [1] grid type, [2] dim number,
// [3] data type size (in bytes, as stamped by init()) //
template< typename T >
inline int nse::GridId<T>::key() const { return header[0]; }

template< typename T >
inline int nse::GridId<T>::grid_type() const { return header[1]; }

template< typename T >
inline int nse::GridId<T>::dim_num() const { return header[2]; }

template< typename T >
inline int nse::GridId<T>::data_type_size() const { return header[3]; }
+
+template< typename T >
+void nse::GridId<T>::domain_dim(const int dim, T* x, T* length) const
+{
+	if ((dim <= 0) || (dim > dim_num())) return;
+
+	(*x) = domain[dim - 1];
+	(*length) = domain[max_dim + dim - 1];
+}
+
+template< typename T >
+void nse::GridId<T>::grid_dim(const int dim, int* nx, int* gcx) const
+{
+	if ((dim <= 0) || (dim > dim_num())) return;
+
+	(*nx) = grid[dim - 1];
+	(*gcx) = grid[max_dim + dim - 1];
+}
+
// spec accessors: -idx- is relative to the spec area (after the
// 2 * max_dim per-dimension slots); no bounds check is performed //
template< typename T >
T nse::GridId<T>::domain_spec(int idx) const
{
	return domain[2 * max_dim + idx];
}

template< typename T >
int nse::GridId<T>::grid_spec(int idx) const
{
	return grid[2 * max_dim + idx];
}
+
+template< typename T >
+bool nse::GridId<T>::check(const int ndim) const
+{
+	return (
+		(key() == 'n' + 's' + 'e') &&			// file identifier
+		(dim_num() == ndim) &&					// dims - strict
+		((data_type_size() == sizeof(float)) ||
+		(data_type_size() == sizeof(double)))	// appropriate data type
+		);
+}
+
// broadcast the full [header, domain, grid] triple from -host- to all
// ranks of -comm-; collective call, order of broadcasts must match on
// every rank (it does: all ranks execute the same three calls)
template< typename T >
void nse::GridId<T>::mpi_broadcast(const int host, const MPI_Comm comm)
{
	nse::mpi_broadcast(header, GridId<T>::hsize, host, comm);
	nse::mpi_broadcast(domain, GridId<T>::dsize, host, comm);
	nse::mpi_broadcast(grid, GridId<T>::gsize, host, comm);
}
+// -------------------------------------------------------------------------------------------- //
diff --git a/grid3d.h b/grid3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..2965570eddb56181c0c9f2c61aea79a35cd7ca65
--- /dev/null
+++ b/grid3d.h
@@ -0,0 +1,1526 @@
+#pragma once
+
+// [grid3d.h]: 3D primary virtual grid
+//
+// -------------------------------------------------------------------------------------------- //
+// TO DO:
+// 
+
+#include "nse-sys.h"
+#include "mpi-com3d.h"
+#include "cart-sys3d.h"
+#include "vecmath.h"
+
+#include "grid-id.h"
+
+#ifndef EXCLUDE_GPU_BRANCH
+#include "grid.cuh"
+#endif
+
+// use hand-coded openmp array reduction 
+//   in -xy averaging without critical sections
+//   * - use on Intel Xeon Phi
+#define USE_OMP_PAR_REDUCE_IN_AVG_XY
+
+#define USE_GRID3D_BINARY_LOCATE
+
+
+namespace nse
+{
	namespace nse_const3d 
	{
		// staggered grid node types: cell center (C) and every combination
		// of staggering along -x (U), -y (V), -z (W)
		enum nodeType {
			nodeU = 0, nodeV = 1, nodeW = 2, nodeC = 3,
			nodeUV = 4, nodeUW = 5, nodeVW = 6, nodeUVW = 7
		};
	}
+
+	// --- grid node after averaging
+	nse_const3d::nodeType avg_node(
+		const nse_const3d::nodeType node,
+		const nse_const3d::axisType axis);
+	// ------------------------------------------------------------------------------------------------ //
+
+
	// * 3D primary grid: Grid3d< T > [ T = float, double ] * //
+	// =======================================================================
	// abstract base for 3D grids; stores local (per MPI process) and global
	// (mpi_*) dimensions, ghost-cell counts and 1D coordinate arrays
	template< typename T, memType mem = memCPU >
	class Grid3d
	{
	public:
		Grid3d();
		Grid3d(const Grid3d& grid);
		virtual ~Grid3d();

		// grid parameters by axis
		int dim_size(const nse_const3d::axisType axis) const;
		int mpi_dim_size(const nse_const3d::axisType axis) const;

		int ghost_region_size(	// single layer //
			const nse_const3d::axisType axis) const;	


		// MPI global cell index [== -1 - on failure]
		virtual int mpi_locate_x(const T x) const = 0;
		virtual int mpi_locate_y(const T y) const = 0;
		virtual int mpi_locate_z(const T z) const = 0;

		// local cell index [== -1 - on failure]
		//  - only one MPI process returns >=0 for (x,y,z) calls
		virtual int locate_x(const T x) const;
		virtual int locate_y(const T y) const;
		virtual int locate_z(const T z) const;

		// local local cell index on single MPI process [== -1 - on failure]
		//  - searching in segment [iexp - iwidth, iexp + iwdith]
		virtual int locate_local_x(const T x, const int iexp, const int iwidth) const;
		virtual int locate_local_y(const T y, const int jexp, const int jwidth) const;
		virtual int locate_local_z(const T z, const int kexp, const int kwidth) const;
		//	* search for a list of coordinates
		virtual void locate_local_x(const T* _RESTRICT const x,
			int* _RESTRICT iexp, const int iwidth, const int n) const;
		virtual void locate_local_y(const T* _RESTRICT const y,
			int* _RESTRICT jexp, const int jwidth, const int n) const;
		virtual void locate_local_z(const T* _RESTRICT const z,
			int* _RESTRICT kexp, const int kwidth, const int n) const;

		// (i,j,k) MPI-local coordinates [input - global index]
		//	- only one MPI process returns >= 0
		virtual int i_local_coord(const int i) const;
		virtual int j_local_coord(const int j) const;
		virtual int k_local_coord(const int k) const;

		// interpolation (local relative to (x,y,z) position in processor domain)
		virtual T c_interp(const T* X, const T x, const T y, const T z) const = 0;
		virtual T u_interp(const T* U, const T x, const T y, const T z) const = 0;
		virtual T v_interp(const T* V, const T x, const T y, const T z) const = 0;
		virtual T w_interp(const T* W, const T x, const T y, const T z) const = 0;

		// local interpolation on single MPI process 
		// [unsafe] - not checking if coordinates [(x,y,z),(i,j,k)] are correct
		virtual T u_interp_local(const T* _RESTRICT const U,
			const T x, const T y, const T z, const int i, const int j, const int k) const = 0;
		virtual T v_interp_local(const T* _RESTRICT const V,
			const T x, const T y, const T z, const int i, const int j, const int k) const = 0;
		virtual T w_interp_local(const T* _RESTRICT const W,
			const T x, const T y, const T z, const int i, const int j, const int k) const = 0;

		virtual void u_interp_local(T* _RESTRICT uinterp, const T* _RESTRICT const U,
			const T* _RESTRICT const x, const T* _RESTRICT const y, const T* _RESTRICT const z,
			const int* _RESTRICT const i, const int* _RESTRICT const j, const int* _RESTRICT const k, const int n) const = 0;
		virtual void v_interp_local(T* _RESTRICT vinterp, const T* _RESTRICT const V,
			const T* _RESTRICT const x, const T* _RESTRICT const y, const T* _RESTRICT const z,
			const int* _RESTRICT const i, const int* _RESTRICT const j, const int* _RESTRICT const k, const int n) const = 0;
		virtual void w_interp_local(T* _RESTRICT winterp, const T* _RESTRICT const W,
			const T* _RESTRICT const x, const T* _RESTRICT const y, const T* _RESTRICT const z,
			const int* _RESTRICT const i, const int* _RESTRICT const j, const int* _RESTRICT const k, const int n) const = 0;

		// interpolation (global to (x,y,z) point position)
		virtual T mpi_c_interp(const T* X, const T x, const T y, const T z) const;
		virtual T mpi_u_interp(const T* U, const T x, const T y, const T z) const;
		virtual T mpi_v_interp(const T* V, const T x, const T y, const T z) const;
		virtual T mpi_w_interp(const T* W, const T x, const T y, const T z) const;

		// slicing
		// -xy slice, [C,U,V,W] -> [C,U,V,C]
		void c_slice_at_z(T* Pxy, const T* X, const T z) const;
		void u_slice_at_z(T* Pxy, const T* U, const T z) const;
		void v_slice_at_z(T* Pxy, const T* V, const T z) const;
		void w_slice_at_z(T* Pxy, const T* W, const T z) const;
		// -xz slice, [C,U,V,W] -> [C,U,C,W]
		void c_slice_at_y(T* Pxz, const T* X, const T y) const;
		void u_slice_at_y(T* Pxz, const T* U, const T y) const;
		void v_slice_at_y(T* Pxz, const T* V, const T y) const;
		void w_slice_at_y(T* Pxz, const T* W, const T y) const;
		// -yz slice [C,U,V,W] -> [C,C,V,W]
		void c_slice_at_x(T* Pyz, const T* X, const T x) const;
		void u_slice_at_x(T* Pyz, const T* U, const T x) const;
		void v_slice_at_x(T* Pyz, const T* V, const T x) const;
		void w_slice_at_x(T* Pyz, const T* W, const T x) const;

		// MPI gather-scatter by axis //
		template< memType memOUT = memCPU, memType memIN = memCPU, typename Tin >
		void mpi_gather(Tin* _RESTRICT out, const Tin* _RESTRICT in,
			const int host, const nse_const3d::axisType axis) const;
		template< memType memOUT = memCPU, memType memIN = memCPU, typename Tin >
		void mpi_scatter(Tin* _RESTRICT out, const Tin* _RESTRICT in,
			const int host, const nse_const3d::axisType axis) const;

		// MPI gather coordinates //
		template< memType memOUT = memCPU >
		void mpi_gather_center_coord(T* _RESTRICT out,
			const int host, const nse_const3d::axisType axis) const;
		template< memType memOUT = memCPU >
		void mpi_gather_edge_coord(T* _RESTRICT out,
			const int host, const nse_const3d::axisType axis) const;

		template< memType memOUT = memCPU >
		void mpi_gather_center_coord(
			T* _RESTRICT xout, T* _RESTRICT yout, T* _RESTRICT zout, const int host) const;
		template< memType memOUT = memCPU >
		void mpi_gather_edge_coord(
			T* _RESTRICT xout, T* _RESTRICT yout, T* _RESTRICT zout, const int host) const;


		// grid re-interpolation out(current grid), in(input grid)
		virtual void grid_reinterp(T* Xout, const T* Xin,				// local in array //
			const nse_const3d::nodeType node, const GridId< T >& id) const = 0;

		// GridId on 3D grids //
		virtual void set_id(GridId< T >& id) const;
		virtual bool check_id(const GridId< T >& id) const;

		bool check_id_dims(const GridId< T >& id) const;


		virtual void set_id(GridId< T >& id, const nse_const3d::axisType axis) const;
		virtual bool check_id(const GridId< T >& id, const nse_const3d::axisType axis) const;

		bool check_id_dims(const GridId< T >& id, const nse_const3d::axisType axis) const;

	public:

		mpiCom3d mpi_com;

		// local (per-process) sizes, nyz = ny * nz //
		int size, nx, ny, nz, nyz;
		// global (whole MPI domain) sizes //
		int mpi_nx, mpi_ny, mpi_nz,
			mpi_nxy, mpi_nxz, mpi_nyz, mpi_size;

		int gcx, gcy, gcz;		// ghost layer widths //

		// local domain origin and extents //
		T x, y, z;
		T length, width, height;

		// global domain origin and extents //
		T mpi_x, mpi_y, mpi_z;
		T mpi_length, mpi_width, mpi_height;

		T *px, *py, *pz;		// cell-center coordinates //
		T *ex, *ey, *ez;		// cell-edge coordinates //
	};
+}
+
+// Implementation [misc]
+// -------------------------------------------------------------------------------------------- //
+inline
+nse::nse_const3d::nodeType nse::avg_node(const nse_const3d::nodeType node,
+	const nse_const3d::axisType axis)
+{
+	if (axis == nse_const3d::axisX) {
+		if ((node == nse_const3d::nodeU) || (node == nse_const3d::nodeUV) ||
+			(node == nse_const3d::nodeUW) || (node == nse_const3d::nodeUVW))
+			return nse_const3d::nodeU;
+
+		return nse_const3d::nodeC;
+	}
+	if (axis == nse_const3d::axisY) {
+		if ((node == nse_const3d::nodeV) || (node == nse_const3d::nodeUV) ||
+			(node == nse_const3d::nodeVW) || (node == nse_const3d::nodeUVW))
+			return nse_const3d::nodeV;
+
+		return nse_const3d::nodeC;
+	}
+	if (axis == nse_const3d::axisZ) {
+		if ((node == nse_const3d::nodeW) || (node == nse_const3d::nodeUW) ||
+			(node == nse_const3d::nodeVW) || (node == nse_const3d::nodeUVW))
+			return nse_const3d::nodeW;
+
+		return nse_const3d::nodeC;
+	}
+
+	if (axis == nse_const3d::axisXY) {
+		if ((node == nse_const3d::nodeV) || (node == nse_const3d::nodeVW))
+			return nse_const3d::nodeV;
+		if ((node == nse_const3d::nodeU) || (node == nse_const3d::nodeUW))
+			return nse_const3d::nodeU;
+		if ((node == nse_const3d::nodeUV) || (node == nse_const3d::nodeUVW))
+			return nse_const3d::nodeUV;
+
+		return nse_const3d::nodeC;
+	}
+	if (axis == nse_const3d::axisXZ) {
+		if ((node == nse_const3d::nodeW) || (node == nse_const3d::nodeVW))
+			return nse_const3d::nodeW;
+		if ((node == nse_const3d::nodeU) || (node == nse_const3d::nodeUV))
+			return nse_const3d::nodeU;
+		if ((node == nse_const3d::nodeUW) || (node == nse_const3d::nodeUVW))
+			return nse_const3d::nodeUW;
+
+		return nse_const3d::nodeC;
+	}
+	if (axis == nse_const3d::axisYZ) {
+		if ((node == nse_const3d::nodeW) || (node == nse_const3d::nodeUW))
+			return nse_const3d::nodeW;
+		if ((node == nse_const3d::nodeV) || (node == nse_const3d::nodeUV))
+			return nse_const3d::nodeV;
+		if ((node == nse_const3d::nodeVW) || (node == nse_const3d::nodeUVW))
+			return nse_const3d::nodeVW;
+
+		return nse_const3d::nodeC;
+	}
+
+	// axis == nse_const3d::axisXYZ
+	return node;
+}
+// -------------------------------------------------------------------------------------------- //
+
+// Implementation, Grid3d:
+// -------------------------------------------------------------------------------------------- //
+namespace nse
+{
+	template< typename T, memType mem >
+	Grid3d< T, mem > ::Grid3d(
+	) : mpi_com(),
+		size(0), nx(0), ny(0), nz(0), nyz(0),
+		mpi_size(0), mpi_nx(0), mpi_ny(0), mpi_nz(0),
+		mpi_nxy(0), mpi_nxz(0), mpi_nyz(0),
+		gcx(0), gcy(0), gcz(0),
+
+		x((T)0), y((T)0), z((T)0),
+		length((T)0), width((T)0), height((T)0),
+		mpi_x((T)0), mpi_y((T)0), mpi_z((T)0),
+		mpi_length((T)0), mpi_width((T)0), mpi_height((T)0)
+	{
+	}
+
+	template< typename T, memType mem >
+	Grid3d< T, mem > ::Grid3d(
+		const Grid3d< T, mem >& grid)
+		: mpi_com(grid.mpi_com),
+		size(grid.size), nx(grid.nx), ny(grid.ny), nz(grid.nz), nyz(grid.nyz),
+		mpi_size(grid.size), mpi_nx(grid.mpi_nx), mpi_ny(grid.mpi_ny), mpi_nz(grid.mpi_nz),
+		mpi_nxy(grid.mpi_nxy), mpi_nxz(grid.mpi_nxz), mpi_nyz(grid.mpi_nyz),
+		gcx(grid.gcx), gcy(grid.gcy), gcz(grid.gcz),
+
+		x(grid.x), y(grid.y), z(grid.z),
+		mpi_x(grid.mpi_x), mpi_y(grid.mpi_y), mpi_z(grid.mpi_z),
+
+		length(grid.length), width(grid.width), height(grid.mpi_height),
+		mpi_length(grid.mpi_length), mpi_width(grid.mpi_width), mpi_height(grid.mpi_height)
+	{
+		if (size > 0) {
+			allocate<mem>(&px, nx); mcopy<mem, mem>(px, grid.px, nx);
+			allocate<mem>(&py, ny); mcopy<mem, mem>(py, grid.py, ny);
+			allocate<mem>(&pz, nz); mcopy<mem, mem>(pz, grid.pz, nz);
+
+			allocate<mem>(&ex, nx); mcopy<mem, mem>(ex, grid.ex, nx);
+			allocate<mem>(&ey, ny); mcopy<mem, mem>(ey, grid.ey, ny);
+			allocate<mem>(&ez, nz); mcopy<mem, mem>(ez, grid.ez, nz);
+		}
+	}
+
+	template< typename T, memType mem >
+	Grid3d< T, mem > :: ~Grid3d(
+	)
+	{
+		if (size > 0) {
+
+			deallocate<mem>(px); deallocate<mem>(py); deallocate<mem>(pz);
+			deallocate<mem>(ex); deallocate<mem>(ey); deallocate<mem>(ez);
+		}
+
+	}
+
+	template< typename T, memType mem >
+	int Grid3d< T, mem > ::dim_size(const nse_const3d::axisType axis) const
+	{
+		switch (axis)
+		{
+		case nse_const3d::axisX: return nx;
+		case nse_const3d::axisY: return ny;
+		case nse_const3d::axisZ: return nz;
+		case nse_const3d::axisXY: return nx * ny;
+		case nse_const3d::axisXZ: return nx * nz;
+		case nse_const3d::axisYZ: return nyz;
+		case nse_const3d::axisXYZ: return size;
+		default: return -1;
+		}
+	}
+
+	template< typename T, memType mem >
+	int Grid3d< T, mem > ::mpi_dim_size(const nse_const3d::axisType axis) const
+	{
+		switch (axis)
+		{
+		case nse_const3d::axisX: return mpi_nx;
+		case nse_const3d::axisY: return mpi_ny;
+		case nse_const3d::axisZ: return mpi_nz;
+		case nse_const3d::axisXY: return mpi_nxy;
+		case nse_const3d::axisXZ: return mpi_nxz;
+		case nse_const3d::axisYZ: return mpi_nyz;
+		case nse_const3d::axisXYZ: return mpi_size;
+		default: return -1;
+		}
+	}
+
+	template< typename T, memType mem >
+	int Grid3d< T, mem > ::ghost_region_size(const nse_const3d::axisType axis) const
+	{
+		switch (axis)
+		{
+		case nse_const3d::axisX: return gcx;
+		case nse_const3d::axisY: return gcy;
+		case nse_const3d::axisZ: return gcz;
+		case nse_const3d::axisXY: return gcx * gcy;
+		case nse_const3d::axisXZ: return gcx * gcz;
+		case nse_const3d::axisYZ: return gcy * gcz;
+		case nse_const3d::axisXYZ: return gcx * gcy * gcz;
+		default: return -1;
+		}
+	}
+
	// local cell index i with ex[i] <= _x <= ex[i + 1] on rank 0 in -x,
	// ex[i] < _x <= ex[i + 1] on other ranks (so a point on an inner
	// process boundary is claimed by exactly one process); -1 on failure;
	// binary narrowing is used down to a small segment, then linear scan
	template< typename T, memType mem >
	inline int Grid3d< T, mem > ::locate_x(const T _x) const
	{
#ifndef EXCLUDE_GPU_BRANCH
		if (mem == memGPU) {
			if (mpi_com.rank_x == 0)
				return nse_gpu::locate(_x, ex, nx, gcx, 0);
			else
				return nse_gpu::locate(_x, ex, nx, gcx, 1);
		}
		else
#endif
		{	// memCPU //

#ifdef USE_GRID3D_BINARY_LOCATE
			const int min_binary_size = 16;
#endif

			int i;
			int ibeg = gcx, iend = nx - gcx - 1;	// interior cells only //


			if (mpi_com.rank_x == 0)
			{
#ifdef USE_GRID3D_BINARY_LOCATE	
				// closed left edge: segment [ex[i], ex[i + 1]] //
				while (iend - ibeg >= min_binary_size) {
					i = (ibeg + iend) / 2;
					if (_x < ex[i]) {
						iend = i - 1; continue;
					}
					if (_x > ex[i + 1]) {
						ibeg = i + 1; continue;
					}

					return i;
				}
#endif
				for (i = ibeg; i <= iend; i++)
					if ((_x >= ex[i]) && (_x <= ex[i + 1])) { return i; }
			}
			else
			{
#ifdef USE_GRID3D_BINARY_LOCATE
				// open left edge: segment (ex[i], ex[i + 1]] //
				while (iend - ibeg >= min_binary_size) {
					i = (ibeg + iend) / 2;
					if (_x <= ex[i]) {
						iend = i - 1; continue;
					}
					if (_x > ex[i + 1]) {
						ibeg = i + 1; continue;
					}

					return i;
				}
#endif
				for (i = ibeg; i <= iend; i++)
					if ((_x > ex[i]) && (_x <= ex[i + 1])) { return i; }
			}

			return -1;	// outside of the local domain //
		}
	}
+
	// local cell index j with ey[j] <= _y <= ey[j + 1] on rank 0 in -y,
	// ey[j] < _y <= ey[j + 1] on other ranks (so a point on an inner
	// process boundary is claimed by exactly one process); -1 on failure;
	// binary narrowing is used down to a small segment, then linear scan
	template< typename T, memType mem >
	inline int Grid3d< T, mem > ::locate_y(const T _y) const
	{
#ifndef EXCLUDE_GPU_BRANCH
		if (mem == memGPU) {
			if (mpi_com.rank_y == 0)
				return nse_gpu::locate(_y, ey, ny, gcy, 0);
			else
				return nse_gpu::locate(_y, ey, ny, gcy, 1);
		}
		else
#endif
		{	// memCPU //

#ifdef USE_GRID3D_BINARY_LOCATE
			const int min_binary_size = 16;
#endif

			int j;
			int jbeg = gcy, jend = ny - gcy - 1;	// interior cells only //


			if (mpi_com.rank_y == 0) {
#ifdef USE_GRID3D_BINARY_LOCATE

				// closed left edge: segment [ey[j], ey[j + 1]] //
				while (jend - jbeg >= min_binary_size) {
					j = (jbeg + jend) / 2;
					if (_y < ey[j]) {
						jend = j - 1; continue;
					}
					if (_y > ey[j + 1]) {
						jbeg = j + 1; continue;
					}

					return j;
				}
#endif
				for (j = jbeg; j <= jend; j++)
					if ((_y >= ey[j]) && (_y <= ey[j + 1])) { return j; }
			}
			else
			{
#ifdef USE_GRID3D_BINARY_LOCATE
				// open left edge: segment (ey[j], ey[j + 1]] //
				while (jend - jbeg >= min_binary_size) {
					j = (jbeg + jend) / 2;
					if (_y <= ey[j]) {
						jend = j - 1; continue;
					}
					if (_y > ey[j + 1]) {
						jbeg = j + 1; continue;
					}

					return j;
				}
#endif
				for (j = jbeg; j <= jend; j++)
					if ((_y > ey[j]) && (_y <= ey[j + 1])) { return j; }
			}

			return -1;	// outside of the local domain //
		}
	}
+
+	// Locate the z-cell index k such that _z falls in [ez[k], ez[k+1]] within
+	// the local interior range [gcz, nz - gcz - 1]; returns -1 otherwise.
+	// Left edge is inclusive on the first rank along z and exclusive elsewhere
+	// (same subdomain-interface convention as locate_x).
+	template< typename T, memType mem >
+	inline int Grid3d< T, mem > ::locate_z(const T _z) const
+	{
+#ifndef EXCLUDE_GPU_BRANCH
+		if (mem == memGPU) {
+			if (mpi_com.rank_z == 0)
+				return nse_gpu::locate(_z, ez, nz, gcz, 0);
+			else
+				return nse_gpu::locate(_z, ez, nz, gcz, 1);
+		}
+		else
+#endif
+		{	// memCPU //
+
+#ifdef USE_GRID3D_BINARY_LOCATE
+			const int min_binary_size = 16;
+#endif
+
+			int k;
+			int kbeg = gcz, kend = nz - gcz - 1;
+
+
+			if (mpi_com.rank_z == 0) {
+#ifdef USE_GRID3D_BINARY_LOCATE
+				while (kend - kbeg >= min_binary_size) {
+					k = (kbeg + kend) / 2;
+					if (_z < ez[k]) {
+						kend = k - 1; continue;
+					}
+					if (_z > ez[k + 1]) {
+						kbeg = k + 1; continue;
+					}
+
+					return k;
+				}
+#endif
+				for (k = kbeg; k <= kend; k++)
+					if ((_z >= ez[k]) && (_z <= ez[k + 1])) { return k; }
+			}
+			else
+			{
+#ifdef USE_GRID3D_BINARY_LOCATE
+				while (kend - kbeg >= min_binary_size) {
+					k = (kbeg + kend) / 2;
+					if (_z <= ez[k]) {
+						kend = k - 1; continue;
+					}
+					if (_z > ez[k + 1]) {
+						kbeg = k + 1; continue;
+					}
+
+					return k;
+				}
+#endif
+				for (k = kbeg; k <= kend; k++)
+					if ((_z > ez[k]) && (_z <= ez[k + 1])) { return k; }
+			}
+
+			return -1;
+		}
+	}
+
+	// Windowed variant of locate_x: search only iwidth cells to each side of
+	// the expected index iexp (clamped to the interior range). Returns the
+	// located cell index or -1 if _x is not found inside the window.
+	template< typename T, memType mem >
+	inline int Grid3d< T, mem > ::locate_local_x(
+		const T _x, const int iexp, const int iwidth) const
+	{
+#ifdef USE_GRID3D_BINARY_LOCATE
+		const int min_binary_size = 16;
+#endif
+
+		int i;
+		int ibeg = max(iexp - iwidth, gcx);
+		int iend = min(iexp + iwidth, nx - gcx - 1);
+
+		// left edge inclusive on the first rank along x, exclusive otherwise
+		if (mpi_com.rank_x == 0) {
+
+#ifdef USE_GRID3D_BINARY_LOCATE
+			while (iend - ibeg >= min_binary_size) {
+				i = (ibeg + iend) / 2;
+				if (_x < ex[i]) {
+					iend = i - 1; continue;
+				}
+				if (_x > ex[i + 1]) {
+					ibeg = i + 1; continue;
+				}
+
+				return i;
+			}
+#endif
+			for (i = ibeg; i <= iend; i++)
+				if ((_x >= ex[i]) && (_x <= ex[i + 1])) { return i; }
+		}
+		else
+		{
+#ifdef USE_GRID3D_BINARY_LOCATE
+			while (iend - ibeg >= min_binary_size) {
+				i = (ibeg + iend) / 2;
+				if (_x <= ex[i]) {
+					iend = i - 1; continue;
+				}
+				if (_x > ex[i + 1]) {
+					ibeg = i + 1; continue;
+				}
+
+				return i;
+			}
+#endif
+			for (i = ibeg; i <= iend; i++)
+				if ((_x > ex[i]) && (_x <= ex[i + 1])) { return i; }
+		}
+
+		return -1;
+	}
+
+	// Windowed variant of locate_y: search only jwidth cells to each side of
+	// the expected index jexp (clamped to the interior range). Returns the
+	// located cell index or -1 if _y is not found inside the window.
+	template< typename T, memType mem >
+	inline int Grid3d< T, mem > ::locate_local_y(
+		const T _y, const int jexp, const int jwidth) const
+	{
+#ifdef USE_GRID3D_BINARY_LOCATE
+		const int min_binary_size = 16;
+#endif
+
+		int j;
+		int jbeg = max(jexp - jwidth, gcy);
+		int jend = min(jexp + jwidth, ny - gcy - 1);
+
+		// left edge inclusive on the first rank along y, exclusive otherwise
+		if (mpi_com.rank_y == 0) {
+#ifdef USE_GRID3D_BINARY_LOCATE
+			while (jend - jbeg >= min_binary_size) {
+				j = (jbeg + jend) / 2;
+				if (_y < ey[j]) {
+					jend = j - 1; continue;
+				}
+				if (_y > ey[j + 1]) {
+					jbeg = j + 1; continue;
+				}
+
+				return j;
+			}
+#endif
+			for (j = jbeg; j <= jend; j++)
+				if ((_y >= ey[j]) && (_y <= ey[j + 1])) { return j; }
+		}
+		else
+		{
+#ifdef USE_GRID3D_BINARY_LOCATE
+			while (jend - jbeg >= min_binary_size) {
+				j = (jbeg + jend) / 2;
+				if (_y <= ey[j]) {
+					jend = j - 1; continue;
+				}
+				if (_y > ey[j + 1]) {
+					jbeg = j + 1; continue;
+				}
+
+				return j;
+			}
+#endif
+			for (j = jbeg; j <= jend; j++)
+				if ((_y > ey[j]) && (_y <= ey[j + 1])) { return j; }
+		}
+
+		return -1;
+	}
+
+	// Windowed variant of locate_z: search only kwidth cells to each side of
+	// the expected index kexp (clamped to the interior range). Returns the
+	// located cell index or -1 if _z is not found inside the window.
+	template< typename T, memType mem >
+	inline int Grid3d< T, mem > ::locate_local_z(
+		const T _z, const int kexp, const int kwidth) const
+	{
+#ifdef USE_GRID3D_BINARY_LOCATE
+		const int min_binary_size = 16;
+#endif
+
+		int k;
+		int kbeg = max(kexp - kwidth, gcz);
+		int kend = min(kexp + kwidth, nz - gcz - 1);
+
+		// left edge inclusive on the first rank along z, exclusive otherwise
+		if (mpi_com.rank_z == 0) {
+#ifdef USE_GRID3D_BINARY_LOCATE
+			while (kend - kbeg >= min_binary_size) {
+				k = (kbeg + kend) / 2;
+				if (_z < ez[k]) {
+					kend = k - 1; continue;
+				}
+				if (_z > ez[k + 1]) {
+					kbeg = k + 1; continue;
+				}
+
+				return k;
+			}
+#endif
+			for (k = kbeg; k <= kend; k++)
+				if ((_z >= ez[k]) && (_z <= ez[k + 1])) { return k; }
+		}
+		else
+		{
+#ifdef USE_GRID3D_BINARY_LOCATE
+			while (kend - kbeg >= min_binary_size) {
+				k = (kbeg + kend) / 2;
+				if (_z <= ez[k]) {
+					kend = k - 1; continue;
+				}
+				if (_z > ez[k + 1]) {
+					kbeg = k + 1; continue;
+				}
+
+				return k;
+			}
+#endif
+			for (k = kbeg; k <= kend; k++)
+				if ((_z > ez[k]) && (_z <= ez[k + 1])) { return k; }
+		}
+
+		return -1;
+	}
+
+	// Batch variant: for each of the n points _x[m], linearly scan a window of
+	// +/- iwidth cells around the previous index ipos[m], then overwrite
+	// ipos[m] in place with the located cell index (or -1 if not found).
+	// OpenMP-parallel over points; same rank-dependent edge convention as
+	// locate_x (no binary narrowing in this variant).
+	template< typename T, memType mem >
+	inline void Grid3d< T, mem > ::locate_local_x(const T* _RESTRICT const _x,
+		int* _RESTRICT ipos, const int iwidth, const int n) const
+	{
+		int m, i;
+		int ibeg, iend, idx;
+
+		if (mpi_com.rank_x == 0) {
+#pragma omp parallel for private(m, i, ibeg, iend, idx) shared(ipos)
+			for (m = 0; m < n; m++) {
+
+				ibeg = max(ipos[m] - iwidth, gcx);
+				iend = min(ipos[m] + iwidth, nx - gcx - 1);
+
+				idx = -1;
+				for (i = ibeg; i <= iend; i++)
+					if ((_x[m] >= ex[i]) && (_x[m] <= ex[i + 1])) { idx = i; break; }
+				ipos[m] = idx;
+			}
+		}
+		else
+		{
+#pragma omp parallel for private(m, i, ibeg, iend, idx) shared(ipos)
+			for (m = 0; m < n; m++) {
+
+				ibeg = max(ipos[m] - iwidth, gcx);
+				iend = min(ipos[m] + iwidth, nx - gcx - 1);
+
+				idx = -1;
+				for (i = ibeg; i <= iend; i++)
+					if ((_x[m] > ex[i]) && (_x[m] <= ex[i + 1])) { idx = i; break; }
+				ipos[m] = idx;
+			}
+		}
+	}
+
+	// Batch variant along y: overwrite jpos[m] with the cell index containing
+	// _y[m], searched within +/- jwidth cells of the previous jpos[m]
+	// (-1 if not found). OpenMP-parallel over points.
+	template< typename T, memType mem >
+	inline void Grid3d< T, mem > ::locate_local_y(const T* _RESTRICT const _y,
+		int* _RESTRICT jpos, const int jwidth, const int n) const
+	{
+		int m, j;
+		int jbeg, jend, idx;
+
+		if (mpi_com.rank_y == 0) {
+#pragma omp parallel for private(m, j, jbeg, jend, idx) shared(jpos)
+			for (m = 0; m < n; m++) {
+
+				jbeg = max(jpos[m] - jwidth, gcy);
+				jend = min(jpos[m] + jwidth, ny - gcy - 1);
+
+				idx = -1;
+				for (j = jbeg; j <= jend; j++)
+					if ((_y[m] >= ey[j]) && (_y[m] <= ey[j + 1])) { idx = j; break; }
+				jpos[m] = idx;
+			}
+		}
+		else
+		{
+#pragma omp parallel for private(m, j, jbeg, jend, idx) shared(jpos)
+			for (m = 0; m < n; m++) {
+
+				jbeg = max(jpos[m] - jwidth, gcy);
+				jend = min(jpos[m] + jwidth, ny - gcy - 1);
+
+				idx = -1;
+				for (j = jbeg; j <= jend; j++)
+					if ((_y[m] > ey[j]) && (_y[m] <= ey[j + 1])) { idx = j; break; }
+				jpos[m] = idx;
+			}
+		}
+	}
+
+	// Batch variant along z: overwrite kpos[m] with the cell index containing
+	// _z[m], searched within +/- kwidth cells of the previous kpos[m]
+	// (-1 if not found). OpenMP-parallel over points.
+	template< typename T, memType mem >
+	inline void Grid3d< T, mem > ::locate_local_z(const T* _RESTRICT const _z,
+		int* _RESTRICT kpos, const int kwidth, const int n) const
+	{
+		int m, k;
+		int kbeg, kend, idx;
+
+		if (mpi_com.rank_z == 0) {
+#pragma omp parallel for private(m, k, kbeg, kend, idx) shared(kpos)
+			for (m = 0; m < n; m++) {
+
+				kbeg = max(kpos[m] - kwidth, gcz);
+				kend = min(kpos[m] + kwidth, nz - gcz - 1);
+
+				idx = -1;
+				for (k = kbeg; k <= kend; k++)
+					if ((_z[m] >= ez[k]) && (_z[m] <= ez[k + 1])) { idx = k; break; }
+				kpos[m] = idx;
+			}
+		}
+		else
+		{
+#pragma omp parallel for private(m, k, kbeg, kend, idx) shared(kpos)
+			for (m = 0; m < n; m++) {
+
+				kbeg = max(kpos[m] - kwidth, gcz);
+				kend = min(kpos[m] + kwidth, nz - gcz - 1);
+
+				idx = -1;
+				for (k = kbeg; k <= kend; k++)
+					if ((_z[m] > ez[k]) && (_z[m] <= ez[k + 1])) { idx = k; break; }
+				kpos[m] = idx;
+			}
+		}
+	}
+
+	// Map a global x index onto this rank's local index; returns -1 when the
+	// index is outside the global range or not owned by the local subdomain.
+	// Boundary ghost cells are accepted only on the first/last rank along x.
+	template< typename T, memType mem >
+	int Grid3d< T, mem > ::i_local_coord(const int i) const
+	{
+		// fix: original used '(i < 0) & (i > mpi_nx - 1)' — a bitwise AND of
+		// mutually exclusive conditions that is never true, so out-of-range
+		// global indices were never rejected here; a logical OR is intended
+		if ((i < 0) || (i > mpi_nx - 1)) return -1;
+
+		const int shmx = (mpi_com.rank_x == 0) ? gcx : 0;
+		const int shpx = (mpi_com.rank_x == mpi_com.size_x - 1) ? gcx : 0;
+
+		int ip = i - par_local_offset(mpi_nx, gcx,
+			mpi_com.rank_x, mpi_com.size_x);
+
+		return ((ip >= gcx - shmx) && (ip < nx - gcx + shpx)) ? ip : -1;
+	}
+
+	// Map a global y index onto this rank's local index; returns -1 when the
+	// index is outside the global range or not owned by the local subdomain.
+	// Boundary ghost cells are accepted only on the first/last rank along y.
+	template< typename T, memType mem >
+	int Grid3d< T, mem > ::j_local_coord(const int j) const
+	{
+		// fix: original tested '(j < 0) && (j > mpi_ny - 1)', which can never
+		// be true — the global-range check was a no-op; logical OR is intended
+		if ((j < 0) || (j > mpi_ny - 1)) return -1;
+
+		const int shmy = (mpi_com.rank_y == 0) ? gcy : 0;
+		const int shpy = (mpi_com.rank_y == mpi_com.size_y - 1) ? gcy : 0;
+
+		int jp = j - par_local_offset(mpi_ny, gcy,
+			mpi_com.rank_y, mpi_com.size_y);
+
+		return ((jp >= gcy - shmy) && (jp < ny - gcy + shpy)) ? jp : -1;
+	}
+
+	// Map a global z index onto this rank's local index; returns -1 when the
+	// index is outside the global range or not owned by the local subdomain.
+	// Boundary ghost cells are accepted only on the first/last rank along z.
+	template< typename T, memType mem >
+	int Grid3d< T, mem > ::k_local_coord(const int k) const
+	{
+		// fix: original tested '(k < 0) && (k > mpi_nz - 1)', which can never
+		// be true — the global-range check was a no-op; logical OR is intended
+		if ((k < 0) || (k > mpi_nz - 1)) return -1;
+
+		const int shmz = (mpi_com.rank_z == 0) ? gcz : 0;
+		const int shpz = (mpi_com.rank_z == mpi_com.size_z - 1) ? gcz : 0;
+
+		int kp = k - par_local_offset(mpi_nz, gcz,
+			mpi_com.rank_z, mpi_com.size_z);
+
+		return ((kp >= gcz - shmz) && (kp < nz - gcz + shpz)) ? kp : -1;
+	}
+
+	// Globally interpolate the cell-centered field X at point (_px,_py,_pz):
+	// each rank evaluates its local contribution and the results are combined
+	// with an MPI_SUM reduction (presumably only the owning rank contributes
+	// a non-zero value — confirm against c_interp).
+	template< typename T, memType mem >
+	T Grid3d< T, mem > ::mpi_c_interp(const T* X, const T _px, const T _py, const T _pz) const
+	{
+		const T local_value = c_interp(X, _px, _py, _pz);
+		return mpi_allreduce(local_value, MPI_SUM, mpi_com.comm);
+	}
+
+	// Globally interpolate the x-velocity field U at point (_px,_py,_pz);
+	// local contributions are combined with an MPI_SUM reduction.
+	template< typename T, memType mem >
+	T Grid3d< T, mem > ::mpi_u_interp(const T* U, const T _px, const T _py, const T _pz) const
+	{
+		const T local_value = u_interp(U, _px, _py, _pz);
+		return mpi_allreduce(local_value, MPI_SUM, mpi_com.comm);
+	}
+
+	// Globally interpolate the y-velocity field V at point (_px,_py,_pz);
+	// local contributions are combined with an MPI_SUM reduction.
+	template< typename T, memType mem >
+	T Grid3d< T, mem > ::mpi_v_interp(const T* V, const T _px, const T _py, const T _pz) const
+	{
+		const T local_value = v_interp(V, _px, _py, _pz);
+		return mpi_allreduce(local_value, MPI_SUM, mpi_com.comm);
+	}
+
+	// Globally interpolate the z-velocity field W at point (_px,_py,_pz);
+	// local contributions are combined with an MPI_SUM reduction.
+	template< typename T, memType mem >
+	T Grid3d< T, mem > ::mpi_w_interp(const T* W, const T _px, const T _py, const T _pz) const
+	{
+		const T local_value = w_interp(W, _px, _py, _pz);
+		return mpi_allreduce(local_value, MPI_SUM, mpi_com.comm);
+	}
+
+	// Extract an (nx x ny) horizontal slice of the cell-centered field X at
+	// height _pz, interpolating linearly between the two adjacent cell centers
+	// pz[kpos-1] and pz[kpos]. Only the rank whose local z-range contains _pz
+	// fills the interior of Pxy; the result is then broadcast over the
+	// z-communicator. If no rank owns _pz, Pxy stays zero on all ranks.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::c_slice_at_z(T* Pxy, const T* X, const T _pz) const
+	{
+		int i, j, k = locate_z(_pz);
+		int index, host_rank = -1;
+
+		null(Pxy, nx * ny);
+		if ((k >= gcz) && (k < nz - gcz)) {
+
+			// kpos is chosen so that _pz lies in [pz[kpos-1], pz[kpos]]
+			const int kpos = (_pz < pz[k]) ? k : k + 1;
+			const T alpha = (_pz - pz[kpos - 1]) / (pz[kpos] - pz[kpos - 1]);
+
+#pragma omp parallel for private( i, j, index ) shared( Pxy )
+			for (i = gcx; i < nx - gcx; i++) {
+				for (j = gcy; j < ny - gcy; j++) {
+					index = i * nyz + j * nz + kpos;
+
+					Pxy[i * ny + j] = ((T)1.0 - alpha) * X[index - 1] + alpha * X[index];
+				}
+			}
+
+			MPI_Comm_rank(mpi_com.comm_z, &host_rank);
+		}
+
+		// the owning rank (max over -1 elsewhere) broadcasts the slice
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_z);
+		if (host_rank >= 0)
+			mpi_broadcast(Pxy, nx * ny, host_rank, mpi_com.comm_z);
+	}
+
+	// Horizontal slice of the x-face field U at height _pz (linear in z
+	// between adjacent cell centers). The i loop runs one extra index
+	// (nx - gcx inclusive) to cover the additional x-face of the staggered
+	// field. Owner rank computes, then broadcasts over the z-communicator.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::u_slice_at_z(T* Pxy, const T* U, const T _pz) const
+	{
+		int i, j, k = locate_z(_pz);
+		int index, host_rank = -1;
+
+		null(Pxy, nx * ny);
+		if ((k >= gcz) && (k < nz - gcz)) {
+
+			const int kpos = (_pz < pz[k]) ? k : k + 1;
+			const T alpha = (_pz - pz[kpos - 1]) / (pz[kpos] - pz[kpos - 1]);
+
+#pragma omp parallel for private( i, j, index ) shared( Pxy )
+			for (i = gcx; i < nx - gcx + 1; i++) {
+				for (j = gcy; j < ny - gcy; j++) {
+					index = i * nyz + j * nz + kpos;
+
+					Pxy[i * ny + j] = ((T)1.0 - alpha) * U[index - 1] + alpha * U[index];
+				}
+			}
+
+			MPI_Comm_rank(mpi_com.comm_z, &host_rank);
+		}
+
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_z);
+		if (host_rank >= 0)
+			mpi_broadcast(Pxy, nx * ny, host_rank, mpi_com.comm_z);
+	}
+
+	// Horizontal slice of the y-face field V at height _pz (linear in z
+	// between adjacent cell centers). The j loop runs one extra index to
+	// cover the additional y-face of the staggered field. Owner rank
+	// computes, then broadcasts over the z-communicator.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::v_slice_at_z(T* Pxy, const T* V, const T _pz) const
+	{
+		int i, j, k = locate_z(_pz);
+		int index, host_rank = -1;
+
+		null(Pxy, nx * ny);
+		if ((k >= gcz) && (k < nz - gcz)) {
+
+			const int kpos = (_pz < pz[k]) ? k : k + 1;
+			const T alpha = (_pz - pz[kpos - 1]) / (pz[kpos] - pz[kpos - 1]);
+
+#pragma omp parallel for private( i, j, index ) shared( Pxy )
+			for (i = gcx; i < nx - gcx; i++) {
+				for (j = gcy; j < ny - gcy + 1; j++) {
+					index = i * nyz + j * nz + kpos;
+
+					Pxy[i * ny + j] = ((T)1.0 - alpha) * V[index - 1] + alpha * V[index];
+				}
+			}
+
+			MPI_Comm_rank(mpi_com.comm_z, &host_rank);
+		}
+
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_z);
+		if (host_rank >= 0)
+			mpi_broadcast(Pxy, nx * ny, host_rank, mpi_com.comm_z);
+	}
+
+	// Horizontal slice of the z-face field W at height _pz. Unlike the other
+	// variants, W is staggered in z, so interpolation uses the cell-edge
+	// coordinates ez between faces kpos-1 and kpos. Owner rank computes,
+	// then broadcasts over the z-communicator.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::w_slice_at_z(T* Pxy, const T* W, const T _pz) const
+	{
+		int i, j, k = locate_z(_pz);
+		int index, host_rank = -1;
+
+		null(Pxy, nx * ny);
+		if ((k >= gcz) && (k < nz - gcz))
+		{
+			const int kpos = k + 1;
+			const T alpha = (_pz - ez[kpos - 1]) / (ez[kpos] - ez[kpos - 1]);
+
+#pragma omp parallel for private( i, j, index ) shared( Pxy )
+			for (i = gcx; i < nx - gcx; i++) {
+				for (j = gcy; j < ny - gcy; j++) {
+					index = i * nyz + j * nz + kpos;
+
+					Pxy[i * ny + j] = ((T)1.0 - alpha) * W[index - 1] + alpha * W[index];
+				}
+			}
+
+			MPI_Comm_rank(mpi_com.comm_z, &host_rank);
+		}
+
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_z);
+		if (host_rank >= 0)
+			mpi_broadcast(Pxy, nx * ny, host_rank, mpi_com.comm_z);
+	}
+
+	// Extract an (nx x nz) vertical slice of the cell-centered field X at
+	// y = _py, interpolating linearly between the adjacent cell centers
+	// py[jpos-1] and py[jpos] (the y-neighbor is offset by nz in the flat
+	// index). Owner rank computes, then broadcasts over the y-communicator.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::c_slice_at_y(T* Pxz, const T* X, const T _py) const
+	{
+		int i, j = locate_y(_py), k;
+		int index, host_rank = -1;
+
+		null(Pxz, nx * nz);
+		if ((j >= gcy) && (j < ny - gcy)) {
+
+			const int jpos = (_py < py[j]) ? j : j + 1;
+			const T alpha = (_py - py[jpos - 1]) / (py[jpos] - py[jpos - 1]);
+
+#pragma omp parallel for private( i, k, index ) shared( Pxz )
+			for (i = gcx; i < nx - gcx; i++) {
+				for (k = gcz; k < nz - gcz; k++) {
+					index = i * nyz + jpos * nz + k;
+
+					Pxz[i * nz + k] = ((T)1.0 - alpha) * X[index - nz] + alpha * X[index];
+				}
+			}
+
+			MPI_Comm_rank(mpi_com.comm_y, &host_rank);
+		}
+
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_y);
+		if (host_rank >= 0)
+			mpi_broadcast(Pxz, nx * nz, host_rank, mpi_com.comm_y);
+	}
+
+	// Vertical (nx x nz) slice of the x-face field U at y = _py; linear in y
+	// between adjacent cell centers. The i loop runs one extra index for the
+	// additional x-face of the staggered field. Owner rank computes, then
+	// broadcasts over the y-communicator.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::u_slice_at_y(T* Pxz, const T* U, const T _py) const
+	{
+		int i, j = locate_y(_py), k;
+		int index, host_rank = -1;
+
+		null(Pxz, nx * nz);
+		if ((j >= gcy) && (j < ny - gcy)) {
+
+			const int jpos = (_py < py[j]) ? j : j + 1;
+			const T alpha = (_py - py[jpos - 1]) / (py[jpos] - py[jpos - 1]);
+
+#pragma omp parallel for private( i, k, index ) shared( Pxz )
+			for (i = gcx; i < nx - gcx + 1; i++) {
+				for (k = gcz; k < nz - gcz; k++) {
+					index = i * nyz + jpos * nz + k;
+
+					Pxz[i * nz + k] = ((T)1.0 - alpha) * U[index - nz] + alpha * U[index];
+				}
+			}
+
+			MPI_Comm_rank(mpi_com.comm_y, &host_rank);
+		}
+
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_y);
+		if (host_rank >= 0)
+			mpi_broadcast(Pxz, nx * nz, host_rank, mpi_com.comm_y);
+	}
+
+	// Vertical (nx x nz) slice of the y-face field V at y = _py. V is
+	// staggered in y, so interpolation uses the cell-edge coordinates ey
+	// between faces jpos-1 and jpos. Owner rank computes, then broadcasts
+	// over the y-communicator.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::v_slice_at_y(T* Pxz, const T* V, const T _py) const
+	{
+		int i, j = locate_y(_py), k;
+		int index, host_rank = -1;
+
+		null(Pxz, nx * nz);
+		if ((j >= gcy) && (j < ny - gcy)) {
+
+			const int jpos = j + 1;
+			const T alpha = (_py - ey[jpos - 1]) / (ey[jpos] - ey[jpos - 1]);
+
+#pragma omp parallel for private( i, k, index ) shared( Pxz )
+			for (i = gcx; i < nx - gcx; i++) {
+				for (k = gcz; k < nz - gcz; k++) {
+					index = i * nyz + jpos * nz + k;
+
+					Pxz[i * nz + k] = ((T)1.0 - alpha) * V[index - nz] + alpha * V[index];
+				}
+			}
+
+			MPI_Comm_rank(mpi_com.comm_y, &host_rank);
+		}
+
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_y);
+		if (host_rank >= 0)
+			mpi_broadcast(Pxz, nx * nz, host_rank, mpi_com.comm_y);
+	}
+
+	// Vertical (nx x nz) slice of the z-face field W at y = _py; linear in y
+	// between adjacent cell centers. The k loop runs one extra index for the
+	// additional z-face of the staggered field. Owner rank computes, then
+	// broadcasts over the y-communicator.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::w_slice_at_y(T* Pxz, const T* W, const T _py) const
+	{
+		int i, j = locate_y(_py), k;
+		int index, host_rank = -1;
+
+		null(Pxz, nx * nz);
+		if ((j >= gcy) && (j < ny - gcy)) {
+
+			const int jpos = (_py < py[j]) ? j : j + 1;
+			const T alpha = (_py - py[jpos - 1]) / (py[jpos] - py[jpos - 1]);
+
+#pragma omp parallel for private( i, k, index ) shared( Pxz )
+			for (i = gcx; i < nx - gcx; i++) {
+				for (k = gcz; k < nz - gcz + 1; k++) {
+					index = i * nyz + jpos * nz + k;
+
+					Pxz[i * nz + k] = ((T)1.0 - alpha) * W[index - nz] + alpha * W[index];
+				}
+			}
+
+			MPI_Comm_rank(mpi_com.comm_y, &host_rank);
+		}
+
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_y);
+		if (host_rank >= 0)
+			mpi_broadcast(Pxz, nx * nz, host_rank, mpi_com.comm_y);
+	}
+
+	// Extract an (ny x nz) vertical slice of the cell-centered field X at
+	// x = _px, interpolating linearly between the adjacent cell centers
+	// px[ipos-1] and px[ipos] (the x-neighbor is offset by nyz in the flat
+	// index). Owner rank computes, then broadcasts over the x-communicator.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::c_slice_at_x(T* Pyz, const T* X, const T _px) const
+	{
+		int i = locate_x(_px), j, k;
+		int index, host_rank = -1;
+
+		null(Pyz, ny * nz);
+		if ((i >= gcx) && (i < nx - gcx)) {
+
+			const int ipos = (_px < px[i]) ? i : i + 1;
+			const T alpha = (_px - px[ipos - 1]) / (px[ipos] - px[ipos - 1]);
+
+#pragma omp parallel for private( j, k, index ) shared( Pyz )
+			for (j = gcy; j < ny - gcy; j++) {
+				for (k = gcz; k < nz - gcz; k++) {
+					index = ipos * nyz + j * nz + k;
+
+					Pyz[j * nz + k] = ((T)1.0 - alpha) * X[index - nyz] + alpha * X[index];
+				}
+			}
+
+			MPI_Comm_rank(mpi_com.comm_x, &host_rank);
+		}
+
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_x);
+		if (host_rank >= 0)
+			mpi_broadcast(Pyz, ny * nz, host_rank, mpi_com.comm_x);
+	}
+
+	// Vertical (ny x nz) slice of the x-face field U at x = _px. U is
+	// staggered in x, so interpolation uses the cell-edge coordinates ex
+	// between faces ipos-1 and ipos. Owner rank computes, then broadcasts
+	// over the x-communicator.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::u_slice_at_x(T* Pyz, const T* U, const T _px) const
+	{
+		int i = locate_x(_px), j, k;
+		int index, host_rank = -1;
+
+		null(Pyz, ny * nz);
+		if ((i >= gcx) && (i < nx - gcx)) {
+
+			const int ipos = i + 1;
+			const T alpha = (_px - ex[ipos - 1]) / (ex[ipos] - ex[ipos - 1]);
+
+#pragma omp parallel for private( j, k, index ) shared( Pyz )
+			for (j = gcy; j < ny - gcy; j++) {
+				for (k = gcz; k < nz - gcz; k++) {
+					index = ipos * nyz + j * nz + k;
+
+					Pyz[j * nz + k] = ((T)1.0 - alpha) * U[index - nyz] + alpha * U[index];
+				}
+			}
+
+			MPI_Comm_rank(mpi_com.comm_x, &host_rank);
+		}
+
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_x);
+		if (host_rank >= 0)
+			mpi_broadcast(Pyz, ny * nz, host_rank, mpi_com.comm_x);
+	}
+
+	// Vertical (ny x nz) slice of the y-face field V at x = _px; linear in x
+	// between adjacent cell centers. The j loop runs one extra index for the
+	// additional y-face of the staggered field. Owner rank computes, then
+	// broadcasts over the x-communicator.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::v_slice_at_x(T* Pyz, const T* V, const T _px) const
+	{
+		int i = locate_x(_px), j, k;
+		int index, host_rank = -1;
+
+		null(Pyz, ny * nz);
+		if ((i >= gcx) && (i < nx - gcx)) {
+
+			const int ipos = (_px < px[i]) ? i : i + 1;
+			const T alpha = (_px - px[ipos - 1]) / (px[ipos] - px[ipos - 1]);
+
+#pragma omp parallel for private( j, k, index ) shared( Pyz )
+			for (j = gcy; j < ny - gcy + 1; j++) {
+				for (k = gcz; k < nz - gcz; k++) {
+					index = ipos * nyz + j * nz + k;
+
+					Pyz[j * nz + k] = ((T)1.0 - alpha) * V[index - nyz] + alpha * V[index];
+				}
+			}
+
+			MPI_Comm_rank(mpi_com.comm_x, &host_rank);
+		}
+
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_x);
+		if (host_rank >= 0)
+			mpi_broadcast(Pyz, ny * nz, host_rank, mpi_com.comm_x);
+	}
+
+	// Vertical (ny x nz) slice of the z-face field W at x = _px; linear in x
+	// between adjacent cell centers. The k loop runs one extra index for the
+	// additional z-face of the staggered field. Owner rank computes, then
+	// broadcasts over the x-communicator.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::w_slice_at_x(T* Pyz, const T* W, const T _px) const
+	{
+		int i = locate_x(_px), j, k;
+		int index, host_rank = -1;
+
+		null(Pyz, ny * nz);
+		if ((i >= gcx) && (i < nx - gcx)) {
+
+			const int ipos = (_px < px[i]) ? i : i + 1;
+			const T alpha = (_px - px[ipos - 1]) / (px[ipos] - px[ipos - 1]);
+
+#pragma omp parallel for private( j, k, index ) shared( Pyz )
+			for (j = gcy; j < ny - gcy; j++) {
+				for (k = gcz; k < nz - gcz + 1; k++) {
+					index = ipos * nyz + j * nz + k;
+
+					Pyz[j * nz + k] = ((T)1.0 - alpha) * W[index - nyz] + alpha * W[index];
+				}
+			}
+
+			MPI_Comm_rank(mpi_com.comm_x, &host_rank);
+		}
+
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_x);
+		if (host_rank >= 0)
+			mpi_broadcast(Pyz, ny * nz, host_rank, mpi_com.comm_x);
+	}
+
+	// Gather the distributed array (in) onto rank (host), dispatching to the
+	// communicator routine that matches the requested axis, plane or full
+	// volume; unrecognized axis values are silently ignored.
+	template< typename T, memType mem >
+	template< memType memOUT, memType memIN, typename Tin >
+	void Grid3d< T, mem >::mpi_gather(Tin* _RESTRICT out, const Tin* _RESTRICT in,
+		const int host, const nse_const3d::axisType axis) const
+	{
+		if (axis == nse_const3d::axisX) {
+			mpi_com.gather_x<memOUT, memIN>(out, in, host, nx, gcx);
+		}
+		else if (axis == nse_const3d::axisY) {
+			mpi_com.gather_y<memOUT, memIN>(out, in, host, ny, gcy);
+		}
+		else if (axis == nse_const3d::axisZ) {
+			mpi_com.gather_z<memOUT, memIN>(out, in, host, nz, gcz);
+		}
+		else if (axis == nse_const3d::axisXY) {
+			mpi_com.gather_xy<memOUT, memIN>(out, in, host, nx, ny, gcx, gcy);
+		}
+		else if (axis == nse_const3d::axisXZ) {
+			mpi_com.gather_xz<memOUT, memIN>(out, in, host, nx, nz, gcx, gcz);
+		}
+		else if (axis == nse_const3d::axisYZ) {
+			mpi_com.gather_yz<memOUT, memIN>(out, in, host, ny, nz, gcy, gcz);
+		}
+		else if (axis == nse_const3d::axisXYZ) {
+			mpi_com.gather<memOUT, memIN>(out, in, host, nx, ny, nz, gcx, gcy, gcz);
+		}
+	}
+
+	// Scatter the array (in) from rank (host) to the distributed array (out),
+	// dispatching to the communicator routine that matches the requested
+	// axis, plane or full volume; unrecognized axis values are ignored.
+	template< typename T, memType mem >
+	template< memType memOUT, memType memIN, typename Tin >
+	void Grid3d< T, mem >::mpi_scatter(Tin* _RESTRICT out, const Tin* _RESTRICT in,
+		const int host, const nse_const3d::axisType axis) const
+	{
+		if (axis == nse_const3d::axisX) {
+			mpi_com.scatter_x<memOUT, memIN>(out, in, host, nx, gcx);
+		}
+		else if (axis == nse_const3d::axisY) {
+			mpi_com.scatter_y<memOUT, memIN>(out, in, host, ny, gcy);
+		}
+		else if (axis == nse_const3d::axisZ) {
+			mpi_com.scatter_z<memOUT, memIN>(out, in, host, nz, gcz);
+		}
+		else if (axis == nse_const3d::axisXY) {
+			mpi_com.scatter_xy<memOUT, memIN>(out, in, host, nx, ny, gcx, gcy);
+		}
+		else if (axis == nse_const3d::axisXZ) {
+			mpi_com.scatter_xz<memOUT, memIN>(out, in, host, nx, nz, gcx, gcz);
+		}
+		else if (axis == nse_const3d::axisYZ) {
+			mpi_com.scatter_yz<memOUT, memIN>(out, in, host, ny, nz, gcy, gcz);
+		}
+		else if (axis == nse_const3d::axisXYZ) {
+			mpi_com.scatter<memOUT, memIN>(out, in, host, nx, ny, nz, gcx, gcy, gcz);
+		}
+	}
+
+	// Gather one axis of cell-center coordinates (px/py/pz) onto rank (host);
+	// only single-axis selectors are supported, anything else is a no-op.
+	template< typename T, memType mem  >
+	template< memType memOUT >
+	void Grid3d< T, mem >::mpi_gather_center_coord(T* _RESTRICT out,
+		const int host, const nse_const3d::axisType axis) const
+	{
+		if (axis == nse_const3d::axisX) {
+			mpi_com.gather_x<memOUT, mem>(out, px, host, nx, gcx); return;
+		}
+		if (axis == nse_const3d::axisY) {
+			mpi_com.gather_y<memOUT, mem>(out, py, host, ny, gcy); return;
+		}
+		if (axis == nse_const3d::axisZ) {
+			mpi_com.gather_z<memOUT, mem>(out, pz, host, nz, gcz); return;
+		}
+	}
+
+	// Gather one axis of cell-edge coordinates (ex/ey/ez) onto rank (host);
+	// only single-axis selectors are supported, anything else is a no-op.
+	template< typename T, memType mem >
+	template< memType memOUT >
+	void Grid3d< T, mem >::mpi_gather_edge_coord(T* _RESTRICT out,
+		const int host, const nse_const3d::axisType axis) const
+	{
+		if (axis == nse_const3d::axisX) {
+			mpi_com.gather_x<memOUT, mem>(out, ex, host, nx, gcx); return;
+		}
+		if (axis == nse_const3d::axisY) {
+			mpi_com.gather_y<memOUT, mem>(out, ey, host, ny, gcy); return;
+		}
+		if (axis == nse_const3d::axisZ) {
+			mpi_com.gather_z<memOUT, mem>(out, ez, host, nz, gcz); return;
+		}
+	}
+
+	// Gather all three axes of cell-center coordinates onto rank (host),
+	// delegating to the per-axis overload for x, y and z in turn.
+	template< typename T, memType mem >
+	template< memType memOUT >
+	void Grid3d< T, mem >::mpi_gather_center_coord(T* _RESTRICT xout, T* _RESTRICT yout, T* _RESTRICT zout,
+		const int host) const
+	{
+		T* targets[3] = { xout, yout, zout };
+		const nse_const3d::axisType axes[3] =
+			{ nse_const3d::axisX, nse_const3d::axisY, nse_const3d::axisZ };
+
+		for (int d = 0; d < 3; d++)
+			mpi_gather_center_coord<memOUT>(targets[d], host, axes[d]);
+	}
+
+	// Gather all three axes of cell-edge coordinates onto rank (host),
+	// delegating to the per-axis overload for x, y and z in turn.
+	template< typename T, memType mem >
+	template< memType memOUT >
+	void Grid3d< T, mem >::mpi_gather_edge_coord(T* _RESTRICT xout, T* _RESTRICT yout, T* _RESTRICT zout,
+		const int host) const
+	{
+		T* targets[3] = { xout, yout, zout };
+		const nse_const3d::axisType axes[3] =
+			{ nse_const3d::axisX, nse_const3d::axisY, nse_const3d::axisZ };
+
+		for (int d = 0; d < 3; d++)
+			mpi_gather_edge_coord<memOUT>(targets[d], host, axes[d]);
+	}
+
+	// Fill (id) with the full 3D domain extents and global grid sizes /
+	// ghost-cell widths of this grid, for use as a file stamp. Note: the
+	// GridId setter call order (dim count, then domain, then grid) is kept
+	// as-is — presumably GridId expects this sequence; confirm before reordering.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::set_id(GridId< T >& id) const
+	{
+		id.init();
+		id.set_dim_num(3);	// 3D //
+
+							// domain definition //
+		id.set_domain_dim(1, mpi_x, mpi_length);
+		id.set_domain_dim(2, mpi_y, mpi_width);
+		id.set_domain_dim(3, mpi_z, mpi_height);
+
+		// grid definition //
+		id.set_grid_dim(1, mpi_nx, gcx);
+		id.set_grid_dim(2, mpi_ny, gcy);
+		id.set_grid_dim(3, mpi_nz, gcz);
+	}
+
+	// Validate that (id) describes a full 3D dataset.
+	template< typename T, memType mem >
+	bool Grid3d< T, mem >::check_id(const GridId< T >& id) const
+	{
+		const int expected_ndims = 3;
+		return id.check(expected_ndims);
+	}
+
+	// Compare the grid sizes and ghost-cell widths stored in (id) against
+	// this grid's global definition; true only if all three axes match.
+	template< typename T, memType mem >
+	bool Grid3d< T, mem >::check_id_dims(const GridId< T >& id) const
+	{
+		int dims[3], ghosts[3];
+		id.grid_dim(1, &dims[0], &ghosts[0]);
+		id.grid_dim(2, &dims[1], &ghosts[1]);
+		id.grid_dim(3, &dims[2], &ghosts[2]);
+
+		const bool size_ok =
+			(dims[0] == mpi_nx) && (dims[1] == mpi_ny) && (dims[2] == mpi_nz);
+		const bool ghost_ok =
+			(ghosts[0] == gcx) && (ghosts[1] == gcy) && (ghosts[2] == gcz);
+
+		return size_ok && ghost_ok;
+	}
+
+
+	// Fill (id) with the domain/grid definition restricted to (axis): the
+	// full 3D stamp for axisXYZ, a 2D plane for axisXY/XZ/YZ, and a single
+	// dimension for axisX/Y/Z. For any other axis value, (id) is left
+	// initialized but empty.
+	template< typename T, memType mem >
+	void Grid3d< T, mem >::set_id(GridId< T >& id, const nse_const3d::axisType axis) const
+	{
+		id.init();
+
+		// fix: the 3D delegation was guarded by 'axis == axisXY', which both
+		// duplicated the axisXY branch below and left axisXYZ unhandled;
+		// it must test axisXYZ
+		if (axis == nse_const3d::axisXYZ) set_id(id);
+		if (axis == nse_const3d::axisXY) {
+			
+			id.set_dim_num(2);	// 2D //
+
+			// domain definition //
+			id.set_domain_dim(1, mpi_x, mpi_length);
+			id.set_domain_dim(2, mpi_y, mpi_width);
+
+			// grid definition //
+			id.set_grid_dim(1, mpi_nx, gcx);
+			id.set_grid_dim(2, mpi_ny, gcy);
+		}
+		if (axis == nse_const3d::axisXZ) {
+
+			id.set_dim_num(2);	// 2D //
+
+			// domain definition //
+			id.set_domain_dim(1, mpi_x, mpi_length);
+			id.set_domain_dim(2, mpi_z, mpi_height);
+
+			// grid definition //
+			id.set_grid_dim(1, mpi_nx, gcx);
+			id.set_grid_dim(2, mpi_nz, gcz);
+		}
+		if (axis == nse_const3d::axisYZ) {
+
+			id.set_dim_num(2);	// 2D //
+
+			// domain definition //
+			id.set_domain_dim(1, mpi_y, mpi_width);
+			id.set_domain_dim(2, mpi_z, mpi_height);
+
+			// grid definition //
+			id.set_grid_dim(1, mpi_ny, gcy);
+			id.set_grid_dim(2, mpi_nz, gcz);
+		}
+		if (axis == nse_const3d::axisX) {
+
+			id.set_dim_num(1);	// 1D //
+
+			// domain definition //
+			id.set_domain_dim(1, mpi_x, mpi_length);
+
+			// grid definition //
+			id.set_grid_dim(1, mpi_nx, gcx);
+		}
+		if (axis == nse_const3d::axisY) {
+
+			id.set_dim_num(1);	// 1D //
+
+			// domain definition //
+			id.set_domain_dim(1, mpi_y, mpi_width);
+
+			// grid definition //
+			id.set_grid_dim(1, mpi_ny, gcy);
+		}
+		if (axis == nse_const3d::axisZ) {
+
+			id.set_dim_num(1);	// 1D //
+
+			// domain definition //
+			id.set_domain_dim(1, mpi_z, mpi_height);
+
+			// grid definition //
+			id.set_grid_dim(1, mpi_nz, gcz);
+		}
+	}
+
+	// Validate (id) against the dimensionality implied by (axis): single
+	// axes expect a 1D dataset, planes expect 2D, everything else 3D.
+	template< typename T, memType mem >
+	bool Grid3d< T, mem >::check_id(const GridId< T >& id, const nse_const3d::axisType axis) const
+	{
+		switch (axis)
+		{
+		case nse_const3d::axisX:
+		case nse_const3d::axisY:
+		case nse_const3d::axisZ:
+			return id.check(1);		// check - 1D //
+		case nse_const3d::axisXY:
+		case nse_const3d::axisXZ:
+		case nse_const3d::axisYZ:
+			return id.check(2);		// check - 2D //
+		default:
+			return id.check(3);		// check - 3D //
+		}
+	}
+
+	// Compare the grid sizes / ghost widths stored in (id) against this grid,
+	// restricted to the dimensions selected by (axis): 1D for single axes,
+	// 2D for planes (id dimension 1 holds the first axis of the pair), and
+	// the full 3D comparison for any other axis value.
+	template< typename T, memType mem >
+	bool Grid3d< T, mem >::check_id_dims(const GridId< T >& id, const nse_const3d::axisType axis) const
+	{
+		if (axis == nse_const3d::axisX) {
+			int nxid, gcxid;
+			id.grid_dim(1, &nxid, &gcxid);
+
+			return ((mpi_nx == nxid) && (gcx == gcxid));
+		}
+		if (axis == nse_const3d::axisY) {
+			int nyid, gcyid;
+			id.grid_dim(1, &nyid, &gcyid);
+
+			return ((mpi_ny == nyid) && (gcy == gcyid));
+		}
+		if (axis == nse_const3d::axisZ) {
+			int nzid, gczid;
+			id.grid_dim(1, &nzid, &gczid);
+
+			return ((mpi_nz == nzid) && (gcz == gczid));
+		}
+		if (axis == nse_const3d::axisXY) {
+			int nxid, nyid, gcxid, gcyid;
+			id.grid_dim(1, &nxid, &gcxid);
+			id.grid_dim(2, &nyid, &gcyid);
+
+			return (
+				(mpi_nx == nxid) && (mpi_ny == nyid) &&
+				(gcx == gcxid) && (gcy == gcyid));
+		}
+		if (axis == nse_const3d::axisXZ) {
+			int nxid, nzid, gcxid, gczid;
+			id.grid_dim(1, &nxid, &gcxid);
+			id.grid_dim(2, &nzid, &gczid);
+
+			return (
+				(mpi_nx == nxid) && (mpi_nz == nzid) &&
+				(gcx == gcxid) && (gcz == gczid));
+		}
+		if (axis == nse_const3d::axisYZ) {
+			int nyid, nzid, gcyid, gczid;
+			id.grid_dim(1, &nyid, &gcyid);
+			id.grid_dim(2, &nzid, &gczid);
+
+			return (
+				(mpi_ny == nyid) && (mpi_nz == nzid) &&
+				(gcy == gcyid) && (gcz == gczid));
+		}
+
+		// fallback: full 3D comparison (axisXYZ and any other value)
+		int nxid, nyid, nzid, gcxid, gcyid, gczid;
+		id.grid_dim(1, &nxid, &gcxid);
+		id.grid_dim(2, &nyid, &gcyid);
+		id.grid_dim(3, &nzid, &gczid);
+
+		return (
+			(mpi_nx == nxid) && (mpi_ny == nyid) && (mpi_nz == nzid) &&
+			(gcx == gcxid) && (gcy == gcyid) && (gcz == gczid));
+	}
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/io-base1d.h b/io-base1d.h
new file mode 100644
index 0000000000000000000000000000000000000000..96bfc9992fc568afe0ff395701e1e114405b9655
--- /dev/null
+++ b/io-base1d.h
@@ -0,0 +1,422 @@
+#pragma once
+
+// [io-base1d.h]: I/O: 1D arrays
+//		Tecplot output
+//
+// -------------------------------------------------------------------------------------------- //
+
+#define _CRT_SECURE_NO_DEPRECATE
+#include <stdio.h>
+#include <string>
+
+#include "io-misc.h"
+
+
namespace nse 
{
	// Tecplot ASCII output of 1D arrays (single zone, POINT packing) //
	template< typename T >
	int write_tecplot(const std::string& filename,
		const T* out, const T* cx,		// [array, coordinates]
		const int nx,					// size of [array] == [coordinates]
		const int ib, const int ie,		// output range [ib, ie] in [0, nx-1]

		const char* title,				// plot title
		const char* name,				// output array name
		const char* name_dimx,			// dimension name

		const T time);					// time marker

	template< typename T >
	int write_tecplot(const std::string& filename,
		const T* out, const T* cx,			// [array, coordinates]
		const T vscale, const T cscale,		// scaling of [array] & [coordinates] in output (values are divided by these)
		const int nx,						// size of [array] == [coordinates]
		const int ib, const int ie,			// output range [ib, ie] in [0, nx-1]

		const char* title,					// plot title
		const char* name,					// output array name
		const char* name_dimx,				// dimension name

		const T time);						// time marker

	template< typename T >
	int write_tecplot(const std::string& filename,
		T** out, const T* cx, const int nvar,	// [array[nvar], coordinates]
		const int nx,							// size of each [array] == [coordinates]
		const int ib, const int ie,				// output range [ib, ie] in [0, nx-1]

		const char* title,						// plot title
		const char** name,						// [nvar] names for output arrays
		const char* name_dimx,					// dimension name

		const T time);							// time marker

	template< typename T >
	int write_tecplot(const std::string& filename,
		T** out, const T* cx, const int nvar,	// [array[nvar], coordinates]
		const int nx,							// size of each [array] == [coordinates]
		const int ib, const int ie,				// output range [ib, ie] in [0, nx-1]

		const char* title,						// plot title
		const std::string* name,				// [nvar] names for output arrays
		const char* name_dimx,					// dimension name

		const T time);							// time marker
	// -------------------------------------------------------------------------------------------- //

	// Plain ASCII input: each reader allocates the output arrays itself
	// and sets (*n) to the number of values read; returns false on failure //
	template< typename T >
	bool read_plain_1d(const std::string& filename,
		T** F, int* n);
	template< typename T >
	bool read_plain_1d(const std::string& filename,
		T** F, T** coord, int* n);	// file rows: [coord, F] pairs //
	template< typename T >
	bool read_plain_1d(const std::string& filename,
		T** Re, T** Im, T** coord, int* n);	// file rows: [coord, Re, Im] triples //

	// MPI variants: rank 0 reads, result is broadcast to all ranks in [comm] //
	template< typename T >
	bool mpi_read_plain_1d(const std::string& filename,
		T** F, T** coord, int* n, const MPI_Comm comm);
	template< typename T >
	bool mpi_read_plain_1d(const std::string& filename,
		T** Re, T** Im, T** coord, int* n, const MPI_Comm comm);
	// -------------------------------------------------------------------------------------------- //
}
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+
+// Tecplot //
+template< typename T >
+int nse::write_tecplot(const std::string& filename,
+	const T* out, const T* cx,	// [array, coordinates]
+	const int nx, const int ib, const int ie,
+
+	const char* title, const char* name, const char* name_dimx,
+	const T time)
+{
+	if ((ib < 0) || (ie >= nx)) return 0;
+
+	FILE* ptr = fopen(filename.c_str(), "w");
+	if (ptr == NULL) return 0;
+
+	fprintf(ptr, " TITLE = \"%s [F(%s)]\"\n", title, name_dimx);
+	fprintf(ptr, " VARIABLES = \"%s\", \"%s\"\n", name_dimx, name);
+	fprintf(ptr, " ZONE I = %i, DATAPACKING = POINT, SOLUTIONTIME = %f\n", ie - ib + 1, time);
+
+	for (int i = ib; i <= ie; i++)
+		fprintf(ptr, "%e %e\n", cx[i], out[i]);
+
+	fclose(ptr);
+	return 1;
+}
+
+template< typename T >
+int nse::write_tecplot(const std::string& filename,
+	const T* out, const T* cx,	// [array, coordinates]
+	const T vscale, const T cscale,
+	const int nx, const int ib, const int ie,
+
+	const char* title, const char* name, const char* name_dimx,
+	const T time)
+{
+	if ((ib < 0) || (ie >= nx)) return 0;
+
+	FILE* ptr = fopen(filename.c_str(), "w");
+	if (ptr == NULL) return 0;
+
+	fprintf(ptr, " TITLE = \"%s [F(%s)]\"\n", title, name_dimx);
+	fprintf(ptr, " VARIABLES = \"%s\", \"%s\"\n", name_dimx, name);
+	fprintf(ptr, " ZONE I = %i, DATAPACKING = POINT, SOLUTIONTIME = %f\n",
+		ie - ib + 1, time);
+
+	const T ivscale = (T)1.0 / vscale;
+	const T icscale = (T)1.0 / cscale;
+	for (int i = ib; i <= ie; i++)
+		fprintf(ptr, "%e %e\n", cx[i] * icscale, out[i] * ivscale);
+
+	fclose(ptr);
+	return 1;
+}
+
+template< typename T >
+int nse::write_tecplot(const std::string& filename,
+	T** out, const T* cx, const int nvar,	// [array[nvar], coordinates]
+	const int nx, const int ib, const int ie,
+
+	const char* title, const char** name, const char* name_dimx,
+	const T time)
+{
+	if ((ib < 0) || (ie >= nx)) return 0;
+
+	FILE* ptr = fopen(filename.c_str(), "w");
+	if (ptr == NULL) return 0;
+
+	fprintf(ptr, " TITLE = \"%s [F_i(%s), i=1 .. %i]\"\n", title, name_dimx, nvar);
+	fprintf(ptr, " VARIABLES = \"%s\",", name_dimx);
+	for (int k = 0; k < nvar - 1; k++)
+		fprintf(ptr, " \"%s\",", name[k]);
+	fprintf(ptr, " \"%s\"\n", name[nvar - 1]);
+	fprintf(ptr, " ZONE I = %i, DATAPACKING = POINT, SOLUTIONTIME = %f\n",
+		ie - ib + 1, time);
+
+	for (int i = ib; i <= ie; i++)
+	{
+		fprintf(ptr, "%e", cx[i]);
+		for (int k = 0; k < nvar; k++)
+			fprintf(ptr, " %e", out[k][i]);
+		fprintf(ptr, "\n");
+	}
+
+	fclose(ptr);
+	return 1;
+}
+
+template< typename T >
+int nse::write_tecplot(const std::string& filename,
+	T** out, const T* cx, const int nvar,	// [array[nvar], coordinates]
+	const int nx, const int ib, const int ie,
+
+	const char* title, const std::string* name, const char* name_dimx,
+	const T time)
+{
+	if ((ib < 0) || (ie >= nx)) return 0;
+
+	FILE* ptr = fopen(filename.c_str(), "w");
+	if (ptr == NULL) return 0;
+
+	fprintf(ptr, " TITLE = \"%s [F_i(%s), i=1 .. %i]\"\n", title, name_dimx, nvar);
+	fprintf(ptr, " VARIABLES = \"%s\",", name_dimx);
+	for (int k = 0; k < nvar - 1; k++)
+		fprintf(ptr, " \"%s\",", name[k].c_str());
+	fprintf(ptr, " \"%s\"\n", name[nvar - 1].c_str());
+	fprintf(ptr, " ZONE I = %i, DATAPACKING = POINT, SOLUTIONTIME = %f\n",
+		ie - ib + 1, time);
+
+	for (int i = ib; i <= ie; i++)
+	{
+		fprintf(ptr, "%e", cx[i]);
+		for (int k = 0; k < nvar; k++)
+			fprintf(ptr, " %e", out[k][i]);
+		fprintf(ptr, "\n");
+	}
+
+	fclose(ptr);
+	return 1;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+bool nse::read_plain_1d(const std::string& filename,
+	T** F, int* n)
+{
+	FILE *ptr = fopen(filename.c_str(), "rt");
+	if (ptr == NULL) return false;
+
+	const int buf_alloc_size = 1024;
+	int buf_size = buf_alloc_size;
+	T *F_buf;
+	int buf_ptr = 0;
+	allocate(&F_buf, buf_size);
+
+	while (!feof(ptr)) {
+		if (fscanf(ptr, c_io_fmt<T>(), &F_buf[buf_ptr]) != 1) {
+			fclose(ptr);
+			deallocate(F_buf);
+			return false;
+		}
+
+		buf_ptr = buf_ptr + 1;
+		if (buf_ptr == buf_size) {
+			// allocate additional memory
+			T *F_mem;
+
+			buf_size += buf_alloc_size;
+			allocate(&F_mem, buf_size);
+
+			memcpy(F_mem, F_buf, sizeof(T) * buf_ptr);
+
+			deallocate(F_buf);
+			F_buf = F_mem;
+		}
+	}
+
+	(*n) = buf_ptr;
+	if ((*n) > 0) {
+		allocate(F, (*n));
+		memcpy(*F, F_buf, sizeof(T) * (*n));
+	}
+
+	deallocate(F_buf);
+
+	fclose(ptr);
+	return true;
+}
+
+template< typename T >
+bool nse::read_plain_1d(const std::string& filename,
+	T** F, T** coord, int* n)
+{
+	FILE *ptr = fopen(filename.c_str(), "rt");
+	if (ptr == NULL) return false;
+
+	const int buf_alloc_size = 1024;
+	int buf_size = buf_alloc_size;
+	T *F_buf, *c_buf;
+	int buf_ptr = 0;
+	allocate(&c_buf, &F_buf, buf_size);
+
+	while (!feof(ptr)) {
+		if (fscanf(ptr, c_io_fmt<T>(), &c_buf[buf_ptr]) != 1) {
+			fclose(ptr);
+			deallocate(c_buf, F_buf);
+			return false;
+		}
+		if (fscanf(ptr, c_io_fmt<T>(), &F_buf[buf_ptr]) != 1) {
+			fclose(ptr);
+			deallocate(c_buf, F_buf);
+			return false;
+		}
+
+		buf_ptr = buf_ptr + 1;
+		if (buf_ptr == buf_size) {
+			// allocate additional memory
+			T *c_mem, *F_mem;
+
+			buf_size += buf_alloc_size;
+			allocate(&c_mem, &F_mem, buf_size);
+
+			memcpy(c_mem, c_buf, sizeof(T) * buf_ptr);
+			memcpy(F_mem, F_buf, sizeof(T) * buf_ptr);
+
+			deallocate(c_buf, F_buf);
+			c_buf = c_mem;
+			F_buf = F_mem;
+		}
+	}
+
+	(*n) = buf_ptr;
+	if ((*n) > 0) {
+		allocate(coord, F, (*n));
+		memcpy(*coord, c_buf, sizeof(T) * (*n));
+		memcpy(*F, F_buf, sizeof(T) * (*n));
+	}
+
+	deallocate(c_buf, F_buf);
+
+	fclose(ptr);
+	return true;
+}
+
+template< typename T >
+bool nse::read_plain_1d(const std::string& filename,
+	T** Re, T** Im, T** coord, int* n)
+{
+	FILE *ptr = fopen(filename.c_str(), "rt");
+	if (ptr == NULL) return false;
+
+	const int buf_alloc_size = 1024;
+	int buf_size = buf_alloc_size;
+	T *Re_buf, *Im_buf, *c_buf;
+	int buf_ptr = 0;
+	allocate(&c_buf, &Re_buf, &Im_buf, buf_size);
+
+	while (!feof(ptr)) {
+		if (fscanf(ptr, c_io_fmt<T>(), &c_buf[buf_ptr]) != 1) {
+			fclose(ptr);
+			deallocate(c_buf, Re_buf, Im_buf);
+			return false;
+		}
+		if (fscanf(ptr, c_io_fmt<T>(), &Re_buf[buf_ptr]) != 1) {
+			fclose(ptr);
+			deallocate(c_buf, Re_buf, Im_buf);
+			return false;
+		}
+		if (fscanf(ptr, c_io_fmt<T>(), &Im_buf[buf_ptr]) != 1) {
+			fclose(ptr);
+			deallocate(c_buf, Re_buf, Im_buf);
+			return false;
+		}
+
+		buf_ptr = buf_ptr + 1;
+		if (buf_ptr == buf_size) {
+			// allocate additional memory
+			T *c_mem, *Re_mem, *Im_mem;
+
+			buf_size += buf_alloc_size;
+			allocate(&c_mem, &Re_mem, &Im_mem, buf_size);
+
+			memcpy(c_mem, c_buf, sizeof(T) * buf_ptr);
+			memcpy(Re_mem, Re_buf, sizeof(T) * buf_ptr);
+			memcpy(Im_mem, Im_buf, sizeof(T) * buf_ptr);
+
+			deallocate(c_buf, Re_buf, Im_buf);
+			c_buf = c_mem;
+			Re_buf = Re_mem;
+			Im_buf = Im_mem;
+		}
+	}
+
+	(*n) = buf_ptr;
+	if ((*n) > 0) {
+		allocate(coord, Re, Im, (*n));
+		memcpy(*coord, c_buf, sizeof(T) * (*n));
+		memcpy(*Re, Re_buf, sizeof(T) * (*n));
+		memcpy(*Im, Im_buf, sizeof(T) * (*n));
+	}
+
+	deallocate(c_buf, Re_buf, Im_buf);
+
+	fclose(ptr);
+	return true;
+}
+
template< typename T >
bool nse::mpi_read_plain_1d(const std::string& filename,
	T** F, T** coord, int* n, const MPI_Comm comm)
{
	// rank 0 reads [coordinate, value] pairs from an ASCII file; the result
	// is then broadcast so every rank in [comm] ends up with allocated,
	// filled (*F), (*coord) and the count (*n); collective call -- all ranks
	// must enter, and the broadcast sequence below must stay in this order
	bool status;	// receive buffer on ranks != 0, set by the broadcast below //

	int rank;
	MPI_Comm_rank(comm, &rank);
	if (rank == 0) {
		status = read_plain_1d(filename, F, coord, n);
	}
	mpi_broadcast(&status, 1, 0, comm);	// share read success/failure //
	if (!status) return false;	// uniform on all ranks: no deadlock //

	mpi_broadcast(n, 1, 0, comm);
	if (rank != 0)
		allocate_vnull(F, coord, *n);	// NOTE(review): presumably allocates (*n)-sized arrays -- confirm in nse-alloc.h

	mpi_broadcast(*F, *n, 0, comm);
	mpi_broadcast(*coord, *n, 0, comm);
	return true;
}
+
template< typename T >
bool nse::mpi_read_plain_1d(const std::string& filename,
	T** Re, T** Im, T** coord, int* n, const MPI_Comm comm)
{
	// rank 0 reads [coordinate, Re, Im] triples from an ASCII file; arrays
	// and count are then broadcast to every rank in [comm]; collective call --
	// all ranks must enter, and the broadcast order below must be preserved
	bool status;	// receive buffer on ranks != 0, set by the broadcast below //

	int rank;
	MPI_Comm_rank(comm, &rank);
	if (rank == 0) {
		status = read_plain_1d(filename, Re, Im, coord, n);
	}
	mpi_broadcast(&status, 1, 0, comm);	// share read success/failure //
	if (!status) return false;	// uniform on all ranks: no deadlock //

	mpi_broadcast(n, 1, 0, comm);
	if (rank != 0)
		allocate_vnull(Re, Im, coord, *n);	// NOTE(review): presumably allocates (*n)-sized arrays -- confirm in nse-alloc.h

	mpi_broadcast(*Re, *n, 0, comm);
	mpi_broadcast(*Im, *n, 0, comm);
	mpi_broadcast(*coord, *n, 0, comm);
	return true;
}
+
+// -------------------------------------------------------------------------------------------- //
diff --git a/io-base2d.h b/io-base2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a50249491947aa771792ff81073b62cd4bce01e
--- /dev/null
+++ b/io-base2d.h
@@ -0,0 +1,1262 @@
+#pragma once
+
+// [io-base2d.h]: I/O: 2D array
+//
+// -------------------------------------------------------------------------------------------- //
+
+#define _CRT_SECURE_NO_DEPRECATE
+#include <stdio.h>
+
+#include "nse-alloc.h"
+#include "grid-id.h"
+#include "io-misc.h"
+
+#include "bin-stamp.h"
+
namespace nse
{
	// Tecplot ASCII output of 2D arrays (single zone, POINT packing);
	// arrays are stored x-major: element (i,j) at [i * ny + j] //
	template< typename T >
	int write_tecplot(const std::string& filename,
		const T* out, const T* cx, const T* cy, // [array, coordinates]
		const int nx, const int ny,
		const int ib, const int ie,
		const int jb, const int je,

		const char* title, const char* name,
		const char* name_dimx, const char* name_dimy,
		const T time);

	template< typename T >
	int write_tecplot(const std::string& filename,
		T** out, const T* cx, const T* cy, const int nvar,	// [array[nvar], coordinates]
		const int nx, const int ny,
		const int ib, const int ie,
		const int jb, const int je,

		const char* title, const char** name,	// [nvar] names for output arrays
		const char* name_dimx, const char* name_dimy,
		const T time);

	// staggered vector field: components are averaged to cell centers //
	template< typename T >
	int write_tecplot(const std::string& filename,
		const T* uout, const T* vout, const T* cx, const T* cy, // [array, coordinates]
		const int nx, const int ny,
		const int ib, const int ie,
		const int jb, const int je,

		const char* title, const char* uname, const char* vname,
		const char* name_dimx, const char* name_dimy,
		const T time);
	// -------------------------------------------------------------------------------------------- //

	// Binary output/input: grid id header + coordinates + time marker,
	// followed by the payload (stamps or named fields) //
	template< typename T >
	int write_binary_stamp(const std::string& filename,
		const binStamp< int >& index_stamp,
		const binStamp< double >& cpu_stamp,

		const T* cx, const T* cy,
		const T* ex, const T* ey,
		const GridId<T>& id, const T time);

	template< typename T >
	int write_binary(const std::string& filename,
		const T* xin, const char* name,		// scalar field //

		const T* cx, const T* cy,
		const T* ex, const T* ey,
		const GridId<T>& id, const T time);

	template< typename T >
	int write_binary(const std::string& filename,
		const T* uin, const T* vin,			// vector field //
		const char* uname, const char* vname,

		const T* cx, const T* cy,
		const T* ex, const T* ey,
		const GridId<T>& id, const T time);


	// readers allocate coordinate/field/name arrays themselves //
	template< typename T >
	int read_binary_stamp(const std::string& filename,
		binStamp< int >& index_stamp,
		binStamp< double >& cpu_stamp,

		T** cx, T** cy, T** ex, T** ey,
		GridId<T>& id, T* time);

	template< typename T >
	int read_binary(const std::string& filename,
		T** xout, char** name,

		T** cx, T** cy, T** ex, T** ey,
		GridId<T>& id, T* time);

	template< typename T >
	int read_binary(const std::string& filename,
		T** uout, T** vout,
		char** uname, char** vname,

		T** cx, T** cy, T** ex, T** ey,
		GridId<T>& id, T* time);
	// -------------------------------------------------------------------------------------------- //

	// MPI-I/O 2D datatype: convenience wrappers over the N-dimensional
	// file/local view constructors //
	template< typename T >
	void mpi_io_write_datatype(MPI_Datatype* file_view, MPI_Datatype* local_view,

		const int mpi_nx, const int mpi_ny,
		const int nx, const int ny,
		const int gcx, const int gcy,
		const MPI_Comm comm_x, const MPI_Comm comm_y);

	template< typename T >
	void mpi_io_read_datatype(MPI_Datatype* file_view, MPI_Datatype* local_view,

		const int mpi_nx, const int mpi_ny,
		const int nx, const int ny,
		const int gcx, const int gcy,
		const MPI_Comm comm_x, const MPI_Comm comm_y);
	// -------------------------------------------------------------------------------------------- //

	// MPI-Binary //
	// [name, coordinates, time] - local, on head-rank
	// [array] - distributed
	// [id] - global, on all ranks in comm
	template< typename T >
	int mpi_write_binary(const std::string& filename,
		const char* mpi_datarep,
		const MPI_Comm comm, const int header_rank,
		const MPI_Comm comm_x, const MPI_Comm comm_y,

		const T* xin, const char* name,

		const T* cx, const T* cy,
		const T* ex, const T* ey,
		const GridId< T >& id, const T time);

	template< typename T >
	int mpi_write_binary(const std::string& filename,
		const char* mpi_datarep,
		const MPI_Comm comm, const int header_rank,
		const MPI_Comm comm_x, const MPI_Comm comm_y,

		const T* uin, const T* vin,
		const char* uname, const char* vname,

		const T* cx, const T* cy,
		const T* ex, const T* ey,
		const GridId< T >& id, const T time);


	template< typename T >
	int mpi_read_binary(const std::string& filename,
		const char* mpi_datarep,
		const MPI_Comm comm, const int header_rank,
		const MPI_Comm comm_x, const MPI_Comm comm_y,

		T** xout, char** name,

		T** cx, T** cy, T** ex, T** ey,
		GridId< T >& id, T* time);

	template< typename T >
	int mpi_read_binary(const std::string& filename,
		const char* mpi_datarep,
		const MPI_Comm comm, const int header_rank,
		const MPI_Comm comm_x, const MPI_Comm comm_y,

		T** uout, T** vout,
		char** uname, char** vname,

		T** cx, T** cy, T** ex, T** ey,
		GridId< T >& id, T* time);
	// -------------------------------------------------------------------------------------------- //


	// plain ASCII 2D table readers (allocate (*F) themselves) //
	template< typename T >
	bool read_plain_2d(const std::string& filename,
		T** F, int* nx, int* ny);

	template< typename T >
	bool mpi_read_plain_2d(const std::string& filename,
		T** F, int* nx, int* ny, const MPI_Comm comm);
	// -------------------------------------------------------------------------------------------- //
}
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+
+// Tecplot //
+template< typename T >
+int nse::write_tecplot(const std::string& filename,
+	const T* out, const T* cx, const T* cy, // [array, coordinates]
+	const int nx, const int ny,
+	const int ib, const int ie,
+	const int jb, const int je,
+
+	const char* title, const char* name,
+	const char* name_dimx, const char* name_dimy,
+	const T time)
+{
+	if ((ib < 0) || (jb < 0) || (ie >= nx) || (je >= ny)) return 0;
+
+	FILE* ptr = fopen(filename.c_str(), "w");
+	if (ptr == NULL) return 0;
+
+	fprintf(ptr, " TITLE = \"%s [F(%s,%s)]\"\n", title, name_dimx, name_dimy);
+	fprintf(ptr, " VARIABLES = \"%s\", \"%s\", \"%s\"\n",
+		name_dimx, name_dimy, name);
+	fprintf(ptr, " ZONE I = %i, J = %i, DATAPACKING = POINT, SOLUTIONTIME = %f\n",
+		ie - ib + 1, je - jb + 1, time);
+
+	int i, j, idx;
+	for (j = jb; j <= je; j++)
+		for (i = ib; i <= ie; i++)
+		{
+			idx = i * ny + j;
+			fprintf(ptr, "%f %f %f\n", cx[i], cy[j], out[idx]);
+		}
+
+	fclose(ptr);
+	return 1;
+}
+
+template< typename T >
+int nse::write_tecplot(const std::string& filename,
+	T** out, const T* cx, const T* cy, const int nvar, // [array[nvar], coordinates]
+	const int nx, const int ny,
+	const int ib, const int ie,
+	const int jb, const int je,
+
+	const char* title, const char** name,
+	const char* name_dimx, const char* name_dimy,
+	const T time)
+{
+	if ((ib < 0) || (jb < 0) || (ie >= nx) || (je >= ny)) return 0;
+
+	FILE* ptr = fopen(filename.c_str(), "w");
+	if (ptr == NULL) return 0;
+
+	fprintf(ptr, " TITLE = \"%s [F_i(%s,%s), i=1 .. %i]\"\n", title, name_dimx, name_dimy, nvar);
+	fprintf(ptr, " VARIABLES = \"%s\", \"%s\",", name_dimx, name_dimy);
+	for (int k = 0; k < nvar - 1; k++)
+		fprintf(ptr, " \"%s\",", name[k]);
+	fprintf(ptr, " \"%s\"\n", name[nvar - 1]);
+	fprintf(ptr, " ZONE I = %i, J = %i, DATAPACKING = POINT, SOLUTIONTIME = %f\n",
+		ie - ib + 1, je - jb + 1, time);
+
+	int i, j, idx;
+	for (j = jb; j <= je; j++)
+		for (i = ib; i <= ie; i++)
+		{
+			idx = i * ny + j;
+			fprintf(ptr, "%f %f", cx[i], cy[j]);
+			for (int k = 0; k < nvar; k++)
+				fprintf(ptr, " %f", out[k][idx]);
+			fprintf(ptr, "\n");
+		}
+
+	fclose(ptr);
+	return 1;
+}
+
+template< typename T >
+int nse::write_tecplot(const std::string& filename,
+	const T* uout, const T* vout, const T* cx, const T* cy, // [array, coordinates]
+	const int nx, const int ny,
+	const int ib, const int ie,
+	const int jb, const int je,
+
+	const char* title, const char* uname, const char* vname,
+	const char* name_dimx, const char* name_dimy,
+	const T time)
+{
+	if ((ib < 0) || (jb < 0) || (ie >= nx - 1) || (je >= ny - 1)) return 0;
+
+	FILE* ptr = fopen(filename.c_str(), "w");
+	if (ptr == NULL) return 0;
+
+	fprintf(ptr, " TITLE = \"%s [U,V(%s,%s)]\"\n", title, name_dimx, name_dimy);
+	fprintf(ptr, " VARIABLES = \"%s\", \"%s\", \"%s\", \"%s\"\n",
+		name_dimx, name_dimy, uname, vname);
+	fprintf(ptr, " ZONE I = %i, J = %i, DATAPACKING = POINT, SOLUTIONTIME = %f\n",
+		ie - ib + 1, je - jb + 1, time);
+
+	int i, j, idx;
+	for (j = jb; j <= je; j++)
+		for (i = ib; i <= ie; i++)
+		{
+			idx = i * ny + j;
+			fprintf(ptr, "%f %f %f %f\n", cx[i], cy[j],
+				(T) 0.5 * (uout[idx] + uout[idx + ny]),
+				(T) 0.5 * (vout[idx] + vout[idx + 1]));
+		}
+
+	fclose(ptr);
+	return 1;
+}
+// -------------------------------------------------------------------------------------------- //
+
+// Binary //
template< typename T >
int nse::write_binary_stamp(const std::string& filename,
	const binStamp< int >& index_stamp,
	const binStamp< double >& cpu_stamp,

	const T* cx, const T* cy,
	const T* ex, const T* ey,
	const GridId<T>& id, const T time)
{
	// writes a binary "stamp" file: grid id blocks, both 2D coordinate sets
	// [cx,cy] and [ex,ey], the time marker and the index/cpu binStamp records;
	// the write order below defines the file format -- do not reorder;
	// returns 1 only if every item was written
	FILE* ptr = fopen(filename.c_str(), "wb");
	if (ptr == NULL) return 0;

	int nstatus = 0;	// accumulated count of items written //
	// header, domain & grid id //
	nstatus += fwrite(id.header, sizeof(int), GridId<T>::hsize, ptr);	// header data //
	nstatus += fwrite(id.domain, sizeof(T), GridId<T>::dsize, ptr);	// domain definition //
	nstatus += fwrite(id.grid, sizeof(int), GridId<T>::gsize, ptr);	// grid definition //

	int nx, ny, gcx, gcy;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);

	// grid coordinates //
	nstatus += fwrite(cx, sizeof(T), nx, ptr);
	nstatus += fwrite(cy, sizeof(T), ny, ptr);
	nstatus += fwrite(ex, sizeof(T), nx, ptr);
	nstatus += fwrite(ey, sizeof(T), ny, ptr);

	// time stamp //
	T time_stamp = time;
	nstatus += fwrite(&time_stamp, sizeof(T), 1, ptr);

	nstatus += index_stamp.fwrite(ptr);	// index stamp //
	nstatus += cpu_stamp.fwrite(ptr);	// cpu stamp //


	fclose(ptr);

	// NOTE(review): the trailing +3 presumably counts the time marker plus
	// extra bookkeeping items written by binStamp::fwrite() -- verify in bin-stamp.h
	const int nstatus_check = GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
		2 * (nx + ny) + index_stamp.size + cpu_stamp.size + 3;
	return (nstatus == nstatus_check);
}
+
template< typename T >
int nse::write_binary(const std::string& filename,
	const T* xin, const char* name,

	const T* cx, const T* cy,
	const T* ex, const T* ey,
	const GridId<T>& id, const T time)
{
	// writes one named scalar field [xin] (nx * ny values) with its grid id,
	// coordinates and time marker; file layout (order matters, matched by
	// read_binary): id blocks, cx, cy, ex, ey, time, type, name length,
	// name chars, field data; returns 1 only if every item was written
	FILE* ptr = fopen(filename.c_str(), "wb");
	if (ptr == NULL) return 0;

	int nstatus = 0;	// accumulated count of items written //
	// header, domain & grid id //
	nstatus += fwrite(id.header, sizeof(int), GridId<T>::hsize, ptr);	// header data //
	nstatus += fwrite(id.domain, sizeof(T), GridId<T>::dsize, ptr);	// domain definition //
	nstatus += fwrite(id.grid, sizeof(int), GridId<T>::gsize, ptr);	// grid definition //

	int nx, ny, gcx, gcy;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);

	// grid coordinates //
	nstatus += fwrite(cx, sizeof(T), nx, ptr);
	nstatus += fwrite(cy, sizeof(T), ny, ptr);
	nstatus += fwrite(ex, sizeof(T), nx, ptr);
	nstatus += fwrite(ey, sizeof(T), ny, ptr);

	// time stamp //
	T time_stamp = time;
	nstatus += fwrite(&time_stamp, sizeof(T), 1, ptr);

	// field definition //
	int type = 0;               // scalar field (read_binary checks this tag) //
	int name_length = strlen(name);

	nstatus += fwrite(&type, sizeof(int), 1, ptr);
	nstatus += fwrite(&name_length, sizeof(int), 1, ptr);
	nstatus += fwrite(name, sizeof(char), name_length, ptr);	// no terminating '\0' stored //

	// main data //
	nstatus += fwrite(xin, sizeof(T), nx * ny, ptr);

	fclose(ptr);

	// +3 counts: time marker, type tag, name length //
	const int nstatus_check = GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
		2 * (nx + ny) + nx * ny + name_length + 3;
	return (nstatus == nstatus_check);
}
+
template< typename T >
int nse::write_binary(const std::string& filename,
	const T* uin, const T* vin,
	const char* uname, const char* vname,

	const T* cx, const T* cy,
	const T* ex, const T* ey,
	const GridId<T>& id, const T time)
{
	// writes a named 2-component vector field (uin, vin; nx * ny values each)
	// with its grid id, coordinates and time marker; write order defines the
	// file format (matched by the vector read_binary); returns 1 on full success
	FILE* ptr = fopen(filename.c_str(), "wb");
	if (ptr == NULL) return 0;

	int nstatus = 0;	// accumulated count of items written //
	// header, domain & grid id //
	nstatus += fwrite(id.header, sizeof(int), GridId<T>::hsize, ptr);	// header data //
	nstatus += fwrite(id.domain, sizeof(T), GridId<T>::dsize, ptr);	// domain definition //
	nstatus += fwrite(id.grid, sizeof(int), GridId<T>::gsize, ptr);	// grid definition //

	int nx, ny, gcx, gcy;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);

	// grid coordinates //
	nstatus += fwrite(cx, sizeof(T), nx, ptr);
	nstatus += fwrite(cy, sizeof(T), ny, ptr);
	nstatus += fwrite(ex, sizeof(T), nx, ptr);
	nstatus += fwrite(ey, sizeof(T), ny, ptr);

	// time stamp //
	T time_stamp = time;
	nstatus += fwrite(&time_stamp, sizeof(T), 1, ptr);

	// field definition //
	int type = 1;               // vector field (read_binary checks this tag) //
	int name_length[2];
	name_length[0] = strlen(uname);
	name_length[1] = strlen(vname);

	nstatus += fwrite(&type, sizeof(int), 1, ptr);
	nstatus += fwrite(name_length, sizeof(int), 2, ptr);
	nstatus += fwrite(uname, sizeof(char), name_length[0], ptr);	// no terminating '\0' stored //
	nstatus += fwrite(vname, sizeof(char), name_length[1], ptr);

	// main data //
	nstatus += fwrite(uin, sizeof(T), nx * ny, ptr);
	nstatus += fwrite(vin, sizeof(T), nx * ny, ptr);

	fclose(ptr);

	// +4 counts: time marker, type tag, two name lengths //
	const int nstatus_check = GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
		2 * (nx + ny) + 2 * nx * ny +
		name_length[0] + name_length[1] + 4;
	return (nstatus == nstatus_check);
}
+
template< typename T >
int nse::read_binary_stamp(const std::string& filename,
	binStamp< int >& index_stamp,
	binStamp< double >& cpu_stamp,

	T** cx, T** cy, T** ex, T** ey,
	GridId<T>& id, T* time)
{
	// inverse of write_binary_stamp: reads id, coordinates (allocated here),
	// time marker and both binStamp records; coordinate arrays are freed again
	// if the final item-count check fails; returns 1 on success
	FILE* ptr = fopen(filename.c_str(), "rb");
	if (ptr == NULL) return 0;

	int nstatus = 0;	// accumulated count of items read //
	// header, domain & grid id //
	nstatus += fread(id.header, sizeof(int), GridId< T >::hsize, ptr);
	if (!id.check(2)) {	// header must describe a 2D grid id //
		fclose(ptr);
		return 0;
	}

	// NOTE(review): fread_sp presumably converts the file's stored precision
	// (id.data_type_size()) to T on the fly -- confirm in io-misc.h
	nstatus += fread_sp(id.domain, id.data_type_size(), GridId< T >::dsize, ptr);
	nstatus += fread(id.grid, sizeof(int), GridId< T >::gsize, ptr);

	int nx, ny, gcx, gcy;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);

	// grid coordinates (allocated for the caller) //
	allocate(cx, cy, nx, ny);
	allocate(ex, ey, nx, ny);

	nstatus += fread_sp((*cx), id.data_type_size(), nx, ptr);
	nstatus += fread_sp((*cy), id.data_type_size(), ny, ptr);
	nstatus += fread_sp((*ex), id.data_type_size(), nx, ptr);
	nstatus += fread_sp((*ey), id.data_type_size(), ny, ptr);

	nstatus += fread_sp(time, id.data_type_size(), 1, ptr);	// time stamp

	nstatus += index_stamp.fread(ptr);	// index stamp //
	nstatus += cpu_stamp.fread(ptr);	// cpu stamp //

	fclose(ptr);
	id.reset_data_type_size();	// re-setting data type 

	// must match the writer's item count exactly //
	const int nstatus_check =
		GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
		2 * (nx + ny) + index_stamp.size + cpu_stamp.size + 3;
	if (nstatus == nstatus_check) return 1;

	// failure: release what was allocated above //
	deallocate((*cx), (*cy));
	deallocate((*ex), (*ey));
	return 0;
}
+
template< typename T >
int nse::read_binary(const std::string& filename,
	T** xout, char** name,

	T** cx, T** cy, T** ex, T** ey,
	GridId<T>& id, T* time)
{
	// inverse of the scalar write_binary: reads id, coordinates, time,
	// field tag, name and nx * ny data values; all output arrays (including
	// the '\0'-terminated *name) are allocated here and freed again on
	// failure of the final item-count check; returns 1 on success
	FILE* ptr = fopen(filename.c_str(), "rb");
	if (ptr == NULL) return 0;

	int nstatus = 0;	// accumulated count of items read //
	// header, domain & grid id //
	nstatus += fread(id.header, sizeof(int), GridId< T >::hsize, ptr);
	if (!id.check(2)) {	// header must describe a 2D grid id //
		fclose(ptr);
		return 0;
	}

	// NOTE(review): fread_sp presumably converts the file's stored precision
	// (id.data_type_size()) to T on the fly -- confirm in io-misc.h
	nstatus += fread_sp(id.domain, id.data_type_size(), GridId< T >::dsize, ptr);
	nstatus += fread(id.grid, sizeof(int), GridId< T >::gsize, ptr);

	int nx, ny, gcx, gcy;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);

	// grid coordinates (allocated for the caller) //
	allocate(cx, cy, nx, ny);
	allocate(ex, ey, nx, ny);

	nstatus += fread_sp((*cx), id.data_type_size(), nx, ptr);
	nstatus += fread_sp((*cy), id.data_type_size(), ny, ptr);
	nstatus += fread_sp((*ex), id.data_type_size(), nx, ptr);
	nstatus += fread_sp((*ey), id.data_type_size(), ny, ptr);

	nstatus += fread_sp(time, id.data_type_size(), 1, ptr);	// time stamp

	// field definition //
	int field_type;
	nstatus += fread(&field_type, sizeof(int), 1, ptr);
	if (field_type != 0) {	// tag 0 = scalar field; anything else is a mismatch //
		deallocate((*cx), (*cy));
		deallocate((*ex), (*ey));
		fclose(ptr);
		return 0;
	}

	int name_length;
	nstatus += fread(&name_length, sizeof(int), 1, ptr);

	(*name) = new char[name_length + 1];	// +1: file stores no terminator //
	nstatus += fread((*name), sizeof(char), name_length, ptr);
	(*name)[name_length] = '\0';

	// main data //
	allocate(xout, nx * ny);
	nstatus += fread_sp((*xout), id.data_type_size(), nx * ny, ptr);

	fclose(ptr);
	id.reset_data_type_size();	// re-setting data type 

	// must match the writer's item count exactly //
	const int nstatus_check =
		GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
		2 * (nx + ny) + nx * ny + name_length + 3;
	if (nstatus == nstatus_check) return 1;

	// failure: release everything allocated above //
	deallocate((*cx), (*cy));
	deallocate((*ex), (*ey));
	delete[](*name);
	deallocate((*xout));
	return 0;
}
+
template< typename T >
int nse::read_binary(const std::string& filename,
	T** uout, T** vout,
	char** uname, char** vname,

	T** cx, T** cy, T** ex, T** ey,
	GridId<T>& id, T* time)
{
	// inverse of the vector write_binary: reads id, coordinates, time,
	// field tag, both names and both nx * ny component arrays; all output
	// arrays (including '\0'-terminated names) are allocated here and freed
	// again on failure of the final item-count check; returns 1 on success
	FILE* ptr = fopen(filename.c_str(), "rb");
	if (ptr == NULL) return 0;

	int nstatus = 0;	// accumulated count of items read //
	// header, domain & grid id //
	nstatus += fread(id.header, sizeof(int), GridId< T >::hsize, ptr);
	if (!id.check(2)) {	// header must describe a 2D grid id //
		fclose(ptr);
		return 0;
	}

	// NOTE(review): fread_sp presumably converts the file's stored precision
	// (id.data_type_size()) to T on the fly -- confirm in io-misc.h
	nstatus += fread_sp(id.domain, id.data_type_size(), GridId< T >::dsize, ptr);
	nstatus += fread(id.grid, sizeof(int), GridId< T >::gsize, ptr);

	int nx, ny, gcx, gcy;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);

	// grid coordinates (allocated for the caller) //
	allocate(cx, cy, nx, ny);
	allocate(ex, ey, nx, ny);

	nstatus += fread_sp((*cx), id.data_type_size(), nx, ptr);
	nstatus += fread_sp((*cy), id.data_type_size(), ny, ptr);
	nstatus += fread_sp((*ex), id.data_type_size(), nx, ptr);
	nstatus += fread_sp((*ey), id.data_type_size(), ny, ptr);

	nstatus += fread_sp(time, id.data_type_size(), 1, ptr);	// time stamp

	// field definition //
	int field_type;
	nstatus += fread(&field_type, sizeof(int), 1, ptr);
	if (field_type != 1) {	// tag 1 = vector field; anything else is a mismatch //
		deallocate((*cx), (*cy));
		deallocate((*ex), (*ey));
		fclose(ptr);
		return 0;
	}

	int name_length[2];
	nstatus += fread(name_length, sizeof(int), 2, ptr);

	(*uname) = new char[name_length[0] + 1];	// +1: file stores no terminators //
	(*vname) = new char[name_length[1] + 1];
	nstatus += fread((*uname), sizeof(char), name_length[0], ptr);
	nstatus += fread((*vname), sizeof(char), name_length[1], ptr);
	(*uname)[name_length[0]] = '\0';
	(*vname)[name_length[1]] = '\0';

	// main data //
	allocate(uout, vout, nx * ny);
	nstatus += fread_sp((*uout), id.data_type_size(), nx * ny, ptr);
	nstatus += fread_sp((*vout), id.data_type_size(), nx * ny, ptr);

	fclose(ptr);
	id.reset_data_type_size();	// re-setting data type 

	// must match the writer's item count exactly //
	const int nstatus_check =
		GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
		2 * (nx + ny) + 2 * nx * ny +
		name_length[0] + name_length[1] + 4;
	if (nstatus == nstatus_check) return 1;

	// failure: release everything allocated above //
	deallocate((*cx), (*cy));
	deallocate((*ex), (*ey));
	delete[](*uname); delete[](*vname);
	deallocate((*uout), (*vout));
	return 0;
}
+// -------------------------------------------------------------------------------------------- //
+
+// MPI-I/O 2D datatype //
+template< typename T >
+void nse::mpi_io_write_datatype(MPI_Datatype* file_view, MPI_Datatype* local_view,
+
+	const int mpi_nx, const int mpi_ny,
+	const int nx, const int ny,
+	const int gcx, const int gcy,
+	const MPI_Comm comm_x, const MPI_Comm comm_y)
+{
+	const int mpi_dim_size[2] = { mpi_nx, mpi_ny };
+	const int dim_size[2] = { nx, ny };
+	const int nghost[2] = { gcx, gcy };
+	const MPI_Comm comm[2] = { comm_x, comm_y };
+
+	mpi_io_write_datatype< T, 2 >(file_view, local_view,
+		mpi_dim_size, dim_size, nghost, comm);
+}
+
+template< typename T >
+void nse::mpi_io_read_datatype(MPI_Datatype* file_view, MPI_Datatype* local_view,
+
+	const int mpi_nx, const int mpi_ny,
+	const int nx, const int ny,
+	const int gcx, const int gcy,
+	const MPI_Comm comm_x, const MPI_Comm comm_y)
+{
+	const int mpi_dim_size[2] = { mpi_nx, mpi_ny };
+	const int dim_size[2] = { nx, ny };
+	const int nghost[2] = { gcx, gcy };
+	const MPI_Comm comm[2] = { comm_x, comm_y };
+
+	mpi_io_read_datatype< T, 2 >(file_view, local_view,
+		mpi_dim_size, dim_size, nghost, comm);
+}
+// -------------------------------------------------------------------------------------------- //
+
+// MPI-Binary //
+template< typename T >
+int nse::mpi_write_binary(const std::string& filename,
+	const char* mpi_datarep,
+	const MPI_Comm comm, const int header_rank,
+	const MPI_Comm comm_x, const MPI_Comm comm_y,
+
+	const T* xin, const char* name,
+
+	const T* cx, const T* cy,
+	const T* ex, const T* ey,
+	const GridId< T >& id, const T time)
+{
+	int nx, ny, gcx, gcy;
+	id.grid_dim(1, &nx, &gcx);
+	id.grid_dim(2, &ny, &gcy);
+
+	MPI_Offset header_size = GridId< T >::id_byte_size +
+		2 * (nx + ny) * sizeof(T) +
+		strlen(name) * sizeof(char) +
+		2 * sizeof(int) + sizeof(T);
+
+	MPI_File ptr;
+	int status = MPI_File_open(comm, (char*)filename.c_str(),
+		MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &ptr);
+	if (status != MPI_SUCCESS) return 0;	// MPI file open failure
+
+	int nstatus = 0;
+	int rank;
+	MPI_Comm_rank(comm, &rank);
+	if (rank == header_rank) {	// header
+
+								// header, domain & grid id //
+		status = MPI_File_write(ptr, (void*)id.header, GridId< T >::hsize, MPI_INT, MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += GridId< T >::hsize;
+		status = MPI_File_write(ptr, (void*)id.domain, GridId< T >::dsize, mpi_type< T >(), MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += GridId< T >::dsize;
+		status = MPI_File_write(ptr, (void*)id.grid, GridId< T >::gsize, MPI_INT, MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += GridId< T >::gsize;
+
+		// grid coordinates //
+		status = MPI_File_write(ptr, (void*)cx, nx, mpi_type< T >(), MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += nx;
+		status = MPI_File_write(ptr, (void*)cy, ny, mpi_type< T >(), MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += ny;
+		status = MPI_File_write(ptr, (void*)ex, nx, mpi_type< T >(), MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += nx;
+		status = MPI_File_write(ptr, (void*)ey, ny, mpi_type< T >(), MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += ny;
+
+		// time stamp //
+		T time_stamp = time;
+		status = MPI_File_write(ptr, &time_stamp, 1, mpi_type< T >(), MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += 1;
+
+		// field definition //
+		int type = 0;		// scalar field
+		int name_length = strlen(name);
+
+		status = MPI_File_write(ptr, &type, 1, MPI_INT, MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += 1;
+		status = MPI_File_write(ptr, &name_length, 1, MPI_INT, MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += 1;
+		status = MPI_File_write(ptr, (void*)name, name_length, MPI_CHAR, MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += name_length;
+	}
+	MPI_File_sync(ptr);
+	mpi_broadcast(&nstatus, 1, header_rank, comm);
+
+	int pnx = par_local_size_comm(nx, gcx, comm_x);
+	int pny = par_local_size_comm(ny, gcy, comm_y);
+
+	// main data description //
+	MPI_Datatype file_view, local_view;
+	mpi_io_write_datatype< T >(&file_view, &local_view,
+		nx, ny, pnx, pny, gcx, gcy,
+		comm_x, comm_y);
+
+	// main data //
+	MPI_File_set_view(ptr, header_size, mpi_type< T >(),
+		file_view, (char*)mpi_datarep, MPI_INFO_NULL);
+
+	status = MPI_File_write_all(ptr, (void*)xin, 1, local_view, MPI_STATUS_IGNORE);
+	if (status == MPI_SUCCESS) nstatus += nx * ny;
+
+	MPI_File_close(&ptr);
+	MPI_Type_free(&file_view);
+	MPI_Type_free(&local_view);
+
+	const int nstatus_check = GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
+		2 * (nx + ny) + nx * ny + strlen(name) + 3;
+	return (nstatus == nstatus_check);
+}
+
// Collective write of a named 2D vector field (u, v components). File layout:
//   [grid id][cx, cy, ex, ey][time][type=1][u,v name lengths][u name][v name]
//   [u data][v data].
// header_rank alone writes the header; all ranks then write both components
// collectively through the same MPI file view (v follows u in the file).
// Returns 1 only if every element was written (element-count check), else 0.
template< typename T >
int nse::mpi_write_binary(const std::string& filename,
	const char* mpi_datarep,
	const MPI_Comm comm, const int header_rank,
	const MPI_Comm comm_x, const MPI_Comm comm_y,

	const T* uin, const T* vin,
	const char* uname, const char* vname,

	const T* cx, const T* cy,
	const T* ex, const T* ey,
	const GridId< T >& id, const T time)
{
	int nx, ny, gcx, gcy;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);

	// byte offset of the main data block = size of the full header:
	// grid id + 4 coordinate arrays + names + [type, 2 name lengths] + time
	MPI_Offset header_size = GridId< T >::id_byte_size +
		2 * (nx + ny) * sizeof(T) +
		(strlen(uname) + strlen(vname)) * sizeof(char) +
		3 * sizeof(int) + sizeof(T);

	MPI_File ptr;
	int status = MPI_File_open(comm, (char*)filename.c_str(),
		MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &ptr);
	if (status != MPI_SUCCESS) return 0;	// MPI file open failure

	// nstatus accumulates the number of elements written; each write only
	// counts when MPI reports success, so a mismatch at the end flags failure
	int nstatus = 0;
	int rank;
	MPI_Comm_rank(comm, &rank);
	if (rank == header_rank) {	// header

								// header, domain & grid id //
		status = MPI_File_write(ptr, (void*)id.header, GridId< T >::hsize, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += GridId< T >::hsize;
		status = MPI_File_write(ptr, (void*)id.domain, GridId< T >::dsize, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += GridId< T >::dsize;
		status = MPI_File_write(ptr, (void*)id.grid, GridId< T >::gsize, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += GridId< T >::gsize;

		// grid coordinates //
		status = MPI_File_write(ptr, (void*)cx, nx, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += nx;
		status = MPI_File_write(ptr, (void*)cy, ny, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += ny;
		status = MPI_File_write(ptr, (void*)ex, nx, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += nx;
		status = MPI_File_write(ptr, (void*)ey, ny, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += ny;

		// time stamp //
		T time_stamp = time;
		status = MPI_File_write(ptr, &time_stamp, 1, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += 1;

		// field definition //
		int type = 1;		// vector field
		int name_length[2];
		name_length[0] = strlen(uname);
		name_length[1] = strlen(vname);

		status = MPI_File_write(ptr, &type, 1, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += 1;
		status = MPI_File_write(ptr, name_length, 2, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += 2;
		status = MPI_File_write(ptr, (void*)uname, name_length[0], MPI_CHAR, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += name_length[0];
		status = MPI_File_write(ptr, (void*)vname, name_length[1], MPI_CHAR, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += name_length[1];
	}
	MPI_File_sync(ptr);
	// distribute the header element count so every rank checks the same total
	mpi_broadcast(&nstatus, 1, header_rank, comm);

	// local (per-rank) sub-domain sizes
	int pnx = par_local_size_comm(nx, gcx, comm_x);
	int pny = par_local_size_comm(ny, gcy, comm_y);

	// main data description //
	MPI_Datatype file_view, local_view;
	mpi_io_write_datatype< T >(&file_view, &local_view,
		nx, ny, pnx, pny, gcx, gcy,
		comm_x, comm_y);

	// main data: collective writes through the file view; the shared file
	// pointer advances past u so v lands immediately after it //
	MPI_File_set_view(ptr, header_size, mpi_type< T >(),
		file_view, (char*)mpi_datarep, MPI_INFO_NULL);

	status = MPI_File_write_all(ptr, (void*)uin, 1, local_view, MPI_STATUS_IGNORE);
	if (status == MPI_SUCCESS) nstatus += nx * ny;
	status = MPI_File_write_all(ptr, (void*)vin, 1, local_view, MPI_STATUS_IGNORE);
	if (status == MPI_SUCCESS) nstatus += nx * ny;

	MPI_File_close(&ptr);
	MPI_Type_free(&file_view);
	MPI_Type_free(&local_view);

	// expected count: grid id + coordinates + both fields + names
	// + [time, type, 2 name lengths]
	const int nstatus_check = GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
		2 * (nx + ny) + 2 * nx * ny +
		strlen(uname) + strlen(vname) + 4;
	return (nstatus == nstatus_check);
}
+
+template< typename T >
+int nse::mpi_read_binary(const std::string& filename,
+	const char* mpi_datarep,
+	const MPI_Comm comm, const int header_rank,
+	const MPI_Comm comm_x, const MPI_Comm comm_y,
+
+	T** xout, char** name,
+
+	T** cx, T** cy, T** ex, T** ey,
+	GridId< T >& id, T* time)
+{
+	MPI_Offset header_size = GridId<T>::id_byte_size +
+		2 * sizeof(int);
+
+	MPI_File ptr;
+	int status = MPI_File_open(comm, (char*)filename.c_str(),
+		MPI_MODE_RDONLY, MPI_INFO_NULL, &ptr);
+	if (status != MPI_SUCCESS) return 0;	// MPI file open failure
+
+	int name_length;
+	int nstatus = 0;
+	int rank, header_offset = 0, status_id = 0;
+	MPI_Comm_rank(comm, &rank);
+	if (rank == header_rank) {
+
+		// header, domain & grid id //
+		status = MPI_File_read(ptr, id.header, GridId<T>::hsize, MPI_INT, MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += GridId<T>::hsize;
+		if (id.check(2)) {	// check id //
+			status = mpi_fread_sp(ptr, id.domain, GridId<T>::dsize, id.data_type_size());
+			if (status == MPI_SUCCESS) nstatus += GridId<T>::dsize;
+			status = MPI_File_read(ptr, id.grid, GridId<T>::gsize, MPI_INT, MPI_STATUS_IGNORE);
+			if (status == MPI_SUCCESS) nstatus += GridId<T>::gsize;
+
+			header_offset += GridId<T>::dsize *
+				(id.data_type_size() - sizeof(T));	// correcting header size due to grid id
+
+													// grid parameters //
+			int nx, ny, gcx, gcy;
+			id.grid_dim(1, &nx, &gcx);
+			id.grid_dim(2, &ny, &gcy);
+
+			// grid coordinates //
+			allocate(cx, cy, nx, ny);
+			allocate(ex, ey, nx, ny);
+
+			status = mpi_fread_sp(ptr, (*cx), nx, id.data_type_size());
+			if (status == MPI_SUCCESS) nstatus += nx;
+			status = mpi_fread_sp(ptr, (*cy), ny, id.data_type_size());
+			if (status == MPI_SUCCESS) nstatus += ny;
+			status = mpi_fread_sp(ptr, (*ex), nx, id.data_type_size());
+			if (status == MPI_SUCCESS) nstatus += nx;
+			status = mpi_fread_sp(ptr, (*ey), ny, id.data_type_size());
+			if (status == MPI_SUCCESS) nstatus += ny;
+			header_offset += 2 * (nx + ny) * id.data_type_size();
+
+			// time stamp //
+			status = mpi_fread_sp(ptr, time, 1, id.data_type_size());
+			if (status == MPI_SUCCESS) nstatus += 1;
+			header_offset += id.data_type_size();
+
+			// field definition //
+			int field_type;
+			status = MPI_File_read(ptr, &field_type, 1, MPI_INT, MPI_STATUS_IGNORE);
+			if (status == MPI_SUCCESS) nstatus += 1;
+			if (field_type == 0)	// scalar field //
+			{
+				status = MPI_File_read(ptr, &name_length, 1, MPI_INT, MPI_STATUS_IGNORE);
+				if (status == MPI_SUCCESS) nstatus += 1;
+				header_offset += name_length * sizeof(char);
+
+				(*name) = new char[name_length + 1];
+				status = MPI_File_read(ptr, (*name), name_length, MPI_CHAR, MPI_STATUS_IGNORE);
+				if (status == MPI_SUCCESS) nstatus += name_length;
+				(*name)[name_length] = '\0';
+
+				status_id = 1;
+			}
+			else
+			{
+				deallocate((*cx), (*cy));
+				deallocate((*ex), (*ey));
+			}
+		}
+	}
+	mpi_broadcast(&status_id, 1, header_rank, comm);
+
+	if (!status_id) {
+		MPI_File_close(&ptr);
+		return 0;
+	}
+	// read status - OK - //
+	id.mpi_broadcast(header_rank, comm);
+
+	mpi_broadcast(&nstatus, 1, header_rank, comm);
+	mpi_broadcast(&name_length, 1, header_rank, comm);
+
+	// correct header size //
+	mpi_broadcast(&header_offset, 1, header_rank, comm);
+	header_size += (MPI_Offset)header_offset;
+
+	// main data description //
+	int nx, ny, gcx, gcy;
+	id.grid_dim(1, &nx, &gcx);
+	id.grid_dim(2, &ny, &gcy);
+
+	int pnx = par_local_size_comm(nx, gcx, comm_x);
+	int pny = par_local_size_comm(ny, gcy, comm_y);
+
+	MPI_Datatype file_view, local_view;
+	if (id.data_type_size() == sizeof(float)) {	// input=[float]
+		mpi_io_read_datatype< float >(&file_view, &local_view,
+			nx, ny, pnx, pny, gcx, gcy,
+			comm_x, comm_y);
+
+		MPI_File_set_view(ptr, header_size, mpi_type< float >(),
+			file_view, (char*)mpi_datarep, MPI_INFO_NULL);
+	}
+	if (id.data_type_size() == sizeof(double)) {	// input=[double]
+		mpi_io_read_datatype< double >(&file_view, &local_view,
+			nx, ny, pnx, pny, gcx, gcy,
+			comm_x, comm_y);
+
+		MPI_File_set_view(ptr, header_size, mpi_type< double >(),
+			file_view, (char*)mpi_datarep, MPI_INFO_NULL);
+	}
+
+	// main data //
+	allocate(xout, pnx * pny);
+	status = mpi_fread_all_sp(ptr, (*xout), pnx * pny,
+		local_view, id.data_type_size());
+	if (status == MPI_SUCCESS) nstatus += nx * ny;
+
+	MPI_File_close(&ptr);
+	MPI_Type_free(&file_view);
+	MPI_Type_free(&local_view);
+
+	id.reset_data_type_size();	// re-setting data type
+
+	const int nstatus_check =
+		GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
+		2 * (nx + ny) + nx * ny + name_length + 3;
+	if (nstatus == nstatus_check) return 1;
+
+	if (rank == header_rank) {
+		deallocate((*cx), (*cy));
+		deallocate((*ex), (*ey));
+		delete[](*name);
+	}
+	deallocate((*xout));
+	return 0;
+}
+
+template< typename T >
+int nse::mpi_read_binary(const std::string& filename,
+	const char* mpi_datarep,
+	const MPI_Comm comm, const int header_rank,
+	const MPI_Comm comm_x, const MPI_Comm comm_y,
+
+	T** uout, T** vout,
+	char** uname, char** vname,
+
+	T** cx, T** cy, T** ex, T** ey,
+	GridId< T >& id, T* time)
+{
+	MPI_Offset header_size = GridId<T>::id_byte_size +
+		3 * sizeof(int);
+
+	MPI_File ptr;
+	int status = MPI_File_open(comm, (char*)filename.c_str(),
+		MPI_MODE_RDONLY, MPI_INFO_NULL, &ptr);
+	if (status != MPI_SUCCESS) return 0;	// MPI file open failure
+
+	int name_length[2];
+	int nstatus = 0;
+	int rank, header_offset = 0, status_id = 0;
+	MPI_Comm_rank(comm, &rank);
+	if (rank == header_rank) {
+
+		// header, domain & grid id //
+		status = MPI_File_read(ptr, id.header, GridId<T>::hsize, MPI_INT, MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += GridId<T>::hsize;
+		if (id.check(2)) {	// check id //
+			status = mpi_fread_sp(ptr, id.domain, GridId<T>::dsize, id.data_type_size());
+			if (status == MPI_SUCCESS) nstatus += GridId<T>::dsize;
+			status = MPI_File_read(ptr, id.grid, GridId<T>::gsize, MPI_INT, MPI_STATUS_IGNORE);
+			if (status == MPI_SUCCESS) nstatus += GridId<T>::gsize;
+
+			header_offset += GridId<T>::dsize *
+				(id.data_type_size() - sizeof(T));	// correcting header size due to grid id
+
+													// grid parameters //
+			int nx, ny, gcx, gcy;
+			id.grid_dim(1, &nx, &gcx);
+			id.grid_dim(2, &ny, &gcy);
+
+			// grid coordinates //
+			allocate(cx, cy, nx, ny);
+			allocate(ex, ey, nx, ny);
+
+			status = mpi_fread_sp(ptr, (*cx), nx, id.data_type_size());
+			if (status == MPI_SUCCESS) nstatus += nx;
+			status = mpi_fread_sp(ptr, (*cy), ny, id.data_type_size());
+			if (status == MPI_SUCCESS) nstatus += ny;
+			status = mpi_fread_sp(ptr, (*ex), nx, id.data_type_size());
+			if (status == MPI_SUCCESS) nstatus += nx;
+			status = mpi_fread_sp(ptr, (*ey), ny, id.data_type_size());
+			if (status == MPI_SUCCESS) nstatus += ny;
+			header_offset += 2 * (nx + ny) * id.data_type_size();
+
+			// time stamp //
+			status = mpi_fread_sp(ptr, time, 1, id.data_type_size());
+			if (status == MPI_SUCCESS) nstatus += 1;
+			header_offset += id.data_type_size();
+
+			// field definition //
+			int field_type;
+			status = MPI_File_read(ptr, &field_type, 1, MPI_INT, MPI_STATUS_IGNORE);
+			if (status == MPI_SUCCESS) nstatus += 1;
+			if (field_type == 1)	// vector field //
+			{
+				status = MPI_File_read(ptr, name_length, 2, MPI_INT, MPI_STATUS_IGNORE);
+				if (status == MPI_SUCCESS) nstatus += 2;
+				header_offset += sizeof(char)*
+					(name_length[0] + name_length[1]);
+
+				(*uname) = new char[name_length[0] + 1];
+				(*vname) = new char[name_length[1] + 1];
+				status = MPI_File_read(ptr, (*uname), name_length[0], MPI_CHAR, MPI_STATUS_IGNORE);
+				if (status == MPI_SUCCESS) nstatus += name_length[0];
+				status = MPI_File_read(ptr, (*vname), name_length[1], MPI_CHAR, MPI_STATUS_IGNORE);
+				if (status == MPI_SUCCESS) nstatus += name_length[1];
+				(*uname)[name_length[0]] = '\0';
+				(*vname)[name_length[1]] = '\0';
+
+				status_id = 1;
+			}
+			else
+			{
+				deallocate((*cx), (*cy));
+				deallocate((*ex), (*ey));
+			}
+		}
+	}
+	mpi_broadcast(&status_id, 1, header_rank, comm);
+
+	if (!status_id) {
+		MPI_File_close(&ptr);
+		return 0;
+	}
+	// read status - OK - //
+	id.mpi_broadcast(header_rank, comm);
+
+	mpi_broadcast(&nstatus, 1, header_rank, comm);
+	mpi_broadcast(name_length, 2, header_rank, comm);
+
+	// correct header size //
+	mpi_broadcast(&header_offset, 1, header_rank, comm);
+	header_size += (MPI_Offset)header_offset;
+
+	// main data description //
+	int nx, ny, gcx, gcy;
+	id.grid_dim(1, &nx, &gcx);
+	id.grid_dim(2, &ny, &gcy);
+
+	int pnx = par_local_size_comm(nx, gcx, comm_x);
+	int pny = par_local_size_comm(ny, gcy, comm_y);
+
+	MPI_Datatype file_view, local_view;
+	if (id.data_type_size() == sizeof(float)) {	// input=[float]
+		mpi_io_read_datatype< float >(&file_view, &local_view,
+			nx, ny, pnx, pny, gcx, gcy,
+			comm_x, comm_y);
+
+		MPI_File_set_view(ptr, header_size, mpi_type< float >(),
+			file_view, (char*)mpi_datarep, MPI_INFO_NULL);
+	}
+	if (id.data_type_size() == sizeof(double)) {	// input=[double]
+		mpi_io_read_datatype< double >(&file_view, &local_view,
+			nx, ny, pnx, pny, gcx, gcy,
+			comm_x, comm_y);
+
+		MPI_File_set_view(ptr, header_size, mpi_type< double >(),
+			file_view, (char*)mpi_datarep, MPI_INFO_NULL);
+	}
+
+	// main data //
+	allocate(uout, vout, pnx * pny);
+	status = mpi_fread_all_sp(ptr, (*uout), pnx * pny,
+		local_view, id.data_type_size());
+	if (status == MPI_SUCCESS) nstatus += nx * ny;
+	status = mpi_fread_all_sp(ptr, (*vout), pnx * pny,
+		local_view, id.data_type_size());
+	if (status == MPI_SUCCESS) nstatus += nx * ny;
+
+	MPI_File_close(&ptr);
+	MPI_Type_free(&file_view);
+	MPI_Type_free(&local_view);
+
+	id.reset_data_type_size();	// re-setting data type
+
+	const int nstatus_check =
+		GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
+		2 * (nx + ny) + 2 * nx * ny +
+		name_length[0] + name_length[1] + 4;
+	if (nstatus == nstatus_check) return 1;
+
+	if (rank == header_rank) {
+		deallocate((*cx), (*cy));
+		deallocate((*ex), (*ey));
+		delete[](*uname); delete[](*vname);
+	}
+	deallocate((*uout), (*vout));
+	return 0;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+bool nse::read_plain_2d(const std::string& filename,
+	T** F, int* nx, int* ny)
+{
+	FILE *ptr = fopen(filename.c_str(), "rt");
+	if (ptr == NULL) return false;
+
+	if (fscanf(ptr, c_io_fmt< int >(), nx) != 1) {
+		fclose(ptr);
+		return false;
+	}
+	if (fscanf(ptr, c_io_fmt< int >(), ny) != 1) {
+		fclose(ptr);
+		return false;
+	}
+
+	if (((*nx) <= 0) || ((*ny) <= 0)) {
+		fclose(ptr);
+		return false;
+	}
+
+	allocate(F, (*nx) * (*ny));
+
+	for (int k = 0; k < (*nx) * (*ny); k++)
+	{
+		if (fscanf(ptr, c_io_fmt< T >(), &((*F)[k])) != 1) {
+			fclose(ptr);
+			deallocate((*F));
+			return false;
+		}
+	}
+
+	fclose(ptr);
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+bool nse::mpi_read_plain_2d(const std::string& filename,
+	T** F, int* nx, int* ny, const MPI_Comm comm)
+{
+	bool status;
+
+	int rank;
+	MPI_Comm_rank(comm, &rank);
+	if (rank == 0) {
+		status = read_plain_2d(filename, F, nx, ny);
+	}
+
+	mpi_broadcast(&status, 1, 0, comm);
+	if (!status) return false;
+
+	mpi_broadcast(nx, 1, 0, comm);
+	mpi_broadcast(ny, 1, 0, comm);
+	if (rank != 0)
+		allocate_vnull(F, (*nx) * (*ny));
+
+	mpi_broadcast(*F, (*nx) * (*ny), 0, comm);
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/io-base3d.h b/io-base3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..e46ed750c3929d2e886776595f7704858f1884cc
--- /dev/null
+++ b/io-base3d.h
@@ -0,0 +1,1396 @@
+#pragma once
+
+// [io-base3d.h]: I/O: 3D array
+//
+// -------------------------------------------------------------------------------------------- //
+
+#define _CRT_SECURE_NO_DEPRECATE
+#include <stdio.h>
+
+#include "nse-alloc.h"
+#include "grid-id.h"
+#include "io-misc.h"
+
+#include "bin-stamp.h"
+#include "bin-named-stamp.h"
+
namespace nse
{
	// Tecplot //
	// ASCII Tecplot dumps of a 3D sub-box [ib..ie] x [jb..je] x [kb..ke];
	// return 1 on success, 0 on failure.
	template< typename T >
	int write_tecplot(const std::string& filename,
		const T* out, const T* cx, const T* cy, const T* cz, // [array, coordinates]
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const char* title, const char* name,
		const char* name_dimx, const char* name_dimy, const char* name_dimz,
		const T time);

	template< typename T >
	int write_tecplot(const std::string& filename,
		const T* uout, const T* vout, const T* wout,
		const T* cx, const T* cy, const T* cz, // [array, coordinates]
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const char* title, const char* uname, const char* vname, const char* wname,
		const char* name_dimx, const char* name_dimy, const char* name_dimz,
		const T time);
	// -------------------------------------------------------------------------------------------- //


	// Binary //
	// Serial binary writers/readers; layout: grid id, coordinates, time stamp,
	// then stamp records or a named field payload. Return 1 on success.
	template< typename T >
	int write_binary_stamp(const std::string& filename,
		const binStamp< int >& index_stamp,
		const binStamp< double >& cpu_stamp,

		const T* cx, const T* cy, const T* cz,
		const T* ex, const T* ey, const T* ez,
		const GridId<T>& id, const T time);

	template< typename T >
	int write_binary_stamp(const std::string& filename,
		const binNamedStamp< int >& index_stamp,
		const binNamedStamp< double >& cpu_stamp,

		const T* cx, const T* cy, const T* cz,
		const T* ex, const T* ey, const T* ez,
		const GridId<T>& id, const T time);

	template< typename T >
	int write_binary(const std::string& filename,
		const T* xin, const char* name,

		const T* cx, const T* cy, const T* cz,
		const T* ex, const T* ey, const T* ez,
		const GridId<T>& id, const T time);

	template< typename T >
	int write_binary(const std::string& filename,
		const T* uin, const T* vin, const T* win,
		const char* uname, const char* vname, const char* wname,

		const T* cx, const T* cy, const T* cz,
		const T* ex, const T* ey, const T* ez,
		const GridId<T>& id, const T time);


	// readers allocate output arrays/names; callers own the returned memory //
	template< typename T >
	int read_binary_stamp(const std::string& filename,
		binStamp< int >& index_stamp,
		binStamp< double >& cpu_stamp,

		T** cx, T** cy, T** cz, T** ex, T** ey, T** ez,
		GridId<T>& id, T* time);

	template< typename T >
	int read_binary_stamp(const std::string& filename,
		binNamedStamp< int >& index_stamp,
		binNamedStamp< double >& cpu_stamp,

		T** cx, T** cy, T** cz, T** ex, T** ey, T** ez,
		GridId<T>& id, T* time);

	template< typename T >
	int read_binary(const std::string& filename,
		T** xout, char** name,

		T** cx, T** cy, T** cz, T** ex, T** ey, T** ez,
		GridId<T>& id, T* time);

	template< typename T >
	int read_binary(const std::string& filename,
		T** uout, T** vout, T** wout,
		char** uname, char** vname, char** wname,

		T** cx, T** cy, T** cz, T** ex, T** ey, T** ez,
		GridId<T>& id, T* time);
	// -------------------------------------------------------------------------------------------- //

	// MPI-I/O 3D datatype //
	// Build the [file_view, local_view] datatype pair describing how a
	// distributed 3D array maps onto the file; caller frees both types.
	template< typename T >
	void mpi_io_write_datatype(MPI_Datatype* file_view, MPI_Datatype* local_view,

		const int mpi_nx, const int mpi_ny, const int mpi_nz,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const MPI_Comm comm_x, const MPI_Comm comm_y, const MPI_Comm comm_z);

	template< typename T >
	void mpi_io_read_datatype(MPI_Datatype* file_view, MPI_Datatype* local_view,

		const int mpi_nx, const int mpi_ny, const int mpi_nz,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const MPI_Comm comm_x, const MPI_Comm comm_y, const MPI_Comm comm_z);
	// -------------------------------------------------------------------------------------------- //

	// MPI-Binary //
	// [name, coordinates, time] - local, on head-rank
	// [array] - distributed
	// [id] - global, on all ranks in comm
	template< typename T >
	int mpi_write_binary(const std::string& filename,
		const char* mpi_datarep,
		const MPI_Comm comm, const int header_rank,
		const MPI_Comm comm_x, const MPI_Comm comm_y, const MPI_Comm comm_z,

		const T* xin, const char* name,

		const T* cx, const T* cy, const T* cz,
		const T* ex, const T* ey, const T* ez,
		const GridId< T >& id, const T time);

	template< typename T >
	int mpi_write_binary(const std::string& filename,
		const char* mpi_datarep,
		const MPI_Comm comm, const int header_rank,
		const MPI_Comm comm_x, const MPI_Comm comm_y, const MPI_Comm comm_z,

		const T* uin, const T* vin, const T* win,
		const char* uname, const char* vname, const char* wname,

		const T* cx, const T* cy, const T* cz,
		const T* ex, const T* ey, const T* ez,
		const GridId< T >& id, const T time);


	template< typename T >
	int mpi_read_binary(const std::string& filename,
		const char* mpi_datarep,
		const MPI_Comm comm, const int header_rank,
		const MPI_Comm comm_x, const MPI_Comm comm_y, const MPI_Comm comm_z,

		T** xout, char** name,

		T** cx, T** cy, T** cz, T** ex, T** ey, T** ez,
		GridId< T >& id, T* time);

	template< typename T >
	int mpi_read_binary(const std::string& filename,
		const char* mpi_datarep,
		const MPI_Comm comm, const int header_rank,
		const MPI_Comm comm_x, const MPI_Comm comm_y, const MPI_Comm comm_z,

		T** uout, T** vout, T** wout,
		char** uname, char** vname, char** wname,

		T** cx, T** cy, T** cz, T** ex, T** ey, T** ez,
		GridId< T >& id, T* time);
	// -------------------------------------------------------------------------------------------- //
}
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+
+// Tecplot //
+template< typename T >
+int nse::write_tecplot(const std::string& filename,
+	const T* out, const T* cx, const T* cy, const T* cz, // [array, coordinates]
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const char* title, const char* name,
+	const char* name_dimx, const char* name_dimy, const char* name_dimz,
+	const T time)
+{
+	FILE* ptr = fopen(filename.c_str(), "w");
+	if (ptr == NULL) return 0;
+
+	fprintf(ptr, " TITLE = \"%s [F(%s,%s,%s)]\"\n",
+		title, name_dimx, name_dimy, name_dimz);
+	fprintf(ptr, " VARIABLES = \"%s\", \"%s\", \"%s\", \"%s\"\n",
+		name_dimx, name_dimy, name_dimz, name);
+	fprintf(ptr, " ZONE I = %i, J = %i, K = %i, DATAPACKING = POINT, SOLUTIONTIME = %f\n",
+		ie - ib + 1, je - jb + 1, ke - kb + 1, time);
+
+	const int nyz = ny * nz;
+	int i, j, k, idx;
+
+	for (k = kb; k <= ke; k++)
+		for (j = jb; j <= je; j++)
+			for (i = ib; i <= ie; i++)
+			{
+				idx = i * nyz + j * nz + k;
+				fprintf(ptr, "%f %f %f %f\n", cx[i], cy[j], cz[k], out[idx]);
+			}
+
+	fclose(ptr);
+	return 1;
+}
+
+template< typename T >
+int nse::write_tecplot(const std::string& filename,
+	const T* uout, const T* vout, const T* wout,
+	const T* cx, const T* cy, const T* cz, // [array, coordinates]
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const char* title, const char* uname, const char* vname, const char* wname,
+	const char* name_dimx, const char* name_dimy, const char* name_dimz,
+	const T time)
+{
+	FILE* ptr = fopen(filename.c_str(), "w");
+	if (ptr == NULL) return 0;
+
+	fprintf(ptr, " TITLE = \"%s [F(%s,%s,%s)]\"\n",
+		title, name_dimx, name_dimy, name_dimz);
+	fprintf(ptr, " VARIABLES = \"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"\n",
+		name_dimx, name_dimy, name_dimz, uname, vname, wname);
+	fprintf(ptr, " ZONE I = %i, J = %i, K = %i, DATAPACKING = POINT, SOLUTIONTIME = %f\n",
+		ie - ib + 1, je - jb + 1, ke - kb + 1, time);
+
+	const int nyz = ny * nz;
+	int i, j, k, idx;
+
+	for (k = kb; k <= ke; k++)
+		for (j = jb; j <= je; j++)
+			for (i = ib; i <= ie; i++)
+			{
+				idx = i * nyz + j * nz + k;
+				fprintf(ptr, "%f %f %f %f %f %f\n", cx[i], cy[j], cz[k],
+					(T) 0.5 * (uout[idx] + uout[idx + nyz]),
+					(T) 0.5 * (vout[idx] + vout[idx + nz]),
+					(T) 0.5 * (wout[idx] + wout[idx + 1]));
+			}
+
+	fclose(ptr);
+	return 1;
+}
+// -------------------------------------------------------------------------------------------- //
+
+// Binary //
+template< typename T >
+int nse::write_binary_stamp(const std::string& filename,
+	const binStamp< int >& index_stamp,
+	const binStamp< double >& cpu_stamp,
+
+	const T* cx, const T* cy, const T* cz,
+	const T* ex, const T* ey, const T* ez,
+	const GridId<T>& id, const T time)
+{
+	FILE* ptr = fopen(filename.c_str(), "wb");
+	if (ptr == NULL) return 0;
+
+	int nstatus = 0;
+	// header, domain & grid id //
+	nstatus += fwrite(id.header, sizeof(int), GridId<T>::hsize, ptr);	// header data //
+	nstatus += fwrite(id.domain, sizeof(T), GridId<T>::dsize, ptr);	// domain definition //
+	nstatus += fwrite(id.grid, sizeof(int), GridId<T>::gsize, ptr);	// grid definition //
+
+	int nx, ny, nz, gcx, gcy, gcz;
+	id.grid_dim(1, &nx, &gcx);
+	id.grid_dim(2, &ny, &gcy);
+	id.grid_dim(3, &nz, &gcz);
+
+	// grid coordinates //
+	nstatus += fwrite(cx, sizeof(T), nx, ptr);
+	nstatus += fwrite(cy, sizeof(T), ny, ptr);
+	nstatus += fwrite(cz, sizeof(T), nz, ptr);
+	nstatus += fwrite(ex, sizeof(T), nx, ptr);
+	nstatus += fwrite(ey, sizeof(T), ny, ptr);
+	nstatus += fwrite(ez, sizeof(T), nz, ptr);
+
+	// time stamp //
+	T time_stamp = time;
+	nstatus += fwrite(&time_stamp, sizeof(T), 1, ptr);
+
+	nstatus += index_stamp.fwrite(ptr);	// index stamp //
+	nstatus += cpu_stamp.fwrite(ptr);	// cpu stamp //
+
+
+	fclose(ptr);
+
+	const int nstatus_check = GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
+		2 * (nx + ny + nz) + index_stamp.size + cpu_stamp.size + 3;
+	return (nstatus == nstatus_check);
+}
+
// Write a stamp-only binary file using named stamps: grid id, coordinates,
// time stamp, then the named index and cpu stamp records.
// Returns 1 only if every element was written.
template< typename T >
int nse::write_binary_stamp(const std::string& filename,
	const binNamedStamp< int >& index_stamp,
	const binNamedStamp< double >& cpu_stamp,

	const T* cx, const T* cy, const T* cz,
	const T* ex, const T* ey, const T* ez,
	const GridId<T>& id, const T time)
{
	FILE* ptr = fopen(filename.c_str(), "wb");
	if (ptr == NULL) return 0;

	// nstatus counts elements successfully written (fwrite return values)
	int nstatus = 0;
	// header, domain & grid id //
	nstatus += fwrite(id.header, sizeof(int), GridId<T>::hsize, ptr);	// header data //
	nstatus += fwrite(id.domain, sizeof(T), GridId<T>::dsize, ptr);	// domain definition //
	nstatus += fwrite(id.grid, sizeof(int), GridId<T>::gsize, ptr);	// grid definition //

	int nx, ny, nz, gcx, gcy, gcz;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);
	id.grid_dim(3, &nz, &gcz);

	// grid coordinates: cell centers (c*) then edges (e*) //
	nstatus += fwrite(cx, sizeof(T), nx, ptr);
	nstatus += fwrite(cy, sizeof(T), ny, ptr);
	nstatus += fwrite(cz, sizeof(T), nz, ptr);
	nstatus += fwrite(ex, sizeof(T), nx, ptr);
	nstatus += fwrite(ey, sizeof(T), ny, ptr);
	nstatus += fwrite(ez, sizeof(T), nz, ptr);

	// time stamp //
	T time_stamp = time;
	nstatus += fwrite(&time_stamp, sizeof(T), 1, ptr);

	nstatus += index_stamp.fwrite(ptr);	// index stamp //
	nstatus += cpu_stamp.fwrite(ptr);	// cpu stamp //

	fclose(ptr);

	// NOTE(review): "+ 1" covers the time stamp; this assumes
	// binNamedStamp::fwrite returns exactly get_record_size() elements -
	// confirm against bin-named-stamp.h
	const int nstatus_check = GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
		2 * (nx + ny + nz) + index_stamp.get_record_size() + cpu_stamp.get_record_size() + 1;
	return (nstatus == nstatus_check);
}
+
+template< typename T >
+int nse::write_binary(const std::string& filename,
+	const T* xin, const char* name,
+
+	const T* cx, const T* cy, const T* cz,
+	const T* ex, const T* ey, const T* ez,
+	const GridId<T>& id, const T time)
+{
+	FILE* ptr = fopen(filename.c_str(), "wb");
+	if (ptr == NULL) return 0;
+
+	int nstatus = 0;
+	// header, domain & grid id //
+	nstatus += fwrite(id.header, sizeof(int), GridId<T>::hsize, ptr);	// header data //
+	nstatus += fwrite(id.domain, sizeof(T), GridId<T>::dsize, ptr);	// domain definition //
+	nstatus += fwrite(id.grid, sizeof(int), GridId<T>::gsize, ptr);	// grid definition //
+
+	int nx, ny, nz, gcx, gcy, gcz;
+	id.grid_dim(1, &nx, &gcx);
+	id.grid_dim(2, &ny, &gcy);
+	id.grid_dim(3, &nz, &gcz);
+
+	// grid coordinates //
+	nstatus += fwrite(cx, sizeof(T), nx, ptr);
+	nstatus += fwrite(cy, sizeof(T), ny, ptr);
+	nstatus += fwrite(cz, sizeof(T), nz, ptr);
+	nstatus += fwrite(ex, sizeof(T), nx, ptr);
+	nstatus += fwrite(ey, sizeof(T), ny, ptr);
+	nstatus += fwrite(ez, sizeof(T), nz, ptr);
+
+	// time stamp //
+	T time_stamp = time;
+	nstatus += fwrite(&time_stamp, sizeof(T), 1, ptr);
+
+	// field definition //
+	int type = 0;               // scalar field
+	int name_length = strlen(name);
+
+	nstatus += fwrite(&type, sizeof(int), 1, ptr);
+	nstatus += fwrite(&name_length, sizeof(int), 1, ptr);
+	nstatus += fwrite(name, sizeof(char), name_length, ptr);
+
+	// main data //
+	nstatus += fwrite(xin, sizeof(T), nx * ny * nz, ptr);
+
+	fclose(ptr);
+
+	const int nstatus_check = GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
+		2 * (nx + ny + nz) + nx * ny * nz + name_length + 3;
+	return (nstatus == nstatus_check);
+}
+
+template< typename T >
+int nse::write_binary(const std::string& filename,
+	const T* uin, const T* vin, const T* win,
+	const char* uname, const char* vname, const char* wname,
+
+	const T* cx, const T* cy, const T* cz,
+	const T* ex, const T* ey, const T* ez,
+	const GridId<T>& id, const T time)
+{
+	FILE* ptr = fopen(filename.c_str(), "wb");
+	if (ptr == NULL) return 0;
+
+	int nstatus = 0;
+	// header, domain & grid id //
+	nstatus += fwrite(id.header, sizeof(int), GridId<T>::hsize, ptr);	// header data //
+	nstatus += fwrite(id.domain, sizeof(T), GridId<T>::dsize, ptr);	// domain definition //
+	nstatus += fwrite(id.grid, sizeof(int), GridId<T>::gsize, ptr);	// grid definition //
+
+	int nx, ny, nz, gcx, gcy, gcz;
+	id.grid_dim(1, &nx, &gcx);
+	id.grid_dim(2, &ny, &gcy);
+	id.grid_dim(3, &nz, &gcz);
+
+	// grid coordinates //
+	nstatus += fwrite(cx, sizeof(T), nx, ptr);
+	nstatus += fwrite(cy, sizeof(T), ny, ptr);
+	nstatus += fwrite(cz, sizeof(T), nz, ptr);
+	nstatus += fwrite(ex, sizeof(T), nx, ptr);
+	nstatus += fwrite(ey, sizeof(T), ny, ptr);
+	nstatus += fwrite(ez, sizeof(T), nz, ptr);
+
+	// time stamp //
+	T time_stamp = time;
+	nstatus += fwrite(&time_stamp, sizeof(T), 1, ptr);
+
+	// field definition //
+	int type = 1;               // vector field
+	int name_length[3];
+	name_length[0] = strlen(uname);
+	name_length[1] = strlen(vname);
+	name_length[2] = strlen(wname);
+
+	nstatus += fwrite(&type, sizeof(int), 1, ptr);
+	nstatus += fwrite(name_length, sizeof(int), 3, ptr);
+	nstatus += fwrite(uname, sizeof(char), name_length[0], ptr);
+	nstatus += fwrite(vname, sizeof(char), name_length[1], ptr);
+	nstatus += fwrite(wname, sizeof(char), name_length[2], ptr);
+
+	// main data //
+	nstatus += fwrite(uin, sizeof(T), nx * ny * nz, ptr);
+	nstatus += fwrite(vin, sizeof(T), nx * ny * nz, ptr);
+	nstatus += fwrite(win, sizeof(T), nx * ny * nz, ptr);
+
+	fclose(ptr);
+
+	const int nstatus_check = GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
+		2 * (nx + ny + nz) + 3 * nx * ny * nz +
+		name_length[0] + name_length[1] + name_length[2] + 5;
+	return (nstatus == nstatus_check);
+}
+
// Reads a restart "stamp" file into plain binStamp containers.
// Coordinate arrays (*cx..*ez) are allocated here; the caller owns them on
// success. fread_sp converts on read from the on-file precision
// (id.data_type_size()) to T. With _USE_DEPRECATED_WST_FORMAT only the z
// coordinates are present on file.
// Returns 1 on a complete, consistent read; 0 otherwise (arrays freed).
template< typename T >
int nse::read_binary_stamp(const std::string& filename,
	binStamp< int >& index_stamp,
	binStamp< double >& cpu_stamp,

	T** cx, T** cy, T** cz, T** ex, T** ey, T** ez,
	GridId<T>& id, T* time)
{
	FILE* ptr = fopen(filename.c_str(), "rb");
	if (ptr == NULL) return 0;

	int nstatus = 0;	// accumulated element count
	// header, domain & grid id //
	nstatus += fread(id.header, sizeof(int), GridId<T>::hsize_r3d, ptr);
	if (!id.check(3)) {	// check id failed //
		fclose(ptr);
		return 0;
	}

	nstatus += fread_sp(id.domain, id.data_type_size(), GridId<T>::dsize_r3d, ptr);
	nstatus += fread(id.grid, sizeof(int), GridId<T>::gsize_r3d, ptr);

	int nx, ny, nz, gcx, gcy, gcz;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);
	id.grid_dim(3, &nz, &gcz);

	// grid coordinates //
	allocate(cx, cy, cz, nx, ny, nz);
	allocate(ex, ey, ez, nx, ny, nz);

#ifndef _USE_DEPRECATED_WST_FORMAT
	nstatus += fread_sp((*cx), id.data_type_size(), nx, ptr);
	nstatus += fread_sp((*cy), id.data_type_size(), ny, ptr);
#endif
	nstatus += fread_sp((*cz), id.data_type_size(), nz, ptr);
#ifndef _USE_DEPRECATED_WST_FORMAT
	nstatus += fread_sp((*ex), id.data_type_size(), nx, ptr);
	nstatus += fread_sp((*ey), id.data_type_size(), ny, ptr);
#endif
	nstatus += fread_sp((*ez), id.data_type_size(), nz, ptr);

	nstatus += fread_sp(time, id.data_type_size(), 1, ptr);	// time stamp

															// index stamp //
#ifndef _USE_DEPRECATED_WST_FORMAT
	nstatus += index_stamp.fread(ptr);
#else
	// deprecated format: extra argument presumably fixes the stamp layout/size
	// -- TODO confirm against binStamp::fread overloads
	nstatus += index_stamp.fread(ptr, 1);
#endif

	// cpu stamp //
	nstatus += cpu_stamp.fread(ptr);

	fclose(ptr);
	id.reset_data_type_size();	// re-setting data type 

	// expected totals differ between formats: the deprecated layout stores
	// only z coordinates (2 * nz) and one fewer bookkeeping element //
	const int nstatus_check =
		GridId<T>::hsize_r3d + GridId<T>::dsize_r3d + GridId<T>::gsize_r3d +
		index_stamp.size + cpu_stamp.size + 2 +
#ifndef _USE_DEPRECATED_WST_FORMAT
		2 * (nx + ny + nz) + 1;
#else
		2 * nz;
#endif
	if (nstatus == nstatus_check) return 1;

	deallocate((*cx), (*cy), (*cz));
	deallocate((*ex), (*ey), (*ez));
	return 0;
}
+
// Reads a restart "stamp" file into named stamp containers (binNamedStamp),
// mirroring the binNamedStamp overload of write_binary_stamp().
// Coordinate arrays (*cx..*ez) are allocated here; the caller owns them on
// success. fread_sp converts from the on-file precision to T.
// Returns 1 on a complete, consistent read; 0 otherwise (arrays freed).
template< typename T >
int nse::read_binary_stamp(const std::string& filename,
	binNamedStamp< int >& index_stamp,
	binNamedStamp< double >& cpu_stamp,

	T** cx, T** cy, T** cz, T** ex, T** ey, T** ez,
	GridId<T>& id, T* time)
{
	FILE* ptr = fopen(filename.c_str(), "rb");
	if (ptr == NULL) return 0;

	int nstatus = 0;	// accumulated element count
	// header, domain & grid id //
	nstatus += fread(id.header, sizeof(int), GridId<T>::hsize_r3d, ptr);
	if (!id.check(3)) {	// check id failed //
		fclose(ptr);
		return 0;
	}

	nstatus += fread_sp(id.domain, id.data_type_size(), GridId<T>::dsize_r3d, ptr);
	nstatus += fread(id.grid, sizeof(int), GridId<T>::gsize_r3d, ptr);

	int nx, ny, nz, gcx, gcy, gcz;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);
	id.grid_dim(3, &nz, &gcz);

	// grid coordinates //
	allocate(cx, cy, cz, nx, ny, nz);
	allocate(ex, ey, ez, nx, ny, nz);

	nstatus += fread_sp((*cx), id.data_type_size(), nx, ptr);
	nstatus += fread_sp((*cy), id.data_type_size(), ny, ptr);
	nstatus += fread_sp((*cz), id.data_type_size(), nz, ptr);

	nstatus += fread_sp((*ex), id.data_type_size(), nx, ptr);
	nstatus += fread_sp((*ey), id.data_type_size(), ny, ptr);
	nstatus += fread_sp((*ez), id.data_type_size(), nz, ptr);

	nstatus += fread_sp(time, id.data_type_size(), 1, ptr);	// time stamp

	// index stamp //
	nstatus += index_stamp.fread(ptr);
	// cpu stamp //
	nstatus += cpu_stamp.fread(ptr);

	fclose(ptr);
	id.reset_data_type_size();	// re-setting data type 

	// expected total: id + 2 coordinate sets + stamp records + time (1);
	// get_record_size() accounts for each stamp's on-file layout //
	const int nstatus_check =
		GridId<T>::hsize_r3d + GridId<T>::dsize_r3d + GridId<T>::gsize_r3d +
		index_stamp.get_record_size() + cpu_stamp.get_record_size() +
		2 * (nx + ny + nz) + 1;
	if (nstatus == nstatus_check) return 1;

	deallocate((*cx), (*cy), (*cz));
	deallocate((*ex), (*ey), (*ez));
	return 0;
}
+
+template< typename T >
+int nse::read_binary(const std::string& filename,
+	T** xout, char** name,
+
+	T** cx, T** cy, T** cz, T** ex, T** ey, T** ez,
+	GridId<T>& id, T* time)
+{
+	FILE* ptr = fopen(filename.c_str(), "rb");
+	if (ptr == NULL) return 0;
+
+	int nstatus = 0;
+	// header, domain & grid id //
+	nstatus += fread(id.header, sizeof(int), GridId<T>::hsize_r3d, ptr);
+	if (!id.check(3)) {	// check id failed //
+		fclose(ptr);
+		return 0;
+	}
+
+	nstatus += fread_sp(id.domain, id.data_type_size(), GridId<T>::dsize_r3d, ptr);
+	nstatus += fread(id.grid, sizeof(int), GridId<T>::gsize_r3d, ptr);
+
+	int nx, ny, nz, gcx, gcy, gcz;
+	id.grid_dim(1, &nx, &gcx);
+	id.grid_dim(2, &ny, &gcy);
+	id.grid_dim(3, &nz, &gcz);
+
+	// grid coordinates //
+	allocate(cx, cy, cz, nx, ny, nz);
+	allocate(ex, ey, ez, nx, ny, nz);
+
+#ifndef _USE_DEPRECATED_WST_FORMAT
+	nstatus += fread_sp((*cx), id.data_type_size(), nx, ptr);
+	nstatus += fread_sp((*cy), id.data_type_size(), ny, ptr);
+#endif
+	nstatus += fread_sp((*cz), id.data_type_size(), nz, ptr);
+#ifndef _USE_DEPRECATED_WST_FORMAT
+	nstatus += fread_sp((*ex), id.data_type_size(), nx, ptr);
+	nstatus += fread_sp((*ey), id.data_type_size(), ny, ptr);
+#endif
+	nstatus += fread_sp((*ez), id.data_type_size(), nz, ptr);
+
+	nstatus += fread_sp(time, id.data_type_size(), 1, ptr);	// time stamp
+
+															// field definition //
+	int field_type;
+	nstatus += fread(&field_type, sizeof(int), 1, ptr);
+	if (field_type != 0) {	// not scalar field //
+		deallocate((*cx), (*cy), (*cz));
+		deallocate((*ex), (*ey), (*ez));
+		fclose(ptr);
+		return 0;
+	}
+
+	int name_length;
+	nstatus += fread(&name_length, sizeof(int), 1, ptr);
+
+	(*name) = new char[name_length + 1];
+	nstatus += fread((*name), sizeof(char), name_length, ptr);
+	(*name)[name_length] = '\0';
+
+	// main data //
+	allocate(xout, nx * ny * nz);
+	nstatus += fread_sp((*xout), id.data_type_size(), nx * ny * nz, ptr);
+
+	fclose(ptr);
+	id.reset_data_type_size();	// re-setting data type
+
+	const int nstatus_check =
+		GridId<T>::hsize_r3d + GridId<T>::dsize_r3d + GridId<T>::gsize_r3d +
+		nx * ny * nz + name_length + 3 +
+#ifndef _USE_DEPRECATED_WST_FORMAT
+		2 * (nx + ny + nz);
+#else
+		2 * nz;
+#endif
+	if (nstatus == nstatus_check) return 1;
+
+	deallocate((*cx), (*cy), (*cz));
+	deallocate((*ex), (*ey), (*ez));
+	delete[](*name);
+	deallocate((*xout));
+	return 0;
+}
+
+template< typename T >
+int nse::read_binary(const std::string& filename,
+	T** uout, T** vout, T** wout,
+	char** uname, char** vname, char** wname,
+
+	T** cx, T** cy, T** cz, T** ex, T** ey, T** ez,
+	GridId<T>& id, T* time)
+{
+	FILE* ptr = fopen(filename.c_str(), "rb");
+	if (ptr == NULL) return 0;
+
+	int nstatus = 0;
+	// header, domain & grid id //
+	nstatus += fread(id.header, sizeof(int), GridId<T>::hsize_r3d, ptr);
+	if (!id.check(3)) {	// check id failed //
+		fclose(ptr);
+		return 0;
+	}
+
+	nstatus += fread_sp(id.domain, id.data_type_size(), GridId<T>::dsize_r3d, ptr);
+	nstatus += fread(id.grid, sizeof(int), GridId<T>::gsize_r3d, ptr);
+
+	int nx, ny, nz, gcx, gcy, gcz;
+	id.grid_dim(1, &nx, &gcx);
+	id.grid_dim(2, &ny, &gcy);
+	id.grid_dim(3, &nz, &gcz);
+
+	// grid coordinates //
+	allocate(cx, cy, cz, nx, ny, nz);
+	allocate(ex, ey, ez, nx, ny, nz);
+
+#ifndef _USE_DEPRECATED_WST_FORMAT
+	nstatus += fread_sp((*cx), id.data_type_size(), nx, ptr);
+	nstatus += fread_sp((*cy), id.data_type_size(), ny, ptr);
+#endif
+	nstatus += fread_sp((*cz), id.data_type_size(), nz, ptr);
+#ifndef _USE_DEPRECATED_WST_FORMAT
+	nstatus += fread_sp((*ex), id.data_type_size(), nx, ptr);
+	nstatus += fread_sp((*ey), id.data_type_size(), ny, ptr);
+#endif
+	nstatus += fread_sp((*ez), id.data_type_size(), nz, ptr);
+
+	nstatus += fread_sp(time, id.data_type_size(), 1, ptr);	// time stamp
+
+															// field definition //
+	int field_type;
+	nstatus += fread(&field_type, sizeof(int), 1, ptr);
+	if (field_type != 1) {	// not vector field //
+		deallocate((*cx), (*cy), (*cz));
+		deallocate((*ex), (*ey), (*ez));
+		fclose(ptr);
+		return 0;
+	}
+
+	int name_length[3];
+	nstatus += fread(name_length, sizeof(int), 3, ptr);
+
+	(*uname) = new char[name_length[0] + 1];
+	(*vname) = new char[name_length[1] + 1];
+	(*wname) = new char[name_length[2] + 1];
+	nstatus += fread((*uname), sizeof(char), name_length[0], ptr);
+	nstatus += fread((*vname), sizeof(char), name_length[1], ptr);
+	nstatus += fread((*wname), sizeof(char), name_length[2], ptr);
+	(*uname)[name_length[0]] = '\0';
+	(*vname)[name_length[1]] = '\0';
+	(*wname)[name_length[2]] = '\0';
+
+	// main data //
+	allocate(uout, vout, wout, nx * ny * nz);
+	nstatus += fread_sp((*uout), id.data_type_size(), nx * ny * nz, ptr);
+	nstatus += fread_sp((*vout), id.data_type_size(), nx * ny * nz, ptr);
+	nstatus += fread_sp((*wout), id.data_type_size(), nx * ny * nz, ptr);
+
+	fclose(ptr);
+	id.reset_data_type_size();	// re-setting data type
+
+	const int nstatus_check =
+		GridId<T>::hsize_r3d + GridId<T>::dsize_r3d + GridId<T>::gsize_r3d +
+		3 * nx * ny * nz + name_length[0] + name_length[1] + name_length[2] + 5 +
+#ifndef _USE_DEPRECATED_WST_FORMAT
+		2 * (nx + ny + nz);
+#else
+		2 * nz;
+#endif
+	if (nstatus == nstatus_check) return 1;
+
+	deallocate((*cx), (*cy), (*cz));
+	deallocate((*ex), (*ey), (*ez));
+	delete[](*uname); delete[](*vname); delete[](*wname);
+	deallocate((*uout), (*vout), (*wout));
+	return 0;
+}
+// -------------------------------------------------------------------------------------------- //
+
+// MPI-I/O 3D datatype //
+template< typename T >
+void nse::mpi_io_write_datatype(MPI_Datatype* file_view, MPI_Datatype* local_view,
+
+	const int mpi_nx, const int mpi_ny, const int mpi_nz,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const MPI_Comm comm_x, const MPI_Comm comm_y, const MPI_Comm comm_z)
+{
+	const int mpi_dim_size[3] = { mpi_nx, mpi_ny, mpi_nz };
+	const int dim_size[3] = { nx, ny, nz };
+	const int nghost[3] = { gcx, gcy, gcz };
+	const MPI_Comm comm[3] = { comm_x, comm_y, comm_z };
+
+	mpi_io_write_datatype< T, 3 >(file_view, local_view,
+		mpi_dim_size, dim_size, nghost, comm);
+}
+
+template< typename T >
+void nse::mpi_io_read_datatype(MPI_Datatype* file_view, MPI_Datatype* local_view,
+
+	const int mpi_nx, const int mpi_ny, const int mpi_nz,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const MPI_Comm comm_x, const MPI_Comm comm_y, const MPI_Comm comm_z)
+{
+	const int mpi_dim_size[3] = { mpi_nx, mpi_ny, mpi_nz };
+	const int dim_size[3] = { nx, ny, nz };
+	const int nghost[3] = { gcx, gcy, gcz };
+	const MPI_Comm comm[3] = { comm_x, comm_y, comm_z };
+
+	mpi_io_read_datatype< T, 3 >(file_view, local_view,
+		mpi_dim_size, dim_size, nghost, comm);
+}
+// -------------------------------------------------------------------------------------------- //
+
+// MPI-Binary //
// MPI-parallel write of a single scalar field.
// The header rank alone writes the file header (grid id, coordinates, time
// stamp, field descriptor) with explicit MPI_File_write calls; then all ranks
// collectively write their sub-domain through a file/local datatype view
// placed immediately after the header. nstatus counts elements on the header
// rank and is broadcast so every rank computes the same success flag.
// Returns 1 on full success, 0 otherwise.
template< typename T >
int nse::mpi_write_binary(const std::string& filename,
	const char* mpi_datarep,
	const MPI_Comm comm, const int header_rank,
	const MPI_Comm comm_x, const MPI_Comm comm_y, const MPI_Comm comm_z,

	const T* xin, const char* name,

	const T* cx, const T* cy, const T* cz,
	const T* ex, const T* ey, const T* ez,
	const GridId< T >& id, const T time)
{
	int nx, ny, nz, gcx, gcy, gcz;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);
	id.grid_dim(3, &nz, &gcz);

	// byte size of the header: grid id + 2 coordinate sets + name +
	// type & name-length ints + time stamp; used below as the view offset //
	MPI_Offset header_size = GridId< T >::id_byte_size +
		2 * (nx + ny + nz) * sizeof(T) +
		strlen(name) * sizeof(char) +
		2 * sizeof(int) + sizeof(T);

	MPI_File ptr;
	int status = MPI_File_open(comm, (char*)filename.c_str(),
		MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &ptr);
	if (status != MPI_SUCCESS) return 0;	// MPI file open failure

	int nstatus = 0;
	int rank;
	MPI_Comm_rank(comm, &rank);
	if (rank == header_rank) {	// header

								// header, domain & grid id //
		status = MPI_File_write(ptr, (void*)id.header, GridId< T >::hsize, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += GridId< T >::hsize;
		status = MPI_File_write(ptr, (void*)id.domain, GridId< T >::dsize, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += GridId< T >::dsize;
		status = MPI_File_write(ptr, (void*)id.grid, GridId< T >::gsize, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += GridId< T >::gsize;

		// grid coordinates //
		status = MPI_File_write(ptr, (void*)cx, nx, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += nx;
		status = MPI_File_write(ptr, (void*)cy, ny, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += ny;
		status = MPI_File_write(ptr, (void*)cz, nz, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += nz;
		status = MPI_File_write(ptr, (void*)ex, nx, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += nx;
		status = MPI_File_write(ptr, (void*)ey, ny, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += ny;
		status = MPI_File_write(ptr, (void*)ez, nz, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += nz;

		// time stamp //
		T time_stamp = time;
		status = MPI_File_write(ptr, &time_stamp, 1, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += 1;

		// field definition //
		int type = 0;		// scalar field
		int name_length = strlen(name);

		status = MPI_File_write(ptr, &type, 1, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += 1;
		status = MPI_File_write(ptr, &name_length, 1, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += 1;
		status = MPI_File_write(ptr, (void*)name, name_length, MPI_CHAR, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += name_length;
	}
	// make the header visible to all ranks and share the header-rank count //
	MPI_File_sync(ptr);
	mpi_broadcast(&nstatus, 1, header_rank, comm);

	// local (per-rank) sub-domain sizes //
	int pnx = par_local_size_comm(nx, gcx, comm_x);
	int pny = par_local_size_comm(ny, gcy, comm_y);
	int pnz = par_local_size_comm(nz, gcz, comm_z);

	// main data description //
	MPI_Datatype file_view, local_view;
	mpi_io_write_datatype< T >(&file_view, &local_view,
		nx, ny, nz, pnx, pny, pnz, gcx, gcy, gcz,
		comm_x, comm_y, comm_z);

	// main data //
	MPI_File_set_view(ptr, header_size, mpi_type< T >(),
		file_view, (char*)mpi_datarep, MPI_INFO_NULL);

	status = MPI_File_write_all(ptr, (void*)xin, 1, local_view, MPI_STATUS_IGNORE);
	if (status == MPI_SUCCESS) nstatus += nx * ny * nz;

	MPI_File_close(&ptr);
	MPI_Type_free(&file_view);
	MPI_Type_free(&local_view);

	// expected count mirrors the serial write_binary() layout //
	const int nstatus_check = GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
		2 * (nx + ny + nz) + nx * ny * nz + strlen(name) + 3;
	return (nstatus == nstatus_check);
}
+
// MPI-parallel write of a 3-component vector field.
// The header rank alone writes the file header (grid id, coordinates, time
// stamp, field descriptor with three names); then all ranks collectively
// write their u, v, w sub-domains through a file/local datatype view placed
// immediately after the header. nstatus counts elements on the header rank
// and is broadcast so every rank computes the same success flag.
// Returns 1 on full success, 0 otherwise.
template< typename T >
int nse::mpi_write_binary(const std::string& filename,
	const char* mpi_datarep,
	const MPI_Comm comm, const int header_rank,
	const MPI_Comm comm_x, const MPI_Comm comm_y, const MPI_Comm comm_z,

	const T* uin, const T* vin, const T* win,
	const char* uname, const char* vname, const char* wname,

	const T* cx, const T* cy, const T* cz,
	const T* ex, const T* ey, const T* ez,
	const GridId< T >& id, const T time)
{
	int nx, ny, nz, gcx, gcy, gcz;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);
	id.grid_dim(3, &nz, &gcz);

	// byte size of the header: grid id + 2 coordinate sets + 3 names +
	// type & three name-length ints + time stamp; used as the view offset //
	MPI_Offset header_size = GridId< T >::id_byte_size +
		2 * (nx + ny + nz) * sizeof(T) +
		(strlen(uname) + strlen(vname) + strlen(wname)) * sizeof(char) +
		4 * sizeof(int) + sizeof(T);

	MPI_File ptr;
	int status = MPI_File_open(comm, (char*)filename.c_str(),
		MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &ptr);
	if (status != MPI_SUCCESS) return 0;	// MPI file open failure

	int nstatus = 0;
	int rank;
	MPI_Comm_rank(comm, &rank);
	if (rank == header_rank) {	// header

								// header, domain & grid id //
		status = MPI_File_write(ptr, (void*)id.header, GridId< T >::hsize, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += GridId< T >::hsize;
		status = MPI_File_write(ptr, (void*)id.domain, GridId< T >::dsize, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += GridId< T >::dsize;
		status = MPI_File_write(ptr, (void*)id.grid, GridId< T >::gsize, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += GridId< T >::gsize;

		// grid coordinates //
		status = MPI_File_write(ptr, (void*)cx, nx, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += nx;
		status = MPI_File_write(ptr, (void*)cy, ny, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += ny;
		status = MPI_File_write(ptr, (void*)cz, nz, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += nz;
		status = MPI_File_write(ptr, (void*)ex, nx, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += nx;
		status = MPI_File_write(ptr, (void*)ey, ny, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += ny;
		status = MPI_File_write(ptr, (void*)ez, nz, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += nz;

		// time stamp //
		T time_stamp = time;
		status = MPI_File_write(ptr, &time_stamp, 1, mpi_type< T >(), MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += 1;

		// field definition //
		int type = 1;		// vector field
		int name_length[3];
		name_length[0] = strlen(uname);
		name_length[1] = strlen(vname);
		name_length[2] = strlen(wname);

		status = MPI_File_write(ptr, &type, 1, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += 1;
		status = MPI_File_write(ptr, name_length, 3, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += 3;
		status = MPI_File_write(ptr, (void*)uname, name_length[0], MPI_CHAR, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += name_length[0];
		status = MPI_File_write(ptr, (void*)vname, name_length[1], MPI_CHAR, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += name_length[1];
		status = MPI_File_write(ptr, (void*)wname, name_length[2], MPI_CHAR, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += name_length[2];
	}
	// make the header visible to all ranks and share the header-rank count //
	MPI_File_sync(ptr);
	mpi_broadcast(&nstatus, 1, header_rank, comm);

	// local (per-rank) sub-domain sizes //
	int pnx = par_local_size_comm(nx, gcx, comm_x);
	int pny = par_local_size_comm(ny, gcy, comm_y);
	int pnz = par_local_size_comm(nz, gcz, comm_z);

	// main data description //
	MPI_Datatype file_view, local_view;
	mpi_io_write_datatype< T >(&file_view, &local_view,
		nx, ny, nz, pnx, pny, pnz, gcx, gcy, gcz,
		comm_x, comm_y, comm_z);

	// main data: u, v, w written back-to-back through the same view //
	MPI_File_set_view(ptr, header_size, mpi_type< T >(),
		file_view, (char*)mpi_datarep, MPI_INFO_NULL);

	status = MPI_File_write_all(ptr, (void*)uin, 1, local_view, MPI_STATUS_IGNORE);
	if (status == MPI_SUCCESS) nstatus += nx * ny * nz;
	status = MPI_File_write_all(ptr, (void*)vin, 1, local_view, MPI_STATUS_IGNORE);
	if (status == MPI_SUCCESS) nstatus += nx * ny * nz;
	status = MPI_File_write_all(ptr, (void*)win, 1, local_view, MPI_STATUS_IGNORE);
	if (status == MPI_SUCCESS) nstatus += nx * ny * nz;

	MPI_File_close(&ptr);
	MPI_Type_free(&file_view);
	MPI_Type_free(&local_view);

	// expected count mirrors the serial write_binary() vector layout //
	const int nstatus_check = GridId<T>::hsize + GridId<T>::dsize + GridId<T>::gsize +
		2 * (nx + ny + nz) + 3 * nx * ny * nz +
		strlen(uname) + strlen(vname) + strlen(wname) + 5;
	return (nstatus == nstatus_check);
}
+
// MPI-parallel read of a single scalar field.
// The header rank alone parses and validates the header (grid id,
// coordinates, time stamp, field descriptor), accumulating header_offset --
// the byte size of the precision-dependent part, which is only known after
// the on-file precision (id.data_type_size()) has been read. Results are
// broadcast, every rank sets a file view past the header and collectively
// reads its sub-domain with precision conversion (mpi_fread_all_sp).
// Allocates coordinates and name on the header rank only; (*xout) on all
// ranks. Returns 1 on a complete, consistent read, 0 otherwise.
template< typename T >
int nse::mpi_read_binary(const std::string& filename,
	const char* mpi_datarep,
	const MPI_Comm comm, const int header_rank,
	const MPI_Comm comm_x, const MPI_Comm comm_y, const MPI_Comm comm_z,

	T** xout, char** name,

	T** cx, T** cy, T** cz, T** ex, T** ey, T** ez,
	GridId< T >& id, T* time)
{
	// fixed-size part of the header; the precision-dependent part is added
	// below via header_offset //
	MPI_Offset header_size = GridId<T>::id_byte_size_r3d +
		2 * sizeof(int);

	MPI_File ptr;
	int status = MPI_File_open(comm, (char*)filename.c_str(),
		MPI_MODE_RDONLY, MPI_INFO_NULL, &ptr);
	if (status != MPI_SUCCESS) return 0;	// MPI file open failure

	int name_length;
	int nstatus = 0;
	int rank, header_offset = 0, status_id = 0;
	MPI_Comm_rank(comm, &rank);
	if (rank == header_rank) {

		// header, domain & grid id //
		status = MPI_File_read(ptr, id.header, GridId<T>::hsize_r3d, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += GridId<T>::hsize_r3d;
		if (id.check(3)) {	// check id //
			status = mpi_fread_sp(ptr, id.domain, GridId<T>::dsize_r3d, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += GridId<T>::dsize_r3d;
			status = MPI_File_read(ptr, id.grid, GridId<T>::gsize_r3d, MPI_INT, MPI_STATUS_IGNORE);
			if (status == MPI_SUCCESS) nstatus += GridId<T>::gsize_r3d;

			header_offset += GridId<T>::dsize_r3d *
				(id.data_type_size() - sizeof(T));	// correcting header size due to grid id

													// grid parameters //
			int nx, ny, nz, gcx, gcy, gcz;
			id.grid_dim(1, &nx, &gcx);
			id.grid_dim(2, &ny, &gcy);
			id.grid_dim(3, &nz, &gcz);

			// grid coordinates //
			allocate(cx, cy, cz, nx, ny, nz);
			allocate(ex, ey, ez, nx, ny, nz);

#ifndef _USE_DEPRECATED_WST_FORMAT
			status = mpi_fread_sp(ptr, (*cx), nx, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += nx;
			status = mpi_fread_sp(ptr, (*cy), ny, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += ny;
#endif
			status = mpi_fread_sp(ptr, (*cz), nz, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += nz;
#ifndef _USE_DEPRECATED_WST_FORMAT
			status = mpi_fread_sp(ptr, (*ex), nx, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += nx;
			status = mpi_fread_sp(ptr, (*ey), ny, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += ny;
#endif
			status = mpi_fread_sp(ptr, (*ez), nz, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += nz;
#ifndef _USE_DEPRECATED_WST_FORMAT
			header_offset += 2 * (nx + ny + nz) * id.data_type_size();
#else
			header_offset += 2 * nz * id.data_type_size();
#endif

			// time stamp //
			status = mpi_fread_sp(ptr, time, 1, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += 1;
			header_offset += id.data_type_size();

			// field definition //
			int field_type;
			status = MPI_File_read(ptr, &field_type, 1, MPI_INT, MPI_STATUS_IGNORE);
			if (status == MPI_SUCCESS) nstatus += 1;
			if (field_type == 0)	// scalar field //
			{
				status = MPI_File_read(ptr, &name_length, 1, MPI_INT, MPI_STATUS_IGNORE);
				if (status == MPI_SUCCESS) nstatus += 1;
				header_offset += name_length * sizeof(char);

				(*name) = new char[name_length + 1];
				status = MPI_File_read(ptr, (*name), name_length, MPI_CHAR, MPI_STATUS_IGNORE);
				if (status == MPI_SUCCESS) nstatus += name_length;
				(*name)[name_length] = '\0';

				status_id = 1;
			}
			else
			{
				deallocate((*cx), (*cy), (*cz));
				deallocate((*ex), (*ey), (*ez));
			}
		}
	}
	mpi_broadcast(&status_id, 1, header_rank, comm);

	if (!status_id) {	// bad id or wrong field type: abort on all ranks //
		MPI_File_close(&ptr);
		return 0;
	}
	// read status - OK - //
	id.mpi_broadcast(header_rank, comm);

	mpi_broadcast(&nstatus, 1, header_rank, comm);
	mpi_broadcast(&name_length, 1, header_rank, comm);

	// correct header size //
	mpi_broadcast(&header_offset, 1, header_rank, comm);
	header_size += (MPI_Offset)header_offset;

	// main data description //
	int nx, ny, nz, gcx, gcy, gcz;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);
	id.grid_dim(3, &nz, &gcz);

	int pnx = par_local_size_comm(nx, gcx, comm_x);
	int pny = par_local_size_comm(ny, gcy, comm_y);
	int pnz = par_local_size_comm(nz, gcz, comm_z);

	// NOTE(review): if data_type_size() matched neither float nor double the
	// views below would stay uninitialized -- id.check(3) presumably rules
	// that out; confirm against GridId //
	MPI_Datatype file_view, local_view;
	if (id.data_type_size() == sizeof(float)) {	// input=[float]
		mpi_io_read_datatype< float >(&file_view, &local_view,
			nx, ny, nz, pnx, pny, pnz, gcx, gcy, gcz,
			comm_x, comm_y, comm_z);

		MPI_File_set_view(ptr, header_size, mpi_type< float >(),
			file_view, (char*)mpi_datarep, MPI_INFO_NULL);
	}
	if (id.data_type_size() == sizeof(double)) {	// input=[double]
		mpi_io_read_datatype< double >(&file_view, &local_view,
			nx, ny, nz, pnx, pny, pnz, gcx, gcy, gcz,
			comm_x, comm_y, comm_z);

		MPI_File_set_view(ptr, header_size, mpi_type< double >(),
			file_view, (char*)mpi_datarep, MPI_INFO_NULL);
	}

	// main data //
	allocate(xout, pnx * pny * pnz);
	status = mpi_fread_all_sp(ptr, (*xout), pnx * pny * pnz,
		local_view, id.data_type_size());
	if (status == MPI_SUCCESS) nstatus += nx * ny * nz;

	MPI_File_close(&ptr);
	MPI_Type_free(&file_view);
	MPI_Type_free(&local_view);

	id.reset_data_type_size();	// re-setting data type

	const int nstatus_check =
		GridId<T>::hsize_r3d + GridId<T>::dsize_r3d + GridId<T>::gsize_r3d +
		nx * ny * nz + name_length + 3 +
#ifndef _USE_DEPRECATED_WST_FORMAT
		2 * (nx + ny + nz);
#else
		2 * nz;
#endif
	if (nstatus == nstatus_check) return 1;

	// failure: free header-rank allocations (only that rank made them) //
	if (rank == header_rank) {
		deallocate((*cx), (*cy), (*cz));
		deallocate((*ex), (*ey), (*ez));
		delete[](*name);
	}
	deallocate((*xout));
	return 0;
}
+
// MPI-parallel read of a 3-component vector field.
// Same scheme as the scalar overload: the header rank parses/validates the
// header (tracking header_offset, the byte size of the precision-dependent
// part), results are broadcast, then all ranks set a file view past the
// header and collectively read u, v, w with precision conversion.
// Allocates coordinates and names on the header rank only; component arrays
// on all ranks. Returns 1 on a complete, consistent read, 0 otherwise.
template< typename T >
int nse::mpi_read_binary(const std::string& filename,
	const char* mpi_datarep,
	const MPI_Comm comm, const int header_rank,
	const MPI_Comm comm_x, const MPI_Comm comm_y, const MPI_Comm comm_z,

	T** uout, T** vout, T** wout,
	char** uname, char** vname, char** wname,

	T** cx, T** cy, T** cz, T** ex, T** ey, T** ez,
	GridId< T >& id, T* time)
{
	// fixed-size part of the header; the precision-dependent part is added
	// below via header_offset //
	MPI_Offset header_size = GridId<T>::id_byte_size_r3d +
		4 * sizeof(int);

	MPI_File ptr;
	int status = MPI_File_open(comm, (char*)filename.c_str(),
		MPI_MODE_RDONLY, MPI_INFO_NULL, &ptr);
	if (status != MPI_SUCCESS) return 0;	// MPI file open failure

	int name_length[3];
	int nstatus = 0;
	int rank, header_offset = 0, status_id = 0;
	MPI_Comm_rank(comm, &rank);
	if (rank == header_rank) {

		// header, domain & grid id //
		status = MPI_File_read(ptr, id.header, GridId<T>::hsize_r3d, MPI_INT, MPI_STATUS_IGNORE);
		if (status == MPI_SUCCESS) nstatus += GridId<T>::hsize_r3d;
		if (id.check(3)) {	// check id //
			status = mpi_fread_sp(ptr, id.domain, GridId<T>::dsize_r3d, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += GridId<T>::dsize_r3d;
			status = MPI_File_read(ptr, id.grid, GridId<T>::gsize_r3d, MPI_INT, MPI_STATUS_IGNORE);
			if (status == MPI_SUCCESS) nstatus += GridId<T>::gsize_r3d;

			header_offset += GridId<T>::dsize_r3d *
				(id.data_type_size() - sizeof(T));	// correcting header size due to grid id

													// grid parameters //
			int nx, ny, nz, gcx, gcy, gcz;
			id.grid_dim(1, &nx, &gcx);
			id.grid_dim(2, &ny, &gcy);
			id.grid_dim(3, &nz, &gcz);

			// grid coordinates //
			allocate(cx, cy, cz, nx, ny, nz);
			allocate(ex, ey, ez, nx, ny, nz);

#ifndef _USE_DEPRECATED_WST_FORMAT
			status = mpi_fread_sp(ptr, (*cx), nx, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += nx;
			status = mpi_fread_sp(ptr, (*cy), ny, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += ny;
#endif
			status = mpi_fread_sp(ptr, (*cz), nz, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += nz;
#ifndef _USE_DEPRECATED_WST_FORMAT
			status = mpi_fread_sp(ptr, (*ex), nx, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += nx;
			status = mpi_fread_sp(ptr, (*ey), ny, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += ny;
#endif
			status = mpi_fread_sp(ptr, (*ez), nz, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += nz;
#ifndef _USE_DEPRECATED_WST_FORMAT
			header_offset += 2 * (nx + ny + nz) * id.data_type_size();
#else
			header_offset += 2 * nz * id.data_type_size();
#endif

			// time stamp //
			status = mpi_fread_sp(ptr, time, 1, id.data_type_size());
			if (status == MPI_SUCCESS) nstatus += 1;
			header_offset += id.data_type_size();

			// field definition //
			int field_type;
			status = MPI_File_read(ptr, &field_type, 1, MPI_INT, MPI_STATUS_IGNORE);
			if (status == MPI_SUCCESS) nstatus += 1;
			if (field_type == 1)	// vector field //
			{
				status = MPI_File_read(ptr, name_length, 3, MPI_INT, MPI_STATUS_IGNORE);
				if (status == MPI_SUCCESS) nstatus += 3;
				header_offset += sizeof(char) *
					(name_length[0] + name_length[1] + name_length[2]);

				(*uname) = new char[name_length[0] + 1];
				(*vname) = new char[name_length[1] + 1];
				(*wname) = new char[name_length[2] + 1];
				status = MPI_File_read(ptr, (*uname), name_length[0], MPI_CHAR, MPI_STATUS_IGNORE);
				if (status == MPI_SUCCESS) nstatus += name_length[0];
				status = MPI_File_read(ptr, (*vname), name_length[1], MPI_CHAR, MPI_STATUS_IGNORE);
				if (status == MPI_SUCCESS) nstatus += name_length[1];
				status = MPI_File_read(ptr, (*wname), name_length[2], MPI_CHAR, MPI_STATUS_IGNORE);
				if (status == MPI_SUCCESS) nstatus += name_length[2];
				(*uname)[name_length[0]] = '\0';
				(*vname)[name_length[1]] = '\0';
				(*wname)[name_length[2]] = '\0';

				status_id = 1;
			}
			else
			{
				deallocate((*cx), (*cy), (*cz));
				deallocate((*ex), (*ey), (*ez));
			}
		}
	}
	mpi_broadcast(&status_id, 1, header_rank, comm);

	if (!status_id) {	// bad id or wrong field type: abort on all ranks //
		MPI_File_close(&ptr);
		return 0;
	}
	// read status - OK - //
	id.mpi_broadcast(header_rank, comm);

	mpi_broadcast(&nstatus, 1, header_rank, comm);
	mpi_broadcast(name_length, 3, header_rank, comm);

	// correct header size //
	mpi_broadcast(&header_offset, 1, header_rank, comm);
	header_size += (MPI_Offset)header_offset;

	// main data description //
	int nx, ny, nz, gcx, gcy, gcz;
	id.grid_dim(1, &nx, &gcx);
	id.grid_dim(2, &ny, &gcy);
	id.grid_dim(3, &nz, &gcz);

	int pnx = par_local_size_comm(nx, gcx, comm_x);
	int pny = par_local_size_comm(ny, gcy, comm_y);
	int pnz = par_local_size_comm(nz, gcz, comm_z);

	// NOTE(review): if data_type_size() matched neither float nor double the
	// views below would stay uninitialized -- id.check(3) presumably rules
	// that out; confirm against GridId //
	MPI_Datatype file_view, local_view;
	if (id.data_type_size() == sizeof(float)) {	// input=[float]
		mpi_io_read_datatype< float >(&file_view, &local_view,
			nx, ny, nz, pnx, pny, pnz, gcx, gcy, gcz,
			comm_x, comm_y, comm_z);

		MPI_File_set_view(ptr, header_size, mpi_type< float >(),
			file_view, (char*)mpi_datarep, MPI_INFO_NULL);
	}
	if (id.data_type_size() == sizeof(double)) {	// input=[double]
		mpi_io_read_datatype< double >(&file_view, &local_view,
			nx, ny, nz, pnx, pny, pnz, gcx, gcy, gcz,
			comm_x, comm_y, comm_z);

		MPI_File_set_view(ptr, header_size, mpi_type< double >(),
			file_view, (char*)mpi_datarep, MPI_INFO_NULL);
	}

	// main data: u, v, w read back-to-back through the same view //
	allocate(uout, vout, wout, pnx * pny * pnz);
	status = mpi_fread_all_sp(ptr, (*uout), pnx * pny * pnz,
		local_view, id.data_type_size());
	if (status == MPI_SUCCESS) nstatus += nx * ny * nz;
	status = mpi_fread_all_sp(ptr, (*vout), pnx * pny * pnz,
		local_view, id.data_type_size());
	if (status == MPI_SUCCESS) nstatus += nx * ny * nz;
	status = mpi_fread_all_sp(ptr, (*wout), pnx * pny * pnz,
		local_view, id.data_type_size());
	if (status == MPI_SUCCESS) nstatus += nx * ny * nz;

	MPI_File_close(&ptr);
	MPI_Type_free(&file_view);
	MPI_Type_free(&local_view);

	id.reset_data_type_size();	// re-setting data type

	const int nstatus_check =
		GridId<T>::hsize_r3d + GridId<T>::dsize_r3d + GridId<T>::gsize_r3d +
		3 * nx * ny * nz + name_length[0] + name_length[1] + name_length[2] + 5 +
#ifndef _USE_DEPRECATED_WST_FORMAT
		2 * (nx + ny + nz);
#else
		2 * nz;
#endif
	if (nstatus == nstatus_check) return 1;

	// failure: free header-rank allocations (only that rank made them) //
	if (rank == header_rank) {
		deallocate((*cx), (*cy), (*cz));
		deallocate((*ex), (*ey), (*ez));
		delete[](*uname); delete[](*vname); delete[](*wname);
	}
	deallocate((*uout), (*vout), (*wout));
	return 0;
}
+// -------------------------------------------------------------------------------------------- //
diff --git a/io-misc.h b/io-misc.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bf23bf6b26ae8534abc14c6a16e6097be4af829
--- /dev/null
+++ b/io-misc.h
@@ -0,0 +1,190 @@
+#pragma once
+
+// [io-misc.h]: I/O: misc. helpers (format strings, typed fread / MPI-read wrappers)
+//
+// -------------------------------------------------------------------------------------------- //
+
+#define _CRT_SECURE_NO_DEPRECATE
+#include <stdio.h>
+
+#include "nse-alloc.h"
+
+namespace nse
+{
+	// scanf/printf format string for a single value of type T
+	template< typename T >
+	const char* c_io_fmt();
+
+	// fread wrappers with on-the-fly type conversion:
+	// <T, Tin> -- read [size] values stored on file as [Tin] into out[] of type T;
+	// <T> with FILE_dtype_size -- dispatch on the on-file element size
+	// (sizeof(float) or sizeof(double)); return value follows fread (elements read)
+	template< typename T, typename Tin >
+	int fread_sp(T* out, const int size, FILE* ptr);
+	template< typename T >
+	int fread_sp(T* out, 
+		const int FILE_dtype_size, const int size, FILE* ptr);
+
+
+	// MPI_File_read wrappers with the same conversion scheme;
+	// return value is the MPI status code
+	template< typename T, typename Tin >
+	int mpi_fread_sp(MPI_File ptr, T* out, const int size, MPI_Status* mpi_status);
+	template< typename T >
+	int mpi_fread_sp(MPI_File ptr, T* out, const int size, 
+		const int FILE_dtype_size, MPI_Status* mpi_status);
+
+	// convenience overloads: MPI_STATUS_IGNORE
+	template< typename T, typename Tin >
+	int mpi_fread_sp(MPI_File ptr, T* out, const int size);
+	template< typename T >
+	int mpi_fread_sp(MPI_File ptr, T* out, const int size,
+		const int FILE_dtype_size);
+
+
+	// collective MPI_File_read_all wrappers: read one element of [dview]
+	// (which must unpack to [esize] scalar values) and convert to T
+	template< typename T, typename Tin >
+	int mpi_fread_all_sp(MPI_File ptr, T* out, const int esize,
+		MPI_Datatype dview,
+		MPI_Status* mpi_status);
+	template< typename T >
+	int mpi_fread_all_sp(MPI_File ptr, T* out, const int esize,
+		MPI_Datatype dview,
+		const int FILE_dtype_size, MPI_Status* mpi_status);
+
+	// convenience overloads: MPI_STATUS_IGNORE
+	template< typename T, typename Tin >
+	int mpi_fread_all_sp(MPI_File ptr, T* out, const int esize,
+		MPI_Datatype dview);
+	template< typename T >
+	int mpi_fread_all_sp(MPI_File ptr, T* out, const int esize,
+		MPI_Datatype dview,
+		const int FILE_dtype_size);
+}
+
+namespace nse
+{
+	// Format specifiers used by text-mode I/O for each supported scalar type.
+	template< > inline const char* c_io_fmt< int >() { return " %i "; }
+	template< > inline const char* c_io_fmt< float >() { return " %f "; }
+	template< > inline const char* c_io_fmt< double >() { return " %lf "; }
+	// fix: "%lf" designates [double]; long double requires the 'L' length
+	// modifier in both the printf and scanf families ("%lf" with a
+	// long double* argument is undefined behavior in scanf)
+	template< > inline const char* c_io_fmt< long double >() { return " %Lf "; }
+}
+
+
+template< typename T, typename Tin >
+int nse::fread_sp(T* out, const int size, FILE* ptr)
+{
+	// Read [size] values stored on file as [Tin], converting each to T.
+	// Returns the number of elements successfully read (fread semantics).
+	Tin* buf;
+	int buf_id = memStx::get_buf(&buf, size);
+	if (buf_id < 0) return 0;	// fix: scratch pool exhausted -- report "nothing
+								// read" instead of using an unset pointer
+
+	// fread returns size_t; counts here are bounded by [size], so the
+	// narrowing cast is safe
+	int status = (int)fread(buf, sizeof(Tin), size, ptr);
+
+	// convert only the elements actually read (status <= size on short reads)
+	for (int k = 0; k < status; k++)
+		out[k] = (T)buf[k];
+
+	memStx::free_buf(buf_id);
+	return status;
+}
+
+template< typename T >
+int nse::fread_sp(T* out, const int FILE_dtype_size,
+	const int size, FILE* ptr)
+{
+	// Dispatch on the on-file element size:
+	//  - matching sizeof(T): raw read, no conversion pass;
+	//  - sizeof(float) / sizeof(double): read-and-convert path;
+	//  - anything else: unsupported, report zero elements read.
+	if (sizeof(T) == FILE_dtype_size)
+		return fread(out, sizeof(T), size, ptr);
+	else if (FILE_dtype_size == sizeof(float))		// input=[float]
+		return fread_sp<T, float>(out, size, ptr);
+	else if (FILE_dtype_size == sizeof(double))		// input=[double]
+		return fread_sp<T, double>(out, size, ptr);
+	else
+		return 0;
+}
+
+template< typename T, typename Tin >
+int nse::mpi_fread_sp(MPI_File ptr, T* out, const int size,
+	MPI_Status* mpi_status)
+{
+	// Read [size] values stored on file as [Tin], converting each to T.
+	// Returns the MPI_File_read status code.
+	Tin* buf;
+	int buf_id = memStx::get_buf(&buf, size);
+	if (buf_id < 0) return MPI_ERR_BUFFER;	// fix: scratch pool exhausted --
+											// fail instead of reading through
+											// an unset pointer
+
+	int status = MPI_File_read(ptr, buf, size, mpi_type<Tin>(), mpi_status);
+	if (status == MPI_SUCCESS) {
+		// conversion assumes the full [size] elements were delivered
+		for (int k = 0; k < size; k++)
+			out[k] = (T)buf[k];
+	}
+
+	memStx::free_buf(buf_id);
+	return status;
+}
+
+template< typename T >
+int nse::mpi_fread_sp(MPI_File ptr, T* out, const int size,
+	const int FILE_dtype_size, MPI_Status* mpi_status)
+{
+	// Fast path: on-file element size matches T -- read directly, no conversion.
+	if (sizeof(T) == FILE_dtype_size)
+		return MPI_File_read(ptr, out, size, mpi_type<T>(), mpi_status);
+	// Conversion paths, keyed by the on-file element size.
+	else if (FILE_dtype_size == sizeof(float))		// input=[float]
+		return mpi_fread_sp<T, float>(ptr, out, size, mpi_status);
+	else if (FILE_dtype_size == sizeof(double))		// input=[double]
+		return mpi_fread_sp<T, double>(ptr, out, size, mpi_status);
+	else
+		return MPI_ERR_TYPE;	// unsupported on-file element size
+}
+
+template< typename T, typename Tin >
+int nse::mpi_fread_sp(MPI_File ptr, T* out, const int size)
+{
+	// convenience overload: caller does not need the MPI status object
+	return mpi_fread_sp<T, Tin>(ptr, out, size, MPI_STATUS_IGNORE);
+}
+
+template< typename T >
+int nse::mpi_fread_sp(MPI_File ptr, T* out, const int size,
+	const int FILE_dtype_size)
+{
+	// convenience overload: caller does not need the MPI status object
+	return mpi_fread_sp<T>(ptr, out, size, FILE_dtype_size, MPI_STATUS_IGNORE);
+}
+
+template< typename T, typename Tin >
+int nse::mpi_fread_all_sp(MPI_File ptr, T* out, const int esize,
+	MPI_Datatype dview,
+	MPI_Status* mpi_status)
+{
+	// Collective read of one [dview] element -- which must unpack to [esize]
+	// values of Tin -- converted element-wise to T.
+	Tin* buf;
+	int buf_id = memStx::get_buf(&buf, esize);
+	if (buf_id < 0) return MPI_ERR_BUFFER;	// fix: scratch pool exhausted --
+											// fail instead of reading through
+											// an unset pointer.
+											// NOTE(review): read_all is
+											// collective; a rank erroring here
+											// desynchronizes the collective --
+											// pool exhaustion is a program bug
+
+	int status = MPI_File_read_all(ptr, buf, 1, dview, mpi_status);
+	if (status == MPI_SUCCESS) {
+		for (int k = 0; k < esize; k++)
+			out[k] = (T)buf[k];
+	}
+
+	memStx::free_buf(buf_id);
+	return status;
+}
+
+template< typename T >
+int nse::mpi_fread_all_sp(MPI_File ptr, T* out, const int esize,
+	MPI_Datatype dview,
+	const int FILE_dtype_size, MPI_Status* mpi_status)
+{
+	// Fast path: on-file element size matches T -- collective read, no conversion.
+	if (sizeof(T) == FILE_dtype_size)
+		return MPI_File_read_all(ptr, out, 1, dview, mpi_status);
+	// Conversion paths, keyed by the on-file element size.
+	else if (FILE_dtype_size == sizeof(float))		// input=[float]
+		return mpi_fread_all_sp<T, float>(ptr, out, esize, dview, mpi_status);
+	else if (FILE_dtype_size == sizeof(double))		// input=[double]
+		return mpi_fread_all_sp<T, double>(ptr, out, esize, dview, mpi_status);
+	else
+		return MPI_ERR_TYPE;	// unsupported on-file element size
+}
+
+template< typename T, typename Tin >
+int nse::mpi_fread_all_sp(MPI_File ptr, T* out, const int esize,
+	MPI_Datatype dview)
+{
+	// convenience overload: caller does not need the MPI status object
+	return mpi_fread_all_sp<T, Tin>(ptr, out, esize, dview, MPI_STATUS_IGNORE);
+}
+
+template< typename T >
+int nse::mpi_fread_all_sp(MPI_File ptr, T* out, const int esize,
+	MPI_Datatype dview,
+	const int FILE_dtype_size)
+{
+	// convenience overload: caller does not need the MPI status object
+	return mpi_fread_all_sp<T>(ptr, out, esize, dview,
+		FILE_dtype_size, MPI_STATUS_IGNORE);
+}
diff --git a/license.txt b/license.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c422fd36c2f2929ba4b9d3f6327c66fb695e3b06
--- /dev/null
+++ b/license.txt
@@ -0,0 +1,23 @@
+Terms of use and licence agreement
+------------------------------------------------------------------------------
+
+The nse-library is defined as a collection of the following source code libraries: nselibx-common, nselibx-unigrid, nselibx-stgrid, nselibx-wstgrid, visualRTL, nse-matlab, nse-pseq, nse-pbin.
+
+The use of nse-library code, or any part of it, in source or binary form, with or without modifications, requires the direct permission of the code authors and copyright holders for each project/collaboration. The definition of a project/collaboration is provided by the code authors individually and on a case-by-case basis.
+
+The distribution, rental, lease, lending, or any commercial use of nse-library code, or any part of it, in source or binary form, with or without modifications, requires the written permission of the code authors and copyright holders.
+
+In case of violation of the terms of use, the rights to use the software or any data obtained by use of nse-library source code may be revoked.
+
+Unauthorized use, copying of software is expressly forbidden. 
+
+The same conditions and terms of use are implied for any source code or binary forms dependent in any way on nse-library source code (including, but not limited to: nse-couette-dns project).
+
+In case all appropriate permissions on the use of the source code are obtained, this software is provided by the authors, copyright holders and contributors "as is" and any express or implied warranties, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose are disclaimed. In no event shall the code authors, copyright owners or contributors be liable for any direct, indirect, incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of substitute goods or services; loss of use, data, or profits; or business interruption) however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use of this software, even if advised of the possibility of such damage.
+------------------------------------------------------------------------------
+
+
+Copyright (c) 2011-2021, Evgeny Mortikov
+All rights reserved.
+Contact e-mail: 
+evgeny.mortikov@gmail.com
diff --git a/main.cpp b/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f4969ccb9ea761854685e13c59aae895877552dd
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,142 @@
+#include "model-defines.h"
+#include "model-obj.h"
+
+// [main.cpp]:
+//
+// -------------------------------------------------------------------------------------------- //
+
+void display_help()
+{
+	// Command-line usage summary: option lines followed by their
+	// tab-indented descriptions, emitted verbatim in order.
+	const char* const help_lines[] = {
+		"\n >> Printing command line keys:\n",
+		" -uinit\n",
+		"\t init model with flow fields in startup directory\n",
+		" -udump [N]\n",
+		"\t start model from dump with index [N]\n",
+		" -ltime [N]\n",
+		"\t specify time limit in [N] minutes\n",
+		" -------------------------------------------------------------\n"
+	};
+
+	const int nlines = (int)(sizeof(help_lines) / sizeof(help_lines[0]));
+	for (int i = 0; i < nlines; i++)
+		fputs(help_lines[i], stdout);
+}
+
+int main(int argc, char** argv)
+{
+	// Driver: bring up MPI + buffer pool, configure the model, run the
+	// time-stepping loop, then tear everything down in reverse order.
+	// NOTE(review): every exit path returns 0, including failed setup/init --
+	// the process exit code never signals an error; confirm this is intended.
+	modelObj<Real, memRUN> model;
+
+	// Init parallel
+	// -------------------------------------------------------------------------------------------- //
+	memStx::init();
+	if (!modelObj<Real, memRUN>::init_parallel(
+		argc, argv, MPI_THREAD_FUNNELED))
+	{
+		// MPI never came up: only the static buffer pool needs teardown
+		memStx::finalize();
+		return 0;
+	}
+
+	int mpi_rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+	if (mpi_rank == 0)
+		printf(" >> MPI-Init: OK!\n");
+	// -------------------------------------------------------------------------------------------- //
+
+	// Display help
+	// -------------------------------------------------------------------------------------------- //
+	if ((argc == 2) && (!strcmp(argv[1], "--help"))) 
+	{
+		if (mpi_rank == 0) display_help();	// usage printed once, on root rank
+		
+		modelObj<Real, memRUN>::finalize_parallel();
+		memStx::finalize();
+		return 0;
+	}
+	// -------------------------------------------------------------------------------------------- //
+
+	if (mpi_rank == 0)
+		printf("\n >> Model setup ...\n");
+	// -------------------------------------------------------------------------------------------- //
+	// read run configuration; USE_CONFIG (if defined) injects an extra
+	// compile-time argument into setup()
+#ifdef USE_CONFIG
+	if (!model.setup(argc, argv, USE_CONFIG, MPI_COMM_WORLD))
+#else
+	if (!model.setup(argc, argv, MPI_COMM_WORLD))
+#endif
+	{
+		modelObj<Real, memRUN>::finalize_parallel();
+		memStx::finalize();
+		return 0;
+	}
+
+	if (mpi_rank == 0)
+		printf("\n >> Initial conditions ...\n");
+	// -------------------------------------------------------------------------------------------- //
+	if (!model.init_flow()) {
+		modelObj<Real, memRUN>::finalize_parallel();
+		memStx::finalize();
+		return 0;
+	}
+
+	// Processing print dump mode ...
+	// -------------------------------------------------------------------------------------------- //
+	// special mode: convert a restart dump to output files and exit
+	if (model.is_print_dump_mode()) {
+		model.print_dump_to_output();
+
+		model.clear();
+		modelObj<Real, memRUN>::finalize_parallel();
+		memStx::finalize();
+		return 0;
+	}
+
+	if (mpi_rank == 0)
+		printf(" \n >> Setting equations ...\n");
+	// -------------------------------------------------------------------------------------------- //
+	if (!model.init_eq()) {
+		modelObj<Real, memRUN>::finalize_parallel();
+		memStx::finalize();
+		return 0;
+	}
+
+#ifdef _USE_DEPRECATED_WST_FORMAT
+	// Re-writing [WST] format dump:
+	// -------------------------------------------------------------------------------------------- //
+	model.rewrite_restart_dump();
+#endif
+
+	// ------------------------------------------------------------------ //
+	if (mpi_rank == 0)
+		printf("\n >> Running model ...\n\n");
+
+	// main time loop; [status] records whether the loop exited cleanly or
+	// broke out of a failed sub-step, and is reported via model.complete()
+	bool status = true;
+	while (model.is_active())
+	{
+		// per-step order: particles -> (heat) -> momentum -> clock advance
+		// ------------------------------------------------- //
+#ifdef INCLUDE_PARTICLES
+		if (!model.advance_particles()) { status = false; break; }
+#endif
+#ifdef INCLUDE_PARTICLES_TRACKING
+		if (!model.advance_track_particles()) { status = false; break; }
+#endif
+
+#ifdef STRATIFICATION   // Note!: Explicit coupling (using U(n) not U(n+1))
+		if (!model.advance_heat_eq()) { status = false; break; }
+#endif
+		if (!model.advance_nse_eq()) { status = false; break; }
+
+		if (!model.advance_time()) { status = false; break; }
+		// ------------------------------------------------- //
+
+
+		// ------------------------------------------------- //
+		// write output &  dump
+		// ------------------------------------------------- //
+		model.advance_output();
+		model.advance_dump();
+#ifdef INCLUDE_VISUALRTL
+		model.advance_visualization();
+#endif
+		// ------------------------------------------------- //
+	}
+	// ------------------------------------------------------------------ //
+	model.complete(status);
+
+	// teardown mirrors startup: model state, then MPI, then buffer pool
+	model.clear();
+	modelObj<Real, memRUN>::finalize_parallel();
+	memStx::finalize();
+	return 0;
+}
diff --git a/makefile b/makefile
new file mode 100644
index 0000000000000000000000000000000000000000..19551ec371a0990c653fd41cda64a57dc6c36afd
--- /dev/null
+++ b/makefile
@@ -0,0 +1,373 @@
+# NSE      -- name of the produced executable
+# MACHINE  -- target machine profile ("?=" allows: make MACHINE=...)
+# COMPILER -- toolchain family      ("?=" allows: make COMPILER=...)
+NSE = nsenx
+MACHINE ?= local
+COMPILER ?= gnu
+
+
+# Machine values:
+#  chebyshev, lomonosov, lomonosov_2
+#  inmregular, inm6core, inmavx, inmavx2
+#  mvs10p, mvs1p5, mvs10q
+#  taito_snb, taito_hsw, sisu, 
+#  puhti, puhti_avx512, puhti_cascade
+#  theta
+#  xeonphi
+#  local (default)
+# guard: abort early with a clear message on an unknown MACHINE value
+ifeq (,$(filter $(MACHINE),chebyshev lomonosov lomonosov_2 \
+	inmregular inm6core inmavx inmavx2 \
+	mvs10p mvs1p5 mvs10q \
+	taito_snb taito_hsw sisu puhti puhti_avx512 puhti_cascade \
+	theta \
+	xeonphi \
+        kunpeng\
+	local))
+
+  $(error incorrect MACHINE value = $(MACHINE))
+endif
+# ---------------------------------------------------------------
+
+# Compiler values:
+#  gnu
+#  cray
+#  intel, intel_15 (default), intel_16, intel_17, intel_18, intel_19
+# guard: abort early with a clear message on an unknown COMPILER value
+ifeq (,$(filter $(COMPILER),gnu cray intel intel_15 intel_16 intel_17 intel_18 intel_19))
+  
+  $(error incorrect COMPILER value = $(COMPILER))
+endif
+
+# ---------------------------------------------------------------
+
+# Assuming:
+#  intel compiler >= 13.0.0 (default template argument support)
+#  gnu compiler checked >= 4.4.6
+#  cray compiler checked = 8.5.6
+# ---------------------------------------------------------------
+
+
+# Preprocessing >
+# ---------------------------------------------------------------
+IS_INTEL_COMPILER = $(if $(filter $(COMPILER),intel intel_15 intel_16 intel_17 intel_18 intel_19),true,false)
+IS_INTEL_COMPILER_GE_15 = $(if $(filter $(COMPILER),intel_15 intel_16 intel_17 intel_18 intel_19),true,false)
+IS_INTEL_COMPILER_GE_16 = $(if $(filter $(COMPILER),intel_16 intel_17 intel_18 intel_19),true,false)
+
+IS_LOCAL_ARCH = $(if $(filter $(MACHINE),local),true,false)
+IS_SSE41_ARCH = $(if $(filter $(MACHINE),chebyshev),true,false)
+IS_SSE42_ARCH = $(if $(filter $(MACHINE),inm6core lomonosov),true,false)
+IS_AVX_ARCH = $(if $(filter $(MACHINE),inmavx mvs10p taito_snb),true,false)
+IS_AVX2_ARCH = $(if $(filter $(MACHINE),inmavx2 mvs1p5 mvs10q lomonosov_2 taito_hsw sisu puhti),true,false)
+IS_AVX512_ARCH = $(if $(filter $(MACHINE),puhti_avx512),true,false)
+IS_CASCADE_LAKE_ARCH = $(if $(filter $(MACHINE),puhti_cascade),true,false)
+IS_MIC_ARCH = $(if $(filter $(MACHINE),xeonphi),true,false)
+IS_KNL_ARCH = $(if $(filter $(MACHINE),theta),true,false)
+IS_KUNPENG_ARCH = $(if $(filter $(MACHINE),kunpeng),true,false)
+# ---------------------------------------------------------------
+
+# define $(CC) >
+# ---------------------------------------------------------------
+ifeq ($(IS_INTEL_COMPILER),true)
+  CC = mpiicpc
+  ifneq (,$(filter $(MACHINE),chebyshev lomonosov lomonosov_2 puhti puhti_avx512 puhti_cascade))
+    CC = mpicxx
+  endif
+  ifeq ($(MACHINE),sisu)
+    CC = CC
+  endif
+  ifeq ($(MACHINE),theta)
+    CC = CC
+  endif
+endif
+ifeq ($(COMPILER),gnu)
+  CC = mpicxx
+  ifeq ($(MACHINE),theta)
+    CC = CC
+  endif
+endif
+ifeq ($(COMPILER),cray)
+  CC = CC
+endif
+# ---------------------------------------------------------------
+
+# define OpenMP key $(CC_OPENMP) >
+# ---------------------------------------------------------------
+ifeq ($(IS_INTEL_COMPILER),true)
+  CC_OPENMP = -openmp
+  ifeq ($(IS_INTEL_COMPILER_GE_16),true)
+    CC_OPENMP = -qopenmp
+  endif
+endif
+ifeq ($(COMPILER),gnu)
+  CC_OPENMP = -fopenmp
+endif
+ifeq ($(IS_INTEL_COMPILER),true)
+  CC_OPENMP_LINK = -openmp
+  ifeq ($(IS_INTEL_COMPILER_GE_16),true)
+    CC_OPENMP_LINK = -qopenmp
+  endif
+endif
+ifeq ($(COMPILER),gnu)
+  CC_OPENMP_LINK = -fopenmp
+endif
+# ---------------------------------------------------------------
+
+# define optimization keys >
+# ---------------------------------------------------------------
+# per-compiler optimization flags; Intel >= 16 uses the -qopt-* spelling
+ifeq ($(IS_INTEL_COMPILER),true)
+  CC_OPTIMIZE = -O3 -restrict -alias-const -parallel -funroll-loops
+  ifeq ($(IS_INTEL_COMPILER_GE_15),true)
+    CC_OPTIMIZE += -no-ansi-alias
+  endif
+
+  ifeq ($(IS_MIC_ARCH),true)
+    ifeq ($(IS_INTEL_COMPILER_GE_16),true)
+      CC_OPTIMIZE += -qopt-streaming-cache-evict=1 -qopt-streaming-stores always
+    else
+      CC_OPTIMIZE += -opt-streaming-cache-evict=1 -opt-streaming-stores always
+    endif
+  endif
+
+  ifeq ($(IS_KNL_ARCH),true)
+    ifeq ($(IS_INTEL_COMPILER_GE_16),true)
+      CC_OPTIMIZE += -qopt-streaming-stores always
+    else
+      CC_OPTIMIZE += -opt-streaming-stores always
+    endif
+  endif
+endif
+ifeq ($(COMPILER),gnu)
+  CC_OPTIMIZE = -O3 -fno-strict-aliasing -funroll-loops
+endif
+ifeq ($(COMPILER),cray)
+  CC_OPTIMIZE = -O3
+endif
+
+CC_LINK_OPTIMIZE = -O3
+# ---------------------------------------------------------------
+
+# define architecture keys >
+# ---------------------------------------------------------------
+# ISA/vectorization flags per machine profile; CC_LINK_ARCH is only needed
+# where the linker must also target the accelerator (MIC/KNL)
+CC_ARCH = 
+CC_LINK_ARCH = 
+
+ifeq ($(IS_SSE41_ARCH),true)
+  CC_ARCH = -msse4.1
+  ifeq ($(IS_INTEL_COMPILER),true)
+    CC_ARCH += -xSSE4.1
+  endif
+endif
+ifeq ($(IS_SSE42_ARCH),true)
+  CC_ARCH = -msse4.2
+  ifeq ($(IS_INTEL_COMPILER),true)
+    CC_ARCH += -xSSE4.2
+  endif
+endif
+ifeq ($(IS_AVX_ARCH),true)
+  CC_ARCH = -mavx
+  ifeq ($(IS_INTEL_COMPILER),true)
+    CC_ARCH += -xavx
+  endif
+endif
+ifeq ($(IS_AVX2_ARCH),true)
+  ifeq ($(IS_INTEL_COMPILER),true)
+    CC_ARCH = -march=core-avx2 -xcore-avx2
+  endif
+  ifeq ($(COMPILER),gnu)
+    CC_ARCH = -mavx2
+  endif
+endif
+ifeq ($(IS_AVX512_ARCH),true)
+  ifeq ($(IS_INTEL_COMPILER),true)
+    CC_ARCH = -xcore-avx512
+  endif
+  ifeq ($(COMPILER),gnu)
+    CC_ARCH = -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl
+  endif
+endif
+ifeq ($(IS_CASCADE_LAKE_ARCH),true)
+  ifeq ($(IS_INTEL_COMPILER),true)
+    CC_ARCH = -march=cascadelake -xcascadelake
+  endif
+  ifeq ($(COMPILER),gnu)
+    CC_ARCH = -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl
+  endif
+endif
+ifeq ($(IS_MIC_ARCH),true)
+  CC_ARCH = -mmic
+  CC_LINK_ARCH = -mmic
+endif
+ifeq ($(IS_KNL_ARCH),true)
+  CC_ARCH = -xmic-avx512
+  CC_LINK_ARCH = -xmic-avx512
+endif
+
+ifeq ($(IS_KUNPENG_ARCH),true)
+  ifeq ($(IS_INTEL_COMPILER),true)
+    CC_ARCH = -xhost
+  endif
+  ifeq ($(COMPILER),gnu)
+    CC_ARCH = -march=armv8.2-a+rdma+lse -fomit-frame-pointer
+  endif
+endif
+
+ifeq ($(IS_LOCAL_ARCH),true)
+  ifeq ($(IS_INTEL_COMPILER),true)
+    CC_ARCH = -xhost
+  endif
+  ifeq ($(COMPILER),gnu)
+    CC_ARCH = -march=native
+  endif
+endif
+# ---------------------------------------------------------------
+
+# define MPI options >
+# ---------------------------------------------------------------
+CC_MPI_OPTIONS =
+
+ifeq ($(IS_MIC_ARCH),true)
+  CC_MPI_OPTIONS += -mt_mpi -static_mpi
+endif
+# ---------------------------------------------------------------
+
+# define linking libraries >
+# ---------------------------------------------------------------
+ifeq ($(IS_INTEL_COMPILER),true)
+  CC_LINK_LIB = -limf
+endif
+# ---------------------------------------------------------------
+
+# define float precision model >
+# ---------------------------------------------------------------
+CC_FP_MODEL =
+# ---------------------------------------------------------------
+
+# define misc options
+# ---------------------------------------------------------------
+# to print all defined macros use:
+# -E -dM - < /dev/null
+#
+CC_MISC = 
+ifeq ($(COMPILER), gnu)
+  CC_MISC += -std=c++0x
+endif
+# ---------------------------------------------------------------
+
+
+# Define full compile and link command lines
+# ---------------------------------------------------------------
+
+CC_COMPILE = $(CC) -c $(CC_MPI_OPTIONS) $(CC_OPENMP) $(CC_OPTIMIZE) $(CC_ARCH) $(CC_FP_MODEL) $(CC_MISC)
+CC_LINK = $(CC) $(CC_LINK_ARCH) $(CC_MPI_OPTIONS) $(CC_OPENMP_LINK) $(CC_LINK_OPTIMIZE) $(CC_LINK_LIB)
+
+
+# object groups: base utilities, NSE solver, Poisson solvers, particles, model
+OBJ_BASE = mem-stx.o str-com.o mtrand.o mpi-com3d.o time-series.o time-slice.o time-slice3d.o cfg-var.o config-parser.o
+OBJ_NSE = nse-io3d.o nse-generic3d.o nse-fops3d-x2.o nse-fops3d-x4.o nse-bc3d.o nse3d.o nse3d-x4.o
+OBJ_POIS = pois-bc3d.o pois-base3d.o pois-base3d-x4.o mg-data3d.o pois-gs3d.o pois-sor3d.o pois-mg-base3d.o pois-mg3d.o pois3d.o pois3d-x4.o
+OBJ_PTCL = ptcl-vec3d.o ptcl-track-vec3d.o traj3d.o traj-file-handle.o traj-accum3d.o
+
+OBJ_MODEL = stats-data.o model-obj.o main.o 
+
+OBJ = $(OBJ_BASE) $(OBJ_NSE) $(OBJ_POIS) $(OBJ_PTCL) $(OBJ_MODEL)
+
+
+# default target: link the executable from all object files
+$(NSE): $(OBJ)
+	$(CC_LINK) $(OBJ) -o $(NSE)
+
+# explicit per-object compile rules: each lists its .cpp plus the headers
+# whose changes must trigger a rebuild
+mem-stx.o: mem-stx.cpp mem-stx.h
+	$(CC_COMPILE) mem-stx.cpp
+
+str-com.o: str-com.cpp str-com.h
+	$(CC_COMPILE) str-com.cpp
+
+mtrand.o: mtrand.cpp mtrand.h
+	$(CC_COMPILE) mtrand.cpp
+
+mpi-com3d.o: mpi-com3d.cpp mpi-com3d.h mpi-com.h
+	$(CC_COMPILE) mpi-com3d.cpp
+
+time-series.o: time-series.cpp time-series.h
+	$(CC_COMPILE) time-series.cpp
+
+time-slice.o: time-slice.cpp time-slice.h
+	$(CC_COMPILE) time-slice.cpp
+
+time-slice3d.o: time-slice3d.cpp time-slice3d.h
+	$(CC_COMPILE) time-slice3d.cpp
+
+cfg-var.o: cfg-var.cpp cfg-var.h
+	$(CC_COMPILE) cfg-var.cpp
+
+config-parser.o: config-parser.cpp config-parser.h
+	$(CC_COMPILE) config-parser.cpp
+
+nse-io3d.o: nse-io3d.cpp nse-io3d.h
+	$(CC_COMPILE) nse-io3d.cpp
+
+nse-generic3d.o: nse-generic3d.cpp nse-generic3d.h
+	$(CC_COMPILE) nse-generic3d.cpp
+
+nse-fops3d-x2.o: nse-fops3d-x2.cpp nse-fops3d-x2.h
+	$(CC_COMPILE) nse-fops3d-x2.cpp
+
+nse-fops3d-x4.o: nse-fops3d-x4.cpp nse-fops3d-x4.h
+	$(CC_COMPILE) nse-fops3d-x4.cpp
+
+ptcl-vec3d.o: ptcl-vec3d.cpp ptcl-vec3d.h
+	$(CC_COMPILE) ptcl-vec3d.cpp
+
+ptcl-track-vec3d.o: ptcl-track-vec3d.cpp ptcl-track-vec3d.h
+	$(CC_COMPILE) ptcl-track-vec3d.cpp
+
+traj3d.o: traj3d.cpp traj3d.h
+	$(CC_COMPILE) traj3d.cpp
+
+traj-file-handle.o: traj-file-handle.cpp traj-file-handle.h
+	$(CC_COMPILE) traj-file-handle.cpp
+
+traj-accum3d.o: traj-accum3d.cpp traj-accum3d.h
+	$(CC_COMPILE) traj-accum3d.cpp
+
+nse-bc3d.o: nse-bc3d.cpp nse-bc3d.h
+	$(CC_COMPILE) nse-bc3d.cpp
+
+nse3d.o: nse3d.cpp nse3d.h
+	$(CC_COMPILE) nse3d.cpp
+
+nse3d-x4.o: nse3d-x4.cpp nse3d-x4.h
+	$(CC_COMPILE) nse3d-x4.cpp
+
+pois-bc3d.o: pois-bc3d.cpp pois-bc3d.h
+	$(CC_COMPILE) pois-bc3d.cpp
+
+pois-base3d.o: pois-base3d.cpp pois-base3d.h
+	$(CC_COMPILE) pois-base3d.cpp
+
+pois-base3d-x4.o: pois-base3d-x4.cpp pois-base3d-x4.h
+	$(CC_COMPILE) pois-base3d-x4.cpp
+
+mg-data3d.o: mg-data3d.cpp mg-data3d.h
+	$(CC_COMPILE) mg-data3d.cpp
+
+pois-gs3d.o: pois-gs3d.cpp pois-gs3d.h
+	$(CC_COMPILE) pois-gs3d.cpp
+
+pois-sor3d.o: pois-sor3d.cpp pois-sor3d.h
+	$(CC_COMPILE) pois-sor3d.cpp
+
+pois-mg-base3d.o: pois-mg-base3d.cpp pois-mg-base3d.h
+	$(CC_COMPILE) pois-mg-base3d.cpp
+
+pois-mg3d.o: pois-mg3d.cpp pois-mg3d.h
+	$(CC_COMPILE) pois-mg3d.cpp
+
+pois3d.o: pois3d.cpp pois3d.h
+	$(CC_COMPILE) pois3d.cpp
+
+pois3d-x4.o: pois3d-x4.cpp pois3d-x4.h
+	$(CC_COMPILE) pois3d-x4.cpp
+
+stats-data.o: stats-data.cpp stats-data.h
+	$(CC_COMPILE) stats-data.cpp
+
+# fix: model-obj.cpp itself was missing from the prerequisite list, so editing
+# the translation unit never triggered a rebuild of model-obj.o
+model-obj.o: model-obj.cpp model-obj.h model-eq.hpp model-init.hpp model-out.hpp model-setup.hpp model-stats.hpp model-user.hpp
+	$(CC_COMPILE) model-obj.cpp
+
+# main.o depends on the model interface headers it includes
+main.o: main.cpp model-bc.h model-const.h model-default.h model-defines.h model-obj.h
+	$(CC_COMPILE) main.cpp
diff --git a/mem-stx.cpp b/mem-stx.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3bcab1dfe28048b88643406b22b04501b80d4c0a
--- /dev/null
+++ b/mem-stx.cpp
@@ -0,0 +1,94 @@
+#include "mem-stx.h"
+#include "nse-alloc.h"
+
+#ifndef EXCLUDE_GPU_BRANCH
+#include "cuda-stx.cuh"
+#endif
+
+// Out-of-line definitions of memStx static storage; buf[] has static storage
+// duration and is therefore zero-initialized (all slots start as null).
+void* nse::memStx::buf[];
+
+int nse::memStx::buf_size[] = { 0 };	// allocated bytes per slot (0 = none)
+int nse::memStx::buf_status[] = { 0 };	// 1 = slot handed out, 0 = free
+// --------------------------------------------------------------------------
+
+
+// --------------------------------------------------------------------------
+void nse::memStx::init()
+{
+	// mark every scratch slot as empty and unclaimed
+	for (int idx = 0; idx < nbuf; idx++) {
+		buf_size[idx] = 0;
+		buf_status[idx] = 0;
+	}
+
+#ifndef EXCLUDE_GPU_BRANCH
+	nse_gpu::cudaStx::init();	// bring up the GPU-side pool as well
+#endif
+}
+
+void nse::memStx::finalize()
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	nse_gpu::cudaStx::finalize();	// GPU-side pool goes down first
+#endif
+
+	// release every host-side scratch slot
+	for (int idx = 0; idx < nbuf; idx++)
+		deallocate_buf(idx);
+}
+// --------------------------------------------------------------------------
+
+// --------------------------------------------------------------------------
+template< typename T >
+int nse::memStx::get_buf(T** buf_ptr, const int size)
+{
+	// Claim a free scratch slot, grow it to hold [size] elements of T and
+	// return its id (used to release the slot via free_buf()).
+	// Returns -1 -- and nulls (*buf_ptr) -- when all [nbuf] slots are busy.
+	int k = 0;
+	while (k != nbuf) {
+		if (buf_status[k] == 0) break;
+		k++;
+	}
+	if (k == nbuf) {
+		(*buf_ptr) = (T*)0;	// fix: don't leave the out-pointer uninitialized
+		return -1;	// error //
+	}
+
+	// NOTE(review): size * sizeof(T) is narrowed to allocate_buf's int
+	// parameter -- may overflow for very large requests; confirm callers
+	allocate_buf(k, size * sizeof(T));
+	buf_status[k] = 1;
+
+	(*buf_ptr) = (T*)buf[k];
+	return k;
+}
+
+void nse::memStx::free_buf(const int id)
+{
+	// Release a slot previously claimed by get_buf(); the allocation itself
+	// is kept and reused by later requests.
+	// fix: ignore invalid ids -- in particular id = -1 from a failed get_buf
+	// call, which previously wrote out of bounds into buf_status[]
+	if ((id < 0) || (id >= nbuf)) return;
+
+	buf_status[id] = 0;
+}
+// --------------------------------------------------------------------------
+
+// --------------------------------------------------------------------------
+void nse::memStx::allocate_buf(const int buf_id, const int nbytes)
+{
+	// grow-only policy: an existing allocation that is already large enough
+	// is kept and reused as-is
+	if (nbytes <= buf_size[buf_id]) return;
+
+	deallocate_buf(buf_id);
+	buf_size[buf_id] = nbytes;
+	allocate_void(&buf[buf_id], buf_size[buf_id]);
+}
+
+void nse::memStx::deallocate_buf(const int buf_id)
+{
+	// no-op for slots that never received an allocation
+	if (buf_size[buf_id] <= 0) return;
+
+	deallocate_void(buf[buf_id]);
+	buf_size[buf_id] = 0;
+	buf_status[buf_id] = 0;
+}
+// --------------------------------------------------------------------------
+
+
+
+// Explicit template instantiations: memStx::get_buf for the element types
+// used by the I/O conversion helpers
+// --------------------------------------------------------------------------
+template int nse::memStx::get_buf(char**, const int);
+template int nse::memStx::get_buf(int**, const int);
+template int nse::memStx::get_buf(float**, const int);
+template int nse::memStx::get_buf(double**, const int);
+// --------------------------------------------------------------------------
diff --git a/mem-stx.h b/mem-stx.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab77ffa0b410a3e55e92d2a140bb277d0fd346fc
--- /dev/null
+++ b/mem-stx.h
@@ -0,0 +1,41 @@
+#pragma once
+
+// [mem-stx.h]: static memory buffers
+//
+// -------------------------------------------------------------------------------------------- //
+
+// memory storage class
+// --------------------------------------------------------------------------
+namespace nse
+{
+	// memStx: a small static pool of reusable scratch buffers.
+	// get_buf() hands out a slot (allocating or growing it as needed),
+	// free_buf() returns the slot to the pool; the underlying memory is kept
+	// and reused until finalize().
+	// NOTE(review): no internal synchronization is visible here -- concurrent
+	// get_buf()/free_buf() calls would race on buf_status[]; confirm callers
+	// are single-threaded or serialize access externally.
+	class memStx {
+	public:
+
+		// Init-Finalize Buffer Memory Usage
+		// --------------------------------------------------------------------------
+		static void init();
+		static void finalize();
+		// --------------------------------------------------------------------------
+
+		// Buffer Handlers
+		// --------------------------------------------------------------------------
+		// claim a slot sized for [size] elements of T; returns slot id,
+		// or -1 when all slots are busy
+		template< typename T >
+		static int get_buf(T** buf_ptr, const int size);
+
+		// release a slot by id (memory retained for reuse)
+		static void free_buf(const int id);
+		// --------------------------------------------------------------------------
+
+	private:
+		static const int nbuf = 32;	// number of scratch slots
+
+		static void* buf[nbuf];			// slot storage
+		static int buf_size[nbuf];		// allocated bytes per slot
+		static int buf_status[nbuf];	// busy flag per slot
+
+		static void allocate_buf(const int buf_id, const int nbytes);
+		static void deallocate_buf(const int buf_id);
+
+		memStx() {}	// static-only class: not instantiable
+	};
+}
+// ----------------------------------------------------------------------------------------- //
diff --git a/mg-data3d.cpp b/mg-data3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..877edb2eafbe91e701b823bcc40a795ac61adeda
--- /dev/null
+++ b/mg-data3d.cpp
@@ -0,0 +1,733 @@
+#include "mg-data3d.h"
+#include "pois-base3d.h"
+
+using namespace nse::nse_const3d;
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+
+// init(): build the multigrid hierarchy for the (single-communicator) solver.
+//   - grid [0] (fine) aliases the step arrays of the input grid; x[0]/rhs[0]
+//     stay NULL and are supplied by the mg solver;
+//   - each coarser grid i halves the interior size along the axes selected
+//     by define_mg_coarsening() and allocates its own -z step arrays,
+//     x/rhs fields and inverse-diagonal idg;
+//   - mg_memory_size accumulates the number of allocated T elements.
+template< typename T >
+void nse::mg_poisson3d_data< T > ::init(
+	const nse::wstGrid3d< T >& grid, const int _num_grids,
+	const int smooth_down_iters,
+	const int smooth_up_iters,
+	const int smooth_direct_iters,
+	const T smooth_up_omega,
+	const T smooth_up_omega_fine,
+	const poissonTypeBC pois_bc_type)
+{
+	const int fine = 0;
+	int mpi_pcz, mpi_ncz;	// ghost widths at the physical -z boundaries (0 on interior ranks)
+
+	clear();
+	num_grids = _num_grids;
+
+	// control parameters for definition of coarsening type //
+	T dz_min = grid.dz_min, dz_max = grid.dz_max;
+
+	x[fine] = NULL;       // defined in mg solver
+	rhs[fine] = NULL;     // defined in mg solver
+	allocate_vnull(&idg[fine], grid.nz);
+
+	// fine grid aliases the input grid sizes and step arrays //
+	nx[fine] = grid.nx;
+	ny[fine] = grid.ny;
+	nz[fine] = grid.nz;
+	gcx[fine] = grid.gcx;
+	gcy[fine] = grid.gcy;
+	gcz[fine] = grid.gcz;
+
+	dx[fine] = grid.dx;
+	dy[fine] = grid.dy;
+	dz[fine] = grid.dz;
+
+	dzi[fine] = grid.dzi;
+
+	dzp[fine] = grid.dzp; dzm[fine] = grid.dzm;
+	dzpi[fine] = grid.dzpi; dzmi[fine] = grid.dzmi;
+
+	dx2i[fine] = grid.dx2i;
+	dy2i[fine] = grid.dy2i;
+	dzp2i[fine] = grid.dzp2i; dzm2i[fine] = grid.dzm2i;
+
+	// fill idg with the inverse diagonal of the operator (per -z layer) //
+	poisson3d::set_diagonal_inverse(idg[fine],
+		nz[fine],
+
+		gcz[fine], nz[fine] - gcz[fine] - 1,
+
+		dx2i[fine], dy2i[fine],
+		dzp2i[fine], dzm2i[fine]);
+
+	grid.mpi_com.exchange_halo_z(idg[fine], 1, 1, nz[fine],
+		0, 0, gcz[fine], 0, 0, gcz[fine], 0);
+
+	sm_down_iters[fine] = smooth_down_iters;
+	sm_up_iters[fine] = smooth_up_iters;
+
+	// red-black coloring parity of the global cell offset //
+	int offset_x = grid.mpi_com.offset_x(nx[fine], gcx[fine]) + gcx[fine];
+	int offset_y = grid.mpi_com.offset_y(ny[fine], gcy[fine]) + gcy[fine];
+	int offset_z = grid.mpi_com.offset_z(nz[fine], gcz[fine]) + gcz[fine];
+	sm_color_shift[fine] = ((offset_x + offset_y + offset_z) & 1);
+
+	sm_up_omega[fine] = smooth_up_omega_fine;
+
+	coarse_type[fine] = axisXYZ;
+
+	// boundary type setup //
+	bc.init(pois_bc_type,
+		grid.mpi_com.rank_x, grid.mpi_com.rank_y, grid.mpi_com.rank_z,
+		grid.mpi_com.size_x, grid.mpi_com.size_y, grid.mpi_com.size_z);
+
+	mg_memory_size = grid.nz;
+	for (int i = fine + 1; i < num_grids; i++) {
+
+		gcx[i] = gcx[i - 1];
+		gcy[i] = gcy[i - 1];
+		gcz[i] = gcz[i - 1];
+
+		// interior-size parities: an odd interior keeps one extra cell on the coarse grid //
+		int shx = ((nx[i - 1] - (gcx[i - 1] << 1)) & 1);
+		int shy = ((ny[i - 1] - (gcy[i - 1] << 1)) & 1);
+		int shz = ((nz[i - 1] - (gcz[i - 1] << 1)) & 1);
+
+		// - define coarse type
+		coarse_type[i] = define_mg_coarsening(
+			dx[i - 1], dy[i - 1], dz_min, dz_max, (T)mg_coarse_aspect);
+
+		// -x coarsening //
+		if ((coarse_type[i] == axisXYZ) || (coarse_type[i] == axisXY) ||
+			(coarse_type[i] == axisXZ) || (coarse_type[i] == axisX))
+		{
+			nx[i] = ((nx[i - 1] - (gcx[i - 1] << 1)) >> 1) + (gcx[i] << 1) + shx;
+
+			dx[i] = (T) 2.0 * dx[i - 1];
+			dx2i[i] = (T) 0.25 * dx2i[i - 1];
+		}
+		else
+		{
+			nx[i] = (nx[i - 1] - (gcx[i - 1] << 1)) + (gcx[i] << 1);
+
+			dx[i] = dx[i - 1];
+			dx2i[i] = dx2i[i - 1];
+		}
+
+		// -y coarsening //
+		if ((coarse_type[i] == axisXYZ) || (coarse_type[i] == axisXY) ||
+			(coarse_type[i] == axisYZ) || (coarse_type[i] == axisY))
+		{
+			ny[i] = ((ny[i - 1] - (gcy[i - 1] << 1)) >> 1) + (gcy[i] << 1) + shy;
+
+			dy[i] = (T) 2.0 * dy[i - 1];
+			dy2i[i] = (T) 0.25 * dy2i[i - 1];
+		}
+		else
+		{
+			ny[i] = (ny[i - 1] - (gcy[i - 1] << 1)) + (gcy[i] << 1);
+
+			dy[i] = dy[i - 1];
+			dy2i[i] = dy2i[i - 1];
+		}
+
+		// -z coarsening //
+		if ((coarse_type[i] == axisXYZ) || (coarse_type[i] == axisXZ) ||
+			(coarse_type[i] == axisYZ) || (coarse_type[i] == axisZ))
+		{
+			nz[i] = ((nz[i - 1] - (gcz[i - 1] << 1)) >> 1) + (gcz[i] << 1) + shz;
+
+			dz_min *= (T) 2.0; dz_max *= (T) 2.0;
+		}
+		else
+		{
+			nz[i] = (nz[i - 1] - (gcz[i - 1] << 1)) + (gcz[i] << 1);
+		}
+
+		// coarsest grid uses the "direct" iteration count //
+		if (i < num_grids - 1) {
+			sm_down_iters[i] = smooth_down_iters;
+			sm_up_iters[i] = smooth_up_iters;
+		}
+		else
+		{
+			sm_down_iters[i] = smooth_direct_iters;
+			sm_up_iters[i] = smooth_direct_iters;	// UNDEF - not used //
+		}
+
+		sm_up_omega[i] = smooth_up_omega;
+
+		allocate_vnull(&dz[i], &dzi[i], nz[i]);
+		allocate_vnull(&dzp[i], &dzm[i], nz[i]);
+		allocate_vnull(&dzpi[i], &dzmi[i], nz[i]);
+		allocate_vnull(&dzp2i[i], &dzm2i[i], nz[i]);
+
+		mg_memory_size += 8 * nz[i];
+
+		// define -z steps //
+		if ((coarse_type[i] == axisXYZ) || (coarse_type[i] == axisYZ) ||
+			(coarse_type[i] == axisXZ) || (coarse_type[i] == axisZ))
+		{
+			// ghost widths at the physical -z domain boundaries //
+			mpi_pcz = (grid.mpi_com.rank_z == 0) ? gcz[i] : 0;
+			mpi_ncz = (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1) ? gcz[i] : 0;
+
+			// coarse step = sum of the two underlying fine steps //
+			for (int k = gcz[i]; k < nz[i] - gcz[i]; k++)
+				dz[i][k] = dz[i - 1][2 * (k - gcz[i]) + gcz[i - 1]] +
+				dz[i - 1][2 * (k - gcz[i]) + gcz[i - 1] + 1];
+
+			// fill physical ghost layers by copying the nearest value //
+			grid.mpi_com.exchange_halo_z(dz[i], 1, 1, nz[i], 0, 0, gcz[i], 0, 0, gcz[i], 0);
+			for (int k = mpi_pcz - 1; k >= 0; k--)
+				dz[i][k] = dz[i][k + 1];
+			for (int k = nz[i] - mpi_ncz; k < nz[i]; k++)
+				dz[i][k] = dz[i][k - 1];
+
+			// dzp/dzm: sums of adjacent steps (forward/backward) //
+			for (int k = 0; k < nz[i] - 1; k++)
+				dzp[i][k] = dz[i][k] + dz[i][k + 1];
+			for (int k = 1; k < nz[i]; k++)
+				dzm[i][k] = dz[i][k] + dz[i][k - 1];
+
+			grid.mpi_com.exchange_halo_z(dzp[i], 1, 1, nz[i], 0, 0, gcz[i], 0, 0, gcz[i], 0);
+			grid.mpi_com.exchange_halo_z(dzm[i], 1, 1, nz[i], 0, 0, gcz[i], 0, 0, gcz[i], 0);
+			if (mpi_ncz)
+				dzp[i][nz[i] - 1] = (T) 2.0 * dz[i][nz[i] - 1];
+			if (mpi_pcz)
+				dzm[i][0] = (T) 2.0 * dz[i][0];
+
+			for (int k = 0; k < nz[i]; k++) {
+				dzi[i][k] = (T) 1.0 / dz[i][k];
+
+				dzpi[i][k] = (T) 1.0 / dzp[i][k];
+				dzmi[i][k] = (T) 1.0 / dzm[i][k];
+
+				dzp2i[i][k] = (T) 2.0 * dzpi[i][k] * dzi[i][k];
+				dzm2i[i][k] = (T) 2.0 * dzmi[i][k] * dzi[i][k];
+			}
+		}
+		// no -z coarsening on this level: copy -z steps from the finer grid //
+		if ((coarse_type[i] == axisXY) ||
+			(coarse_type[i] == axisX) || (coarse_type[i] == axisY))
+		{
+			memcpy(dz[i], dz[i - 1], nz[i] * sizeof(T));
+			memcpy(dzp[i], dzp[i - 1], nz[i] * sizeof(T));
+			memcpy(dzm[i], dzm[i - 1], nz[i] * sizeof(T));
+
+			memcpy(dzi[i], dzi[i - 1], nz[i] * sizeof(T));
+			memcpy(dzpi[i], dzpi[i - 1], nz[i] * sizeof(T));
+			memcpy(dzmi[i], dzmi[i - 1], nz[i] * sizeof(T));
+			memcpy(dzp2i[i], dzp2i[i - 1], nz[i] * sizeof(T));
+			memcpy(dzm2i[i], dzm2i[i - 1], nz[i] * sizeof(T));
+		}
+		// ---------------------------------------------------------------------- //
+
+		// red-black coloring parity of the global cell offset //
+		int offset_x = grid.mpi_com.offset_x(nx[i], gcx[i]) + gcx[i];
+		int offset_y = grid.mpi_com.offset_y(ny[i], gcy[i]) + gcy[i];
+		int offset_z = grid.mpi_com.offset_z(nz[i], gcz[i]) + gcz[i];
+		sm_color_shift[i] = ((offset_x + offset_y + offset_z) & 1);
+
+		allocate_vnull(&x[i], &rhs[i], nx[i] * ny[i] * nz[i]);
+		allocate_vnull(&idg[i], nz[i]);
+
+		poisson3d::set_diagonal_inverse(idg[i],
+			nz[i],
+
+			gcz[i], nz[i] - gcz[i] - 1,
+
+			dx2i[i], dy2i[i],
+			dzp2i[i], dzm2i[i]);
+
+		grid.mpi_com.exchange_halo_z(idg[i], 1, 1, nz[i],
+			0, 0, gcz[i], 0, 0, gcz[i], 0);
+
+		mg_memory_size += 2 * nx[i] * ny[i] * nz[i] + nz[i];
+	}
+}
+
+// clear(): release all multigrid storage and reset counters.
+// On the fine grid only idg is owned (x/rhs/step arrays alias
+// solver/grid data and are not deallocated here).
+template< typename T >
+void nse::mg_poisson3d_data< T > ::clear()
+{
+#ifdef MEASURE_MG_RUN_TIME
+	for (int i = 0; i < mg_max_grids; i++) {	// null timers //
+		run_time[i] = (double)0;
+		smooth_time[i] = (double)0;
+		restrict_time[i] = (double)0;
+		prolongate_time[i] = (double)0;
+	}
+#endif
+
+	const int fine = 0;
+	if ((num_grids == 0) || (mg_memory_size == 0)) return;	// nothing allocated //
+
+	deallocate(idg[fine]);
+
+	for (int i = fine + 1; i < num_grids; i++) {
+		deallocate(dz[i], dzi[i]);
+		deallocate(dzp[i], dzm[i]);
+		deallocate(dzpi[i], dzmi[i]);
+		deallocate(dzp2i[i], dzm2i[i]);
+
+		deallocate(x[i], rhs[i]);
+		deallocate(idg[i]);
+	}
+
+	num_grids = 0;
+	mg_memory_size = 0;
+}
+// ------------------------------------------------------------------------ //
+
+// init(): build the multigrid hierarchy for the MPI (divide-and-conquer)
+// solver. In addition to the serial version:
+//   - mpi_run[i] flags whether this rank still smooths on grid i;
+//   - once the per-rank interior drops below mg_mpi_min_proc_size bytes,
+//     the grid is gathered onto every 2nd rank in each direction
+//     (mpi_level doubles; mpi_combine marks the gathering level);
+//   - local_n* holds the pre-gather size, mpi_n* the post-gather size
+//     (they coincide on levels without gathering).
+template< typename T >
+void nse::mg_mpi_poisson3d_data< T > ::init(
+	const nse::wstGrid3d< T >& grid, const int _num_grids,
+	const int smooth_down_iters,
+	const int smooth_up_iters,
+	const int smooth_direct_iters,
+	const T smooth_up_omega,
+	const T smooth_up_omega_fine,
+	const poissonTypeBC pois_bc_type)
+{
+	const int fine = 0;
+	int mpi_pcz, mpi_ncz;	// ghost widths at the physical -z boundaries (0 on interior ranks)
+
+	clear();
+	num_grids = _num_grids;
+
+	// control parameters for definition of coarsening type //
+	T dz_min = grid.dz_min, dz_max = grid.dz_max;
+
+	x[fine] = NULL;       // defined in mg solver
+	rhs[fine] = NULL;     // defined in mg solver
+	allocate_vnull(&idg[fine], grid.nz);
+
+	// fine grid: every rank runs, no gathering yet //
+	mpi_run[fine] = 1;
+	mpi_level[fine] = 1;
+	mpi_combine[fine] = 0;
+
+	mpi_com[fine].copy(grid.mpi_com);
+
+	local_nx[fine] = grid.nx;
+	local_ny[fine] = grid.ny;
+	local_nz[fine] = grid.nz;
+	mpi_nx[fine] = grid.nx;
+	mpi_ny[fine] = grid.ny;
+	mpi_nz[fine] = grid.nz;
+
+	gcx[fine] = grid.gcx;
+	gcy[fine] = grid.gcy;
+	gcz[fine] = grid.gcz;
+
+	dx[fine] = grid.dx;
+	dy[fine] = grid.dy;
+	dz[fine] = grid.dz;
+
+	dzi[fine] = grid.dzi;
+
+	dzp[fine] = grid.dzp; dzm[fine] = grid.dzm;
+	dzpi[fine] = grid.dzpi; dzmi[fine] = grid.dzmi;
+
+	dx2i[fine] = grid.dx2i;
+	dy2i[fine] = grid.dy2i;
+	dzp2i[fine] = grid.dzp2i; dzm2i[fine] = grid.dzm2i;
+
+	// fill idg with the inverse diagonal of the operator (per -z layer) //
+	poisson3d::set_diagonal_inverse(idg[fine],
+		mpi_nz[fine],
+
+		gcz[fine], mpi_nz[fine] - gcz[fine] - 1,
+
+		dx2i[fine], dy2i[fine],
+		dzp2i[fine], dzm2i[fine]);
+
+	mpi_com[fine].exchange_halo_z(idg[fine], 1, 1, mpi_nz[fine],
+		0, 0, gcz[fine], 0, 0, gcz[fine], 0);
+
+	sm_down_iters[fine] = smooth_down_iters;
+	sm_up_iters[fine] = smooth_up_iters;
+
+
+	// red-black coloring parity of the global cell offset //
+	int offset_x = mpi_com[fine].offset_x(mpi_nx[fine], gcx[fine]) + gcx[fine];
+	int offset_y = mpi_com[fine].offset_y(mpi_ny[fine], gcy[fine]) + gcy[fine];
+	int offset_z = mpi_com[fine].offset_z(mpi_nz[fine], gcz[fine]) + gcz[fine];
+	sm_color_shift[fine] = ((offset_x + offset_y + offset_z) & 1);
+
+	sm_up_omega[fine] = smooth_up_omega_fine;
+
+	coarse_type[fine] = axisXYZ;
+
+	// boundary type setup on fine grid //
+	bc[fine].init(pois_bc_type,
+		mpi_com[fine].rank_x, mpi_com[fine].rank_y, mpi_com[fine].rank_z,
+		mpi_com[fine].size_x, mpi_com[fine].size_y, mpi_com[fine].size_z);
+
+
+	// grid-coefficients modified by omega value //
+	allocate_vnull(&idg_omega[fine], mpi_nz[fine]);
+	memcpy(idg_omega[fine], idg[fine], mpi_nz[fine] * sizeof(T));
+	mul(idg_omega[fine], sm_up_omega[fine], mpi_nz[fine]);
+
+
+	mg_memory_size = 2 * grid.nz;	// idg + idg_omega on the fine grid //
+	for (int i = fine + 1; i < num_grids; i++) {
+
+		mpi_run[i] = mpi_run[i - 1];
+		mpi_level[i] = mpi_level[i - 1];
+		mpi_combine[i] = 0;
+
+		mpi_com[i].copy(mpi_com[i - 1]);
+
+		gcx[i] = gcx[i - 1];
+		gcy[i] = gcy[i - 1];
+		gcz[i] = gcz[i - 1];
+
+		// defaults (no coarsening) - overwritten below on running ranks //
+		local_nx[i] = (mpi_nx[i - 1] - (gcx[i - 1] << 1)) + (gcx[i] << 1);
+		local_ny[i] = (mpi_ny[i - 1] - (gcy[i - 1] << 1)) + (gcy[i] << 1);
+		local_nz[i] = (mpi_nz[i - 1] - (gcz[i - 1] << 1)) + (gcz[i] << 1);
+
+		// interior-size parities: an odd interior keeps one extra cell on the coarse grid //
+		int shx = ((mpi_nx[i - 1] - (gcx[i - 1] << 1)) & 1);
+		int shy = ((mpi_ny[i - 1] - (gcy[i - 1] << 1)) & 1);
+		int shz = ((mpi_nz[i - 1] - (gcz[i - 1] << 1)) & 1);
+
+		coarse_type[i] = axisXYZ;
+
+		// - define coarse type
+		// - divide only on running processors
+		if (mpi_run[i]) {
+			coarse_type[i] = define_mg_coarsening(
+				dx[i - 1], dy[i - 1], dz_min, dz_max, (T)mg_coarse_aspect);
+
+			// -x coarsening //
+			if ((coarse_type[i] == axisXYZ) || (coarse_type[i] == axisXY) ||
+				(coarse_type[i] == axisXZ) || (coarse_type[i] == axisX))
+			{
+				local_nx[i] = ((mpi_nx[i - 1] - (gcx[i - 1] << 1)) >> 1) + (gcx[i] << 1) + shx;
+
+				dx[i] = (T) 2.0 * dx[i - 1];
+				dx2i[i] = (T) 0.25 * dx2i[i - 1];
+			}
+			else
+			{
+				local_nx[i] = (mpi_nx[i - 1] - (gcx[i - 1] << 1)) + (gcx[i] << 1);
+
+				dx[i] = dx[i - 1];
+				dx2i[i] = dx2i[i - 1];
+			}
+
+			// -y coarsening //
+			if ((coarse_type[i] == axisXYZ) || (coarse_type[i] == axisXY) ||
+				(coarse_type[i] == axisYZ) || (coarse_type[i] == axisY))
+			{
+				local_ny[i] = ((mpi_ny[i - 1] - (gcy[i - 1] << 1)) >> 1) + (gcy[i] << 1) + shy;
+
+				dy[i] = (T) 2.0 * dy[i - 1];
+				dy2i[i] = (T) 0.25 * dy2i[i - 1];
+			}
+			else
+			{
+				local_ny[i] = (mpi_ny[i - 1] - (gcy[i - 1] << 1)) + (gcy[i] << 1);
+
+				dy[i] = dy[i - 1];
+				dy2i[i] = dy2i[i - 1];
+			}
+
+			// -z coarsening //
+			if ((coarse_type[i] == axisXYZ) || (coarse_type[i] == axisXZ) ||
+				(coarse_type[i] == axisYZ) || (coarse_type[i] == axisZ))
+			{
+				local_nz[i] = ((mpi_nz[i - 1] - (gcz[i - 1] << 1)) >> 1) + (gcz[i] << 1) + shz;
+
+				dz_min *= (T) 2.0; dz_max *= (T) 2.0;
+			}
+			else
+			{
+				local_nz[i] = (mpi_nz[i - 1] - (gcz[i - 1] << 1)) + (gcz[i] << 1);
+			}
+
+			// coarsest grid uses the "direct" iteration count //
+			if (i < num_grids - 1) {
+				sm_down_iters[i] = smooth_down_iters;
+				sm_up_iters[i] = smooth_up_iters;
+			}
+			else
+			{
+				sm_down_iters[i] = smooth_direct_iters;
+				sm_up_iters[i] = smooth_direct_iters;	// UNDEF - not used //
+			}
+
+			sm_up_omega[i] = smooth_up_omega;
+		}
+
+		mpi_nx[i] = local_nx[i];
+		mpi_ny[i] = local_ny[i];
+		mpi_nz[i] = local_nz[i];
+
+		// get max task size for all processors //
+		// interior task size in bytes on this rank (0 if idle) //
+		int max_processor_size = mpi_run[i] * sizeof(T)*
+			((local_nx[i] - 2 * gcx[i]) * (local_ny[i] - 2 * gcy[i]) * (local_nz[i] - 2 * gcz[i]));
+		nse::mpi_allreduce(&max_processor_size, MPI_MAX, grid.mpi_com.comm);
+
+		// if task size small enough: begin divide-and-conquer //
+		if ((max_processor_size <= mg_mpi_min_proc_size) &&
+			(mpi_run[i]) && (mpi_com[i].size > 1))
+		{
+			mpi_level[i] *= 2;
+			mpi_combine[i] = mpi_level[i];
+
+			// this rank keeps running only at every mpi_level-th position in x, y and z //
+			mpi_run[i] = (((grid.mpi_com.rank_x % mpi_level[i]) == 0) &&
+				((grid.mpi_com.rank_y % mpi_level[i]) == 0) &&
+				((grid.mpi_com.rank_z % mpi_level[i]) == 0));
+
+			mpi_com[i].split_comm(mpi_com[i - 1], 2, 2, 2);
+
+			int rank_sh = (mpi_level[i] >> 1);
+			if (mpi_run[i]) {
+				// get size addition from adjacent processors
+				// (donor ranks at distance rank_sh in +x, +y, +z of the fine communicator)
+
+				int acx = 0, acy = 0, acz = 0;
+				if (grid.mpi_com.rank_x + rank_sh < grid.mpi_com.size_x)
+					MPI_Recv(&acx, 1, MPI_INT, grid.mpi_com.rank + rank_sh,
+						0, grid.mpi_com.comm, MPI_STATUS_IGNORE);
+
+				if (grid.mpi_com.rank_y + rank_sh < grid.mpi_com.size_y)
+					MPI_Recv(&acy, 1, MPI_INT, grid.mpi_com.rank + rank_sh * grid.mpi_com.size_x,
+						0, grid.mpi_com.comm, MPI_STATUS_IGNORE);
+
+				if (grid.mpi_com.rank_z + rank_sh < grid.mpi_com.size_z)
+					MPI_Recv(&acz, 1, MPI_INT, grid.mpi_com.rank + rank_sh * grid.mpi_com.size_x * grid.mpi_com.size_y,
+						0, grid.mpi_com.comm, MPI_STATUS_IGNORE);
+
+				mpi_nx[i] += acx;
+				mpi_ny[i] += acy;
+				mpi_nz[i] += acz;
+			}
+			else
+			{
+				// donor rank: send interior sizes to the gathering rank; this rank goes idle //
+				int acx = local_nx[i] - 2 * gcx[i];
+				int acy = local_ny[i] - 2 * gcy[i];
+				int acz = local_nz[i] - 2 * gcz[i];
+
+				if ((grid.mpi_com.rank_x - rank_sh >= 0) &&
+					((grid.mpi_com.rank_x % mpi_level[i]) == rank_sh) &&
+					((grid.mpi_com.rank_y % mpi_level[i]) == 0) &&
+					((grid.mpi_com.rank_z % mpi_level[i]) == 0))
+				{
+					// send acx //
+					MPI_Send(&acx, 1, MPI_INT, grid.mpi_com.rank - rank_sh,
+						0, grid.mpi_com.comm);
+				}
+				if ((grid.mpi_com.rank_y - rank_sh >= 0) &&
+					((grid.mpi_com.rank_x % mpi_level[i]) == 0) &&
+					((grid.mpi_com.rank_y % mpi_level[i]) == rank_sh) &&
+					((grid.mpi_com.rank_z % mpi_level[i]) == 0))
+				{
+					// send acy //
+					MPI_Send(&acy, 1, MPI_INT, grid.mpi_com.rank - rank_sh * grid.mpi_com.size_x,
+						0, grid.mpi_com.comm);
+				}
+				if ((grid.mpi_com.rank_z - rank_sh >= 0) &&
+					((grid.mpi_com.rank_x % mpi_level[i]) == 0) &&
+					((grid.mpi_com.rank_y % mpi_level[i]) == 0) &&
+					((grid.mpi_com.rank_z % mpi_level[i]) == rank_sh))
+				{
+					// send acz //
+					MPI_Send(&acz, 1, MPI_INT, grid.mpi_com.rank - rank_sh * grid.mpi_com.size_x * grid.mpi_com.size_y,
+						0, grid.mpi_com.comm);
+				}
+
+				mpi_nx[i] -= acx;
+				mpi_ny[i] -= acy;
+				mpi_nz[i] -= acz;
+			}
+		}
+
+		// running ranks allocate post-gather (mpi_n*) sized fields //
+		if (mpi_run[i]) {
+
+			allocate_vnull(&dz[i], &dzi[i], mpi_nz[i]);
+			allocate_vnull(&dzp[i], &dzm[i], mpi_nz[i]);
+			allocate_vnull(&dzpi[i], &dzmi[i], mpi_nz[i]);
+			allocate_vnull(&dzp2i[i], &dzm2i[i], mpi_nz[i]);
+
+			allocate_vnull(&x[i], &rhs[i], mpi_nx[i] * mpi_ny[i] * mpi_nz[i]);
+			allocate_vnull(&idg[i], mpi_nz[i]);
+
+			mg_memory_size += 2 * mpi_nx[i] * mpi_ny[i] * mpi_nz[i] +
+				9 * mpi_nz[i];
+
+			// grid-coefficients modified by omega value //
+			allocate_vnull(&idg_omega[i], mpi_nz[i]);
+
+			mg_memory_size += mpi_nz[i];
+
+			// red-black coloring parity of the global cell offset //
+			int offset_x = mpi_com[i].offset_x(mpi_nx[i], gcx[i]) + gcx[i];
+			int offset_y = mpi_com[i].offset_y(mpi_ny[i], gcy[i]) + gcy[i];
+			int offset_z = mpi_com[i].offset_z(mpi_nz[i], gcz[i]) + gcz[i];
+			sm_color_shift[i] = ((offset_x + offset_y + offset_z) & 1);
+		}
+		// idle ranks at a gathering level still allocate pre-gather (local_n*) //
+		// sized fields: they participate in the gather on mpi_com[i - 1] //
+		if ((mpi_combine[i]) && (!mpi_run[i])) {
+
+			allocate_vnull(&dz[i], &dzi[i], local_nz[i]);
+			allocate_vnull(&dzp[i], &dzm[i], local_nz[i]);
+			allocate_vnull(&dzpi[i], &dzmi[i], local_nz[i]);
+			allocate_vnull(&dzp2i[i], &dzm2i[i], local_nz[i]);
+
+			allocate_vnull(&x[i], &rhs[i], local_nx[i] * local_ny[i] * local_nz[i]);
+			allocate_vnull(&idg[i], local_nz[i]);
+
+			mg_memory_size += 2 * local_nx[i] * local_ny[i] * local_nz[i] +
+				9 * local_nz[i];
+
+			// grid-coefficients modified by omega value //
+			allocate_vnull(&idg_omega[i], local_nz[i]);
+
+			mg_memory_size += local_nz[i];
+		}
+
+		if ((mpi_run[i]) || ((!mpi_run[i]) && (mpi_combine[i])))
+		{
+			// ghost widths at the physical -z domain boundaries //
+			mpi_pcz = (mpi_com[i - 1].rank_z == 0) ? gcz[i] : 0;
+			mpi_ncz = (mpi_com[i - 1].rank_z == mpi_com[i - 1].size_z - 1) ? gcz[i] : 0;
+
+			// define -z steps //
+			if ((coarse_type[i] == axisXYZ) || (coarse_type[i] == axisYZ) ||
+				(coarse_type[i] == axisXZ) || (coarse_type[i] == axisZ))
+			{
+
+				// coarse step = sum of the two underlying fine steps //
+				for (int k = gcz[i]; k < local_nz[i] - gcz[i]; k++)
+					dz[i][k] = dz[i - 1][2 * (k - gcz[i]) + gcz[i - 1]] +
+					dz[i - 1][2 * (k - gcz[i]) + gcz[i - 1] + 1];
+
+				// fill physical ghost layers by copying the nearest value //
+				mpi_com[i - 1].exchange_halo_z(dz[i], 1, 1, local_nz[i], 0, 0, gcz[i], 0, 0, gcz[i], 0);
+				for (int k = mpi_pcz - 1; k >= 0; k--)
+					dz[i][k] = dz[i][k + 1];
+				for (int k = local_nz[i] - mpi_ncz; k < local_nz[i]; k++)
+					dz[i][k] = dz[i][k - 1];
+
+				for (int k = 0; k < local_nz[i] - 1; k++)
+					dzp[i][k] = dz[i][k] + dz[i][k + 1];
+				for (int k = 1; k < local_nz[i]; k++)
+					dzm[i][k] = dz[i][k] + dz[i][k - 1];
+
+				mpi_com[i - 1].exchange_halo_z(dzp[i], 1, 1, local_nz[i], 0, 0, gcz[i], 0, 0, gcz[i], 0);
+				mpi_com[i - 1].exchange_halo_z(dzm[i], 1, 1, local_nz[i], 0, 0, gcz[i], 0, 0, gcz[i], 0);
+				if (mpi_ncz)
+					dzp[i][local_nz[i] - 1] = (T) 2.0 * dz[i][local_nz[i] - 1];
+				if (mpi_pcz)
+					dzm[i][0] = (T) 2.0 * dz[i][0];
+
+				for (int k = 0; k < local_nz[i]; k++) {
+					dzi[i][k] = (T) 1.0 / dz[i][k];
+
+					dzpi[i][k] = (T) 1.0 / dzp[i][k];
+					dzmi[i][k] = (T) 1.0 / dzm[i][k];
+
+					dzp2i[i][k] = (T) 2.0 * dzpi[i][k] * dzi[i][k];
+					dzm2i[i][k] = (T) 2.0 * dzmi[i][k] * dzi[i][k];
+				}
+			}
+			// no -z coarsening on this level: copy -z steps from the finer grid //
+			if ((coarse_type[i] == axisXY) ||
+				(coarse_type[i] == axisY) || (coarse_type[i] == axisX))
+			{
+				memcpy(dz[i], dz[i - 1], local_nz[i] * sizeof(T));
+				memcpy(dzp[i], dzp[i - 1], local_nz[i] * sizeof(T));
+				memcpy(dzm[i], dzm[i - 1], local_nz[i] * sizeof(T));
+
+				memcpy(dzi[i], dzi[i - 1], local_nz[i] * sizeof(T));
+				memcpy(dzpi[i], dzpi[i - 1], local_nz[i] * sizeof(T));
+				memcpy(dzmi[i], dzmi[i - 1], local_nz[i] * sizeof(T));
+				memcpy(dzp2i[i], dzp2i[i - 1], local_nz[i] * sizeof(T));
+				memcpy(dzm2i[i], dzm2i[i - 1], local_nz[i] * sizeof(T));
+			}
+
+			if (mpi_combine[i]) {
+				// use communicator on finer grid: mpi_com[ i - 1 ] //
+				mpi_com[i - 1].gather_subgrid_z(dz[i], dz[i],
+					mpi_nz[i], local_nz[i], gcz[i]);
+				mpi_com[i - 1].gather_subgrid_z(dzp[i], dzp[i],
+					mpi_nz[i], local_nz[i], gcz[i]);
+				mpi_com[i - 1].gather_subgrid_z(dzm[i], dzm[i],
+					mpi_nz[i], local_nz[i], gcz[i]);
+
+				mpi_com[i - 1].gather_subgrid_z(dzi[i], dzi[i],
+					mpi_nz[i], local_nz[i], gcz[i]);
+				mpi_com[i - 1].gather_subgrid_z(dzpi[i], dzpi[i],
+					mpi_nz[i], local_nz[i], gcz[i]);
+				mpi_com[i - 1].gather_subgrid_z(dzmi[i], dzmi[i],
+					mpi_nz[i], local_nz[i], gcz[i]);
+				mpi_com[i - 1].gather_subgrid_z(dzp2i[i], dzp2i[i],
+					mpi_nz[i], local_nz[i], gcz[i]);
+				mpi_com[i - 1].gather_subgrid_z(dzm2i[i], dzm2i[i],
+					mpi_nz[i], local_nz[i], gcz[i]);
+			}
+			// ---------------------------------------------------------------------- //
+		}
+
+		if (mpi_run[i]) {
+			// fill idg with the inverse diagonal of the operator (per -z layer) //
+			poisson3d::set_diagonal_inverse(idg[i],
+				mpi_nz[i],
+
+				gcz[i], mpi_nz[i] - gcz[i] - 1,
+
+				dx2i[i], dy2i[i],
+				dzp2i[i], dzm2i[i]);
+
+			mpi_com[i].exchange_halo_z(idg[i], 1, 1, mpi_nz[i],
+				0, 0, gcz[i], 0, 0, gcz[i], 0);
+
+
+			// grid-coefficients modified by omega value //
+			memcpy(idg_omega[i], idg[i], mpi_nz[i] * sizeof(T));
+			mul(idg_omega[i], sm_up_omega[i], mpi_nz[i]);
+		}
+
+		// boundary type setup //
+		bc[i].init(pois_bc_type,
+			mpi_com[i].rank_x, mpi_com[i].rank_y, mpi_com[i].rank_z,
+			mpi_com[i].size_x, mpi_com[i].size_y, mpi_com[i].size_z);
+	}
+}
+
+// clear(): release all MPI multigrid storage and communicators.
+// On the fine grid only idg and idg_omega are owned; coarser grids own
+// their fields only if the rank ran (mpi_run) or took part in a gather
+// (mpi_combine) - matching the allocation logic in init().
+template< typename T >
+void nse::mg_mpi_poisson3d_data< T > ::clear()
+{
+#ifdef MEASURE_MG_RUN_TIME
+	for (int i = 0; i < mg_max_grids; i++) {	// null timers //
+		run_time[i] = (double)0;
+		smooth_time[i] = (double)0;
+		restrict_time[i] = (double)0;
+		prolongate_time[i] = (double)0;
+	}
+#endif
+
+	const int fine = 0;
+	if ((num_grids == 0) || (mg_memory_size == 0)) return;	// nothing allocated //
+
+	deallocate(idg[fine]);
+
+	deallocate(idg_omega[fine]);
+
+	// free MPI communicators, excluding MPI_COMM_WORLD
+	mpi_com[fine].cleanup();
+	for (int i = fine + 1; i < num_grids; i++) {
+
+		mpi_com[i].cleanup();
+
+		if ((mpi_run[i]) ||
+			((mpi_combine[i]) && (!mpi_run[i])))
+		{
+			deallocate(dz[i], dzi[i]);
+			deallocate(dzp[i], dzm[i]);
+			deallocate(dzpi[i], dzmi[i]);
+			deallocate(dzp2i[i], dzm2i[i]);
+
+			deallocate(x[i], rhs[i]);
+			deallocate(idg[i]);
+
+			deallocate(idg_omega[i]);
+		}
+	}
+
+	num_grids = 0;
+	mg_memory_size = 0;
+}
+// ------------------------------------------------------------------------ //
+
+// Explicit template instantiation //
+// ------------------------------------------------------------------------ //
+
+// * multigrid data: float / double * //
+template struct nse::mg_poisson3d_data< float >;
+template struct nse::mg_poisson3d_data< double >;
+
+// * MPI multigrid data: float / double * //
+template struct nse::mg_mpi_poisson3d_data< float >;
+template struct nse::mg_mpi_poisson3d_data< double >;
+// ------------------------------------------------------------------------ //
diff --git a/mg-data3d.h b/mg-data3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac975530aa956d26932252671dc424f23bec5e17
--- /dev/null
+++ b/mg-data3d.h
@@ -0,0 +1,140 @@
+#pragma once
+
+// [mg-data3d.h]: 3D Multigrid data
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "wstgrid3d.h"
+#include "pois-def3d.h"
+#include "mg-def3d.h"
+
+
+namespace nse
+{
+	// Multigrid hierarchy data for the 3D Poisson solver (single communicator).
+	// Level 0 (fine) aliases arrays of the wstGrid3d passed to init();
+	// coarser levels own their arrays (released in clear()).
+	template< typename T >
+	struct mg_poisson3d_data {
+		static const int mg_max_grids = 16;
+
+		// solution, right-hand side and inverse operator diagonal per level
+		// (x[0]/rhs[0] are NULL: supplied by the mg solver)
+		T *x[mg_max_grids], *rhs[mg_max_grids],
+			*idg[mg_max_grids];
+
+		// uniform -x/-y steps and variable -z steps (per layer)
+		T dx[mg_max_grids], dy[mg_max_grids], *dz[mg_max_grids];
+		T *dzi[mg_max_grids];	// 1 / dz
+
+		// sums of adjacent -z steps: dzp[k] = dz[k] + dz[k+1], dzm[k] = dz[k] + dz[k-1]
+		T *dzp[mg_max_grids], *dzm[mg_max_grids];
+		T *dzpi[mg_max_grids], *dzmi[mg_max_grids];	// their inverses
+
+		// inverse-square step coefficients (scaled by 1/4 per coarsened level);
+		// dzp2i[k] = 2 / (dzp[k] * dz[k]), dzm2i[k] = 2 / (dzm[k] * dz[k])
+		T dx2i[mg_max_grids], dy2i[mg_max_grids];
+		T *dzp2i[mg_max_grids], *dzm2i[mg_max_grids];
+
+		int mg_memory_size;	// total number of allocated T elements (tallied in init)
+		int num_grids;		// number of levels in use
+
+		// axes coarsened at each level (set by define_mg_coarsening)
+		nse_const3d::axisType coarse_type[mg_max_grids];
+
+		int nx[mg_max_grids], ny[mg_max_grids], nz[mg_max_grids];
+		int gcx[mg_max_grids], gcy[mg_max_grids], gcz[mg_max_grids];
+
+		int sm_down_iters[mg_max_grids],
+			sm_up_iters[mg_max_grids];
+
+		// red-black coloring parity of the global cell offset per level
+		int sm_color_shift[mg_max_grids];
+
+		T sm_up_omega[mg_max_grids];	// smoother relaxation //
+										// V-down uses gauss-seidel //
+
+		poisson_dynamic_bc bc;		// boundary condition type and shifts
+
+#ifdef MEASURE_MG_RUN_TIME
+		double run_time[mg_max_grids];	// timing-per-grid //
+
+		double smooth_time[mg_max_grids];
+		double restrict_time[mg_max_grids];
+		double prolongate_time[mg_max_grids];
+#endif
+
+		mg_poisson3d_data() : mg_memory_size(0), num_grids(0) { }
+		~mg_poisson3d_data() {
+			clear();
+		}
+
+		void init(const wstGrid3d< T >& grid, const int _num_grids,
+			const int smooth_down_iters,
+			const int smooth_up_iters,
+			const int smooth_direct_iters,
+			const T smooth_up_omega,
+			const T smooth_up_omega_fine,
+			const poissonTypeBC pois_bc_type);
+		void clear();
+	};
+	// -------------------------------------------------------------------- //
+
+
+	// Multigrid hierarchy data for the MPI (divide-and-conquer) 3D Poisson
+	// solver. Mirrors mg_poisson3d_data plus per-level communicators and
+	// the gathering state (mpi_run / mpi_combine / mpi_level).
+	template< typename T >
+	struct mg_mpi_poisson3d_data {
+		static const int mg_max_grids = 16;
+		static const int mg_mpi_min_proc_size = 1024;      // mpi minimum size for gathering in bytes //
+
+		nse::mpiCom3d mpi_com[mg_max_grids];	// MPI communicators //
+
+		// solution, right-hand side and inverse operator diagonal per level
+		// (x[0]/rhs[0] are NULL: supplied by the mg solver)
+		T *x[mg_max_grids], *rhs[mg_max_grids],
+			*idg[mg_max_grids];
+
+		// uniform -x/-y steps and variable -z steps (per layer)
+		T dx[mg_max_grids], dy[mg_max_grids], *dz[mg_max_grids];
+		T *dzi[mg_max_grids];	// 1 / dz
+
+		// sums of adjacent -z steps and their inverses
+		T *dzp[mg_max_grids], *dzm[mg_max_grids];
+		T *dzpi[mg_max_grids], *dzmi[mg_max_grids];
+
+		// inverse-square step coefficients (see init() in mg-data3d.cpp)
+		T dx2i[mg_max_grids], dy2i[mg_max_grids];
+		T *dzp2i[mg_max_grids], *dzm2i[mg_max_grids];
+
+		T *idg_omega[mg_max_grids];	// idg scaled by sm_up_omega per level
+
+		int mg_memory_size;	// total number of allocated T elements (tallied in init)
+		int num_grids;		// number of levels in use
+
+		// axes coarsened at each level (set by define_mg_coarsening)
+		nse_const3d::axisType coarse_type[mg_max_grids];
+
+		// local_n*: pre-gather size on this rank; mpi_n*: post-gather size
+		// (equal on levels without gathering)
+		int local_nx[mg_max_grids], local_ny[mg_max_grids], local_nz[mg_max_grids];
+		int mpi_nx[mg_max_grids], mpi_ny[mg_max_grids], mpi_nz[mg_max_grids];
+		int gcx[mg_max_grids], gcy[mg_max_grids], gcz[mg_max_grids];
+
+		int sm_down_iters[mg_max_grids],
+			sm_up_iters[mg_max_grids];
+
+		// red-black coloring parity of the global cell offset per level
+		int sm_color_shift[mg_max_grids];
+
+		T sm_up_omega[mg_max_grids];	// smoother relaxation //
+										// V-down uses gauss-seidel //
+
+		int mpi_run[mg_max_grids];        // [0,1] run smoother flag
+		int mpi_combine[mg_max_grids];    // [0,level] > 0 gather grids on k-th step using mpi_com[ k - 1 ]
+		int mpi_level[mg_max_grids];      // [2^k] number of grid division levels
+
+		poisson_dynamic_bc bc[mg_max_grids];	// boundary condition type and shifts
+
+#ifdef MEASURE_MG_RUN_TIME
+		double run_time[mg_max_grids];	// timing-per-grid //
+
+		double smooth_time[mg_max_grids];
+		double restrict_time[mg_max_grids];
+		double prolongate_time[mg_max_grids];
+#endif
+
+		mg_mpi_poisson3d_data() : mg_memory_size(0), num_grids(0) { }
+		~mg_mpi_poisson3d_data() {
+			clear();
+		}
+
+		void init(const wstGrid3d< T >& grid, const int _num_grids,
+			const int smooth_down_iters,
+			const int smooth_up_iters,
+			const int smooth_direct_iters,
+			const T smooth_up_omega,
+			const T smooth_up_omega_fine,
+			const poissonTypeBC pois_bc_type);
+		void clear();
+	};
+	// -------------------------------------------------------------------- //
+}
diff --git a/mg-def3d.h b/mg-def3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff4aba678f13b2640bbb2263f004fa41510bc501
--- /dev/null
+++ b/mg-def3d.h
@@ -0,0 +1,109 @@
+#pragma once
+
+// [mg-def3d.h]: 3D Multigrid definitions & constants
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "grid3d.h"				// using nse_const3d::axisType //
+
+// -------------------------------------------------------------------- //
+
+
+//#define USE_STRICT_MG			// bilinear interpolation in restriction & prolongation
+//#define MEASURE_MG_RUN_TIME		// measure multigrid run time on all grids //
+
+// -------------------------------------------------------------------- //
+
+namespace nse
+{
+	// Step aspect ratio above which an axis counts as "already coarse"
+	// and is excluded from the next coarsening (see define_mg_coarsening).
+	const double mg_coarse_aspect = (double) 1.95;
+
+	// Select the set of axes to coarsen on the next multigrid level,
+	// given the current steps (the -z step may vary in [dz_min, dz_max]).
+	template< typename T >
+	nse_const3d::axisType define_mg_coarsening(const T dx, const T dy, const T dz_min, const T dz_max,
+		const T aspect_ratio);
+}
+// -------------------------------------------------------------------- //
+
+// Returns the set of axes to coarsen on the next level. An axis whose
+// step already exceeds another axis' step by more than (aspect_ratio) is
+// kept at its current resolution (semi-coarsening), so coarsening evens
+// out grid anisotropy; -z counts as coarse only if BOTH dz_min and dz_max
+// exceed the threshold. Falls through to full (xyz) coarsening.
+template< typename T >
+inline nse::nse_const3d::axisType nse::define_mg_coarsening(const T dx, const T dy, const T dz_min, const T dz_max,
+	const T aspect_ratio)
+{
+	// - define coarse type (run only on active processes)
+
+	// dz much larger than both dx and dy: keep -z //
+	if (((dz_min / dx) > aspect_ratio) &&
+		((dz_max / dx) > aspect_ratio) &&
+		((dz_min / dy) > aspect_ratio) &&
+		((dz_max / dy) > aspect_ratio))
+	{
+		return nse_const3d::axisXY;	// - semi-coarsening by xy
+	}
+	// dy much larger than both dx and dz: keep -y //
+	if (((dy / dx) > aspect_ratio) &&
+		((dy / dz_min) > aspect_ratio) &&
+		((dy / dz_max) > aspect_ratio))
+	{
+		return nse_const3d::axisXZ;	// - semi-coarsening by xz
+	}
+	// dx much larger than both dy and dz: keep -x //
+	if (((dx / dy) > aspect_ratio) &&
+		((dx / dz_min) > aspect_ratio) &&
+		((dx / dz_max) > aspect_ratio))
+	{
+		return nse_const3d::axisYZ;	// - semi-coarsening by yz
+	}
+
+	// single-axis semi-coarsening: only the axis with the smallest step
+	// is coarsened when the largest-step axis dominates it //
+	if ((dx >= dy) && (dx >= dz_max))
+	{
+		if (dy >= dz_max) {
+			if (((dx / dz_min) > aspect_ratio) &&
+				((dx / dz_max) > aspect_ratio))
+			{
+				return nse_const3d::axisZ;	// - semi-coarsening by z
+			}
+		}
+		else
+		{
+			if ((dx / dy) > aspect_ratio)
+			{
+				return nse_const3d::axisY;	// - semi-coarsening by y
+			}
+		}
+
+	}
+	if ((dy >= dx) && (dy >= dz_max))
+	{
+		if (dx >= dz_max) {
+			if (((dy / dz_min) > aspect_ratio) &&
+				((dy / dz_max) > aspect_ratio))
+			{
+				return nse_const3d::axisZ;	// - semi-coarsening by z
+			}
+		}
+		else
+		{
+			if ((dy / dx) > aspect_ratio)
+			{
+				return nse_const3d::axisX;	// - semi-coarsening by x
+			}
+		}
+	}
+	if ((dz_min >= dx) && (dz_min >= dy))
+	{
+		if (dx >= dy) {
+			if (((dz_min / dy) > aspect_ratio) &&
+				((dz_max / dy) > aspect_ratio))
+			{
+				return nse_const3d::axisY;	// - semi-coarsening by y
+			}
+		}
+		else
+		{
+			if (((dz_min / dx) > aspect_ratio) &&
+				((dz_max / dx) > aspect_ratio))
+			{
+				return nse_const3d::axisX;	// - semi-coarsening by x
+			}
+		}
+	}
+
+	return nse_const3d::axisXYZ;	// - full coarsening
+}
+// -------------------------------------------------------------------- //
diff --git a/model-bc.h b/model-bc.h
new file mode 100644
index 0000000000000000000000000000000000000000..e45275bd968ddf5ffd88f0d9448aefb6599128b6
--- /dev/null
+++ b/model-bc.h
@@ -0,0 +1,544 @@
+#pragma once
+
+// [model-bc.h]: model boundary conditions
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-const.h"
+
+#include "wstgrid3d.h"
+#include "pois-setup3d.h"
+#include "nse-bc3d.h"
+
+
namespace nse
{
	// Main boundary conditions
	// -------------------------------------------------------------------------------------------- //

	// - velocity b.c.: Dirichlet walls (U = -/+ 0.5*Umax at bottom/top, V = W = 0) + halo exchange //
	template< typename T >
	void velocity_bc(T* U, T* V, T* W,
		const T Umax, const wstGrid3d< T >& grid);

	// - intermediate velocity b.c.: W* wall condition + per-direction halo exchanges //
	template< typename T >
	void intermediate_bc(T* Uinterm, T* Vinterm, T* Winterm, const wstGrid3d< T >& grid);

	// - non-blocking halo exchange: push_* starts the exchange, matching pop_* completes it;
	//   mpi_req[4] carries the outstanding MPI requests between the two calls //
	template< typename T >
	void push_u_intermediate_bc(T* Uinterm, const wstGrid3d< T >& grid, MPI_Request mpi_req[4]);
	template< typename T >
	void push_v_intermediate_bc(T* Vinterm, const wstGrid3d< T >& grid, MPI_Request mpi_req[4]);
	template< typename T >
	void push_w_intermediate_bc(T* Winterm, const wstGrid3d< T >& grid, MPI_Request mpi_req[4]);

	template< typename T >
	void pop_u_intermediate_bc(T* Uinterm, const wstGrid3d< T >& grid, MPI_Request mpi_req[4]);
	template< typename T >
	void pop_v_intermediate_bc(T* Vinterm, const wstGrid3d< T >& grid, MPI_Request mpi_req[4]);
	template< typename T >
	void pop_w_intermediate_bc(T* Winterm, const wstGrid3d< T >& grid, MPI_Request mpi_req[4]);

	// - pressure b.c.: homogeneous Neumann walls + cross halo exchange
	//   (halo width depends on pois_opt.init_mode under SCHEME_X4) //
	template< typename T >
	void pressure_bc(T* Pressure, const poisOpt3d< T >& pois_opt, const wstGrid3d< T >& grid);

#ifdef STRATIFICATION
	// - temperature b.c.: zero Dirichlet walls + cross halo exchange //
	template< typename T >
	void temperature_bc(T* Tsh, const wstGrid3d< T >& grid);
#endif
	// -------------------------------------------------------------------------------------------- //


	// Boundary conditions for statistics block //
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void pressure_bc_halo(T* Pressure, const poisOpt3d< T >& pois_opt, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //

	// MPI exchanges and periodic boundary conditions for statistics block //
	// -------------------------------------------------------------------------------------------- //
	//
	//		in: [Ui * Uj] dissipation operator
	//
	template< typename T >
	void diffusion_exch(T* Udiff, T* Vdiff, T* Wdiff, const wstGrid3d< T >& grid);
	
	template< typename T >
	void iso_dissipation_exch(T* Isox, T* Isoy, T* Isoz, const wstGrid3d< T >& grid);

#ifdef STRATIFICATION
	// NOTE(review): the definition names this parameter Xdiff; name mismatch is cosmetic only //
	template< typename T >
	void diffusion_exch(T* Xinterm, const wstGrid3d< T >& grid);
#endif

#ifdef STRATIFICATION
	//
	//		in: [T * U * W], [T * V * W] products calculation
	//
	template< typename T >
	void temperature_halo_exch(T* Tx, const wstGrid3d< T >& grid);
#endif
	// -------------------------------------------------------------------------------------------- //

	// 1D boundary conditions (profiles along -z; c_* = cell-centered node, w_* = W-node) //
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void c_dirichlet_bc_z(T* X_z, const T x_bottom, const T x_top, const wstGrid3d< T >& grid);
	template< typename T >
	void w_dirichlet_bc_z(T* X_z, const T x_bottom, const T x_top, const wstGrid3d< T >& grid);

	template< typename T >
	void c_neumann_bc_z(T* X_z, const T x_bottom, const T x_top, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //


	// 2D boundary conditions (yz-plane fields stored with unit -x extent) //
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	void c_dirichlet_bc_yz(T* X_yz, const T x_bottom, const T x_top, const wstGrid3d< T >& grid);
	template< typename T >
	void w_dirichlet_bc_yz(T* X_yz, const T x_bottom, const T x_top, const wstGrid3d< T >& grid);
	// -------------------------------------------------------------------------------------------- //
}
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::velocity_bc(T* U, T* V, T* W,
+	const T Umax, const wstGrid3d< T >& grid)
+{
+#ifdef SCHEME_X4
+	const int hx = 3, hy = 3, hz = 2;
+#else
+	const int hx = 1, hy = 1, hz = 1;
+#endif
+
+	dirichlet_bc(U, -(T) 0.5 * Umax, bottomSide, nodeU, grid);
+	dirichlet_bc(U, (T) 0.5 * Umax, topSide, nodeU, grid);
+
+	dirichlet_bc(V, (T)0, bottomSide, nodeV, grid);
+	dirichlet_bc(V, (T)0, topSide, nodeV, grid);
+
+	dirichlet_bc(W, (T)0, bottomSide, nodeW, grid);
+	dirichlet_bc(W, (T)0, topSide, nodeW, grid);
+
+	// -----------------------------------------------------------------
+	// NOTE[4th order, 2nd order in -z direction][Adv.-X4]:
+	//			3 cell width exch. for U,V,W in -x, -y directions
+	//			2 cell width exch. for U,V in -z direction
+	//			1 cell width exch. for W in -z direction
+	// -----------------------------------------------------------------
+	grid.mpi_com.exchange_halo(U, V, W, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, hx, hy, hz,
+		model_const::domain::period_x,
+		model_const::domain::period_y,
+		model_const::domain::period_z);
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::intermediate_bc(T* Uinterm, T* Vinterm, T* Winterm,
+	const wstGrid3d< T >& grid)
+{
+#ifdef SCHEME_X4
+	const int hx = 2, hy = 2, hz = 1;
+#else
+	const int hx = 1, hy = 1, hz = 1;
+#endif
+
+	dirichlet_bc(Winterm, (T)0, bottomSide, nodeW, grid);
+	dirichlet_bc(Winterm, (T)0, topSide, nodeW, grid);
+
+	// -----------------------------------------------------------------
+	// NOTE[4th order, 2nd order in -z direction][Div.-X4]:
+	//			2 cell width exch. in -x, -y directions
+	//			1 cell width exch. in -z direction
+	// -----------------------------------------------------------------
+	grid.mpi_com.exchange_halo_x(Uinterm, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, hx, 0, 0, model_const::domain::period_x);
+	grid.mpi_com.exchange_halo_y(Vinterm, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, 0, hy, 0, model_const::domain::period_y);
+	grid.mpi_com.exchange_halo_z(Winterm, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, 0, 0, hz, model_const::domain::period_z);
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::push_u_intermediate_bc(T* Uinterm,
+	const wstGrid3d< T >& grid, MPI_Request mpi_req[4])
+{
+#ifdef SCHEME_X4
+	const int hx = 2;
+#else
+	const int hx = 1;
+#endif
+
+	grid.mpi_com.push_exchange_halo_x(Uinterm, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, hx, 0, 0,
+		model_const::domain::period_x, mpi_req);
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::push_v_intermediate_bc(T* Vinterm,
+	const wstGrid3d< T >& grid, MPI_Request mpi_req[4])
+{
+#ifdef SCHEME_X4
+	const int hy = 2;
+#else
+	const int hy = 1;
+#endif
+
+	grid.mpi_com.push_exchange_halo_y(Vinterm, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, 0, hy, 0,
+		model_const::domain::period_y, mpi_req);
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::push_w_intermediate_bc(T* Winterm,
+	const wstGrid3d< T >& grid, MPI_Request mpi_req[4])
+{
+	const int hz = 1;
+
+	dirichlet_bc(Winterm, (T)0, bottomSide, nodeW, grid);
+	dirichlet_bc(Winterm, (T)0, topSide, nodeW, grid);
+
+	grid.mpi_com.push_exchange_halo_z(Winterm, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, 0, 0, hz,
+		model_const::domain::period_z, mpi_req);
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::pop_u_intermediate_bc(T* Uinterm,
+	const wstGrid3d< T >& grid, MPI_Request mpi_req[4])
+{
+#ifdef SCHEME_X4
+	const int hx = 2;
+#else
+	const int hx = 1;
+#endif
+
+	grid.mpi_com.pop_exchange_halo_x(Uinterm, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, hx, 0, 0,
+		model_const::domain::period_x, mpi_req);
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::pop_v_intermediate_bc(T* Vinterm,
+	const wstGrid3d< T >& grid, MPI_Request mpi_req[4])
+{
+#ifdef SCHEME_X4
+	const int hy = 2;
+#else
+	const int hy = 1;
+#endif
+
+	grid.mpi_com.pop_exchange_halo_y(Vinterm, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, 0, hy, 0,
+		model_const::domain::period_y, mpi_req);
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::pop_w_intermediate_bc(T* Winterm,
+	const wstGrid3d< T >& grid, MPI_Request mpi_req[4])
+{
+	const int hz = 1;
+
+	grid.mpi_com.pop_exchange_halo_z(Winterm, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, 0, 0, hz,
+		model_const::domain::period_z, mpi_req);
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+void nse::pressure_bc(T* Pressure, const poisOpt3d< T >& pois_opt,
+	const wstGrid3d< T >& grid)
+{
+#ifdef SCHEME_X4
+	const int hx = (pois_opt.init_mode == isInitPreset) ? 3 : 2;
+	const int hy = (pois_opt.init_mode == isInitPreset) ? 3 : 2;
+	const int hz = 1;
+#else
+	const int hx = 1, hy = 1, hz = 1;
+#endif
+
+	neumann_bc(Pressure, (T)0, bottomSide, nodeC, grid);
+	neumann_bc(Pressure, (T)0, topSide, nodeC, grid);
+
+	// -----------------------------------------------------------------
+	// NOTE[4th order, 2nd order in -z direction][Grad.(Laplace)-X4]:
+	//			2(3) cell width exch. in -x, -y directions
+	//			1(1) cell width exch. in -z direction
+	// -----------------------------------------------------------------
+	grid.mpi_com.exchange_cross_halo(Pressure, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, hx, hy, hz,
+		model_const::domain::period_x,
+		model_const::domain::period_y,
+		model_const::domain::period_z);
+}
+// -------------------------------------------------------------------------------------------- //
+
+
#ifdef STRATIFICATION
template< typename T >
void nse::temperature_bc(T* Tsh,
	const wstGrid3d< T >& grid)
{
	// 3-cell -x, -y halos for the -x4 scheme; -z stays 2nd order (1 cell)
#ifdef SCHEME_X4
	const int halo_x = 3, halo_y = 3, halo_z = 1;
#else
	const int halo_x = 1, halo_y = 1, halo_z = 1;
#endif

	// zero Dirichlet condition for Tsh at both walls (cell-centered node)
	dirichlet_bc(Tsh, (T)0, bottomSide, nodeC, grid);
	dirichlet_bc(Tsh, (T)0, topSide, nodeC, grid);

	// cross-pattern halo exchange
	grid.mpi_com.exchange_cross_halo(Tsh, grid.nx, grid.ny, grid.nz,
		grid.gcx, grid.gcy, grid.gcz, halo_x, halo_y, halo_z,
		model_const::domain::period_x,
		model_const::domain::period_y,
		model_const::domain::period_z);
}
// -------------------------------------------------------------------------------------------- //
#endif
+
+
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::pressure_bc_halo(T* Pressure,
+	const poisOpt3d< T >& pois_opt, const wstGrid3d< T >& grid)
+{
+#ifdef SCHEME_X4
+	const int hx = 3, hy = 3, hz = 1;
+#else
+	const int hx = 1, hy = 1, hz = 1;
+#endif
+
+	neumann_bc(Pressure, (T)0, bottomSide, nodeC, grid);
+	neumann_bc(Pressure, (T)0, topSide, nodeC, grid);
+
+	// -----------------------------------------------------------------
+	// NOTE[4th order, 2nd order in -z direction][Grad.(Laplace)-X4]:
+	//			2(3) cell width exch. in -x, -y directions
+	//			1(1) cell width exch. in -z direction
+	// -----------------------------------------------------------------
+	grid.mpi_com.exchange_halo(Pressure, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, hx, hy, hz,
+		model_const::domain::period_x,
+		model_const::domain::period_y,
+		model_const::domain::period_z);
+}
+// -------------------------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::diffusion_exch(T* Udiff, T* Vdiff, T* Wdiff, const wstGrid3d< T >& grid)
+{
+#ifdef SCHEME_X4
+	const int hx = 3, hy = 3, hz = 1;
+#else
+	const int hx = 1, hy = 1, hz = 1;
+#endif
+
+	// -----------------------------------------------------------------
+	// NOTE: 2nd order scheme in -z direction
+	//			3 cell width exch. in -x, -y directions
+	//			1 cell width exch. in -z direction
+	// -----------------------------------------------------------------
+	grid.mpi_com.exchange_cross_halo(Udiff, Vdiff, Wdiff, 
+		grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, hx, hy, hz,
+		model_const::domain::period_x,
+		model_const::domain::period_y,
+		model_const::domain::period_z);
+}
+
+template< typename T >
+void nse::iso_dissipation_exch(T* Isox, T* Isoy, T* Isoz,
+	const wstGrid3d< T >& grid)
+{
+#ifdef SCHEME_X4
+	const int hx = 3, hy = 3, hz = 1;
+#else
+	const int hx = 1, hy = 1, hz = 1;
+#endif
+
+	// -----------------------------------------------------------------
+	// NOTE: 2nd order scheme in -z direction
+	//			3 cell width exch. in -x, -y directions
+	//			1 cell width exch. in -z direction
+	// -----------------------------------------------------------------
+	grid.mpi_com.exchange_halo_x(Isox, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, hx, 0, 0, model_const::domain::period_x);
+	grid.mpi_com.exchange_halo_y(Isoy, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, 0, hy, 0, model_const::domain::period_y);
+	grid.mpi_com.exchange_halo_z(Isoz, grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz, 0, 0, hz, model_const::domain::period_z);
+}
+
#ifdef STRATIFICATION
template< typename T >
void nse::diffusion_exch(T* Xdiff, const wstGrid3d< T >& grid)
{
	// scalar overload: same halo widths as the vector version
#ifdef SCHEME_X4
	const int halo_x = 3, halo_y = 3, halo_z = 1;
#else
	const int halo_x = 1, halo_y = 1, halo_z = 1;
#endif

	// cross-pattern halo exchange for the scalar diffusion field
	grid.mpi_com.exchange_cross_halo(Xdiff, grid.nx, grid.ny, grid.nz,
		grid.gcx, grid.gcy, grid.gcz, halo_x, halo_y, halo_z,
		model_const::domain::period_x,
		model_const::domain::period_y,
		model_const::domain::period_z);
}
#endif
+
#ifdef STRATIFICATION
template< typename T >
void nse::temperature_halo_exch(T* Tx, const wstGrid3d< T >& grid)
{
	// full (corner-including) halo exchange used for [T*U*W], [T*V*W] products;
	// 3-cell -x, -y halos for the -x4 scheme, 1 cell in -z
#ifdef SCHEME_X4
	const int halo_x = 3, halo_y = 3, halo_z = 1;
#else
	const int halo_x = 1, halo_y = 1, halo_z = 1;
#endif

	grid.mpi_com.exchange_halo(Tx, grid.nx, grid.ny, grid.nz,
		grid.gcx, grid.gcy, grid.gcz, halo_x, halo_y, halo_z,
		model_const::domain::period_x,
		model_const::domain::period_y,
		model_const::domain::period_z);
}
#endif
+// -------------------------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::c_dirichlet_bc_z(T* X_z, const T x_bottom, const T x_top,
+	const wstGrid3d< T >& grid)
+{
+	if (grid.mpi_com.rank_z == 0)
+		X_z[grid.gcz - 1] = (T) 2.0 * x_bottom - X_z[grid.gcz];
+
+	if (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1)
+		X_z[grid.nz - grid.gcz] = (T) 2.0 * x_top - X_z[grid.nz - grid.gcz - 1];
+
+	// *[Note]: exchange up to gcz cells: 
+	//		we need gcz >= 2 for computation of [u'w', v'w', c'w'] production terms
+	//
+	grid.mpi_com.exchange_halo_z(X_z,
+		1, 1, grid.nz, 0, 0, grid.gcz, 0, 0, grid.gcz,
+		model_const::domain::period_z);
+}
+
+template< typename T >
+void nse::w_dirichlet_bc_z(T* X_z, const T x_bottom, const T x_top,
+	const wstGrid3d< T >& grid)
+{
+	if (grid.mpi_com.rank_z == 0)
+		X_z[grid.gcz] = x_bottom;
+
+	if (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1)
+		X_z[grid.nz - grid.gcz] = x_top;
+
+	// *[Note]: exchange up to gcz cells: 
+	//		we need gcz >= 2 for computation of [u'w', v'w', c'w'] production terms
+	//
+	grid.mpi_com.exchange_halo_z(X_z,
+		1, 1, grid.nz, 0, 0, grid.gcz, 0, 0, grid.gcz,
+		model_const::domain::period_z);
+}
+
+template< typename T >
+void nse::c_neumann_bc_z(T* X_z, const T x_bottom, const T x_top,
+	const wstGrid3d< T >& grid)
+{
+	if (grid.mpi_com.rank_z == 0)
+		X_z[grid.gcz - 1] = X_z[grid.gcz] + x_bottom * (T) 0.5 * grid.dzm[grid.gcz];
+
+	if (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1)
+		X_z[grid.nz - grid.gcz] = X_z[grid.nz - grid.gcz - 1] - x_top * (T) 0.5 * grid.dzp[grid.nz - grid.gcz - 1];
+
+	// *[Note]: exchange up to gcz cells: 
+	//		we need gcz >= 2 for computation of [u'w', v'w', c'w'] production terms
+	//
+	grid.mpi_com.exchange_halo_z(X_z,
+		1, 1, grid.nz, 0, 0, grid.gcz, 0, 0, grid.gcz,
+		model_const::domain::period_z);
+}
+// -------------------------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+void nse::c_dirichlet_bc_yz(T* X_yz, const T x_bottom, const T x_top,
+	const wstGrid3d< T >& grid)
+{
+#ifdef SCHEME_X4
+	const int hy = 3, hz = 1;
+#else
+	const int hy = 1, hz = 1;
+#endif
+
+	if (grid.mpi_com.rank_z == 0) {
+		for (int j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			X_yz[j * grid.nz + grid.gcz - 1] =
+			(T) 2.0 * x_bottom - X_yz[j * grid.nz + grid.gcz];
+	}
+	if (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1) {
+		for (int j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			X_yz[j * grid.nz + grid.nz - grid.gcz] =
+			(T) 2.0 * x_top - X_yz[j * grid.nz + grid.nz - grid.gcz - 1];
+	}
+
+	grid.mpi_com.exchange_halo_y(X_yz, 1, grid.ny, grid.nz,
+		0, grid.gcy, grid.gcz, 0, hy, hz,
+		model_const::domain::period_y);
+	grid.mpi_com.exchange_halo_z(X_yz, 1, grid.ny, grid.nz,
+		0, grid.gcy, grid.gcz, 0, hy, hz,
+		model_const::domain::period_z);
+}
+
+template< typename T >
+void nse::w_dirichlet_bc_yz(T* X_yz, const T x_bottom, const T x_top,
+	const wstGrid3d< T >& grid)
+{
+#ifdef SCHEME_X4
+	const int hy = 3, hz = 1;
+#else
+	const int hy = 1, hz = 1;
+#endif
+
+	if (grid.mpi_com.rank_z == 0) {
+		for (int j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			X_yz[j * grid.nz + grid.gcz] = x_bottom;
+	}
+	if (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1) {
+		for (int j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			X_yz[j * grid.nz + grid.nz - grid.gcz] = x_top;
+	}
+
+	grid.mpi_com.exchange_halo_y(X_yz, 1, grid.ny, grid.nz,
+		0, grid.gcy, grid.gcz, 0, hy, hz,
+		model_const::domain::period_y);
+	grid.mpi_com.exchange_halo_z(X_yz, 1, grid.ny, grid.nz,
+		0, grid.gcy, grid.gcz, 0, hy, hz,
+		model_const::domain::period_z);
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/model-const.h b/model-const.h
new file mode 100644
index 0000000000000000000000000000000000000000..97bfe16cdcbe428090a1dbcb4bed86b5b4c65a7f
--- /dev/null
+++ b/model-const.h
@@ -0,0 +1,57 @@
+#pragma once
+
+// [model-const.h]: model constants
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-defines.h"
+#include "pois-setup3d.h"
+
+
namespace model_const
{
	namespace domain
	{
		// domain periodicity aliases: periodic in -x and -y, walls in -z
		const int period_x = 1, period_y = 1, period_z = 0;
	}
	// -------------------------------------------------------------------------------------------- //

	namespace grid
	{
		// number of ghost cells (wider stencil support for the -x4 scheme)
#ifdef SCHEME_X4
		const int gcx = 3, gcy = 3, gcz = 3;
#else
		const int gcx = 2, gcy = 2, gcz = 2;
#endif
	}
	// -------------------------------------------------------------------------------------------- //

	namespace fluid
	{
		// seed for random number generator (initial velocity disturbance)
		const long int disturbance_seed = 10;
	}
	// -------------------------------------------------------------------------------------------- //

	namespace poisson
	{
		// poisson eq. fixed parameters
		const nse::poissonInitMode init_mode = nse::isInitPreset;
		const nse::poissonNormMode norm_mode = nse::isNormC;

		// boundary-condition type matches the domain periodicity above
		const nse::poissonTypeBC bc_type = nse::periodicXY;

		namespace mg
		{
			// min grid size in multigrid sequence
			const int auto_min = 64;
		}
	}
	// -------------------------------------------------------------------------------------------- //

	// init-pressure mean correction tolerance
	const Real pressure_mean_eps = (Real) 1e-5;
	// -------------------------------------------------------------------------------------------- //
}
diff --git a/model-default.h b/model-default.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd605fd87adb4138301abdf9e7eea02c6689823c
--- /dev/null
+++ b/model-default.h
@@ -0,0 +1,190 @@
+#pragma once
+
+// [model-default.h]: model default parameters
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-defines.h"
+#include <string>
+
+
namespace model_default {

	namespace domain
	{
		// domain origin
		const Real x = (Real) 0.0, y = (Real) 0.0, z = (Real) 0.0;

		const Real length = (Real) 6.0;        // - [x]
		const Real width = (Real) 4.0;         // - [y]
		const Real height = (Real) 1.0;        // - [z]
	}
	// -------------------------------------------------------------------------------------------- //

	namespace grid
	{
		// number of grid cells per direction
		const int cx = 96, cy = 64, cz = 64;

		const Real ksi_z = (Real) 1.5;			// - [z] stretching parameter
	}
	// -------------------------------------------------------------------------------------------- //

	namespace mpi_setup
	{
		// default process decomposition per direction — dimx * dimy * dimz ranks
		const int dimx = 4, dimy = 2, dimz = 1;
	}
	// -------------------------------------------------------------------------------------------- //

	namespace time
	{
		// model time interval and CFL-based time-step restriction
		const Real begin = (Real) 0.0;
		const Real end = (Real) 4000.0;
		const Real CFL = (Real) 0.1;
	}
	// -------------------------------------------------------------------------------------------- //

	namespace fluid
	{
		// --- fluid parameters [base] --- //
		const Real Umax = (Real) 1.0;

		const Real disturbance_amp = (Real) 0.025;						// relative to Umax

		const Real density = (Real) 1.0;
		const Real viscosity = (Real) 1.0 / (Real) 5200.0;		// = 1/Re

#ifdef STRATIFICATION
		// --- fluid parameters [temperature] --- //
		const Real T0 = (Real) 1.0;
		const Real TH = (Real) 2.0;

		const Real Prandtl = (Real) 0.7;
		const Real Richardson = (Real) 0.00;
		const Real Richardson_init = (Real) 0.00;	// for continuation from some initial field //

		const Real T_gravity_init = (Real) 200.0;
		const Real T_gravity_period = (Real) 100.0;
#endif
	}
	// -------------------------------------------------------------------------------------------- //

#ifdef INCLUDE_PARTICLES
	namespace ptcl_opt
	{
		// number of particles (one per 8 grid cells) and release time
		const int n = (grid::cx * grid::cy * grid::cz) / 8;
		const Real begin = (Real)1600.0;
	}
	// -------------------------------------------------------------------------------------------- //
#endif

#ifdef INCLUDE_PARTICLES_TRACKING
	namespace ptcl_track_opt
	{
		const int n = 128;
		const Real begin = (Real)1600.0;

		const int group_max_size = 256;
		const int max_memory = 10 * 1024 * 1024;	// tracking buffer cap, bytes
	}
	// -------------------------------------------------------------------------------------------- //
#endif

	namespace output
	{
		const std::string DIR = "output/";

		const Real begin = (Real) 1800.0;
		const Real dt = (Real)200.0;

		// regular/final 3D output switches (tecplot & binary)
		const bool regular_plt3d_cntrl = true;
		const bool regular_bin3d_cntrl = true;
		const bool final_plt3d_cntrl = true;

		const int nscreen = 400;	// on-screen report interval, time steps

		namespace profiles
		{
			const std::string DIR = "output-rapid/";

			const Real begin = (Real) 2000.0;
			const Real end = (Real) 2100.0;
			const Real dt = (Real)2.0;

			const bool plt_cntrl = true;
		}
	}
	// -------------------------------------------------------------------------------------------- //

	namespace dump
	{
		const std::string DIR = "dump/";

		const Real begin = (Real) 400.0;
		const Real dt = (Real) 400.0;
	}
	// -------------------------------------------------------------------------------------------- //

	namespace startup
	{
		const std::string DIR = "init/";
	}
	// -------------------------------------------------------------------------------------------- //

#ifdef INCLUDE_VISUALRTL
	namespace visual
	{
		const std::string DIR = "visual/";

		const Real begin = (Real) 1600.0;
		const Real end = (Real)1800.0;
		const Real dt = (Real) 0.5;

		// visualization sub-domain: full domain by default
		const Real xmin = domain::x;
		const Real xmax = domain::x + domain::length;

		const Real ymin = domain::y;
		const Real ymax = domain::y + domain::width;

		const Real zmin = domain::z;
		const Real zmax = domain::z + domain::height;

		const int png_resolution = 8;
	}
	// -------------------------------------------------------------------------------------------- //
#endif

	namespace poisson
	{
		// relative/absolute tolerances and iteration bounds for the Poisson solver
		const Real retol = (Real) 1e-4, abstol = (Real) 1e-3;
		const int miniters = 1, maxiters = 500;

		const int piters = 1;

		namespace multigrid
		{
			const int ngrid = 6;		// use auto definition: = [0]

			const int down_iters = 2, up_iters = 3;
			const int direct_iters = 5;

			const Real smooth_up_omega = (Real) 1.84;		// smoother relaxation [up] //
			const Real smooth_up_omega_fine = (Real) 1.64;	// smoother relaxation on fine grid [up] //
		}
	}
	// -------------------------------------------------------------------------------------------- //

	namespace stats
	{

		const Real begin = (Real) 1600.0;
		const int time_mod = 10;		// stat gather at mod steps: 10 //

		namespace output {
			const std::string DIR = "output/stat/";
		}

		namespace dump {
			const std::string DIR = "dump/stat/";
		}
	}
	// -------------------------------------------------------------------------------------------- //
}
diff --git a/model-defines.h b/model-defines.h
new file mode 100644
index 0000000000000000000000000000000000000000..acdc3529e4075a2777cfdb7597198446e9b15ab9
--- /dev/null
+++ b/model-defines.h
@@ -0,0 +1,50 @@
#pragma once

// [model-defines.h]: model macro definitions
//
// -------------------------------------------------------------------------------------------- //


// -------------------------------------------------------------------------------------------- //
#define USE_CONFIG		"config.txt"		// using configuration file

#define SET_OPENMP_THREADS	1	// set number of OpenMP threads by hand
								// else number of threads is determined by OMP_NUM_THREADS

//#define SCHEME_X4							// define for -x4 spatial scheme
#define AB_TYPE				2				// = adams-bashforth scheme 2[3]

#define STRATIFICATION						// enable stratification

#define PRESSURE_MEAN_CTRL					// control pressure mean (removing mean each time step & init)
//#define PRESSURE_MEAN_CTRL_IN_DUMP			// remove pressure mean from dump


//#define RESTRICT_3D_DUMP					// restrict 3D dump to (U,P,T) fields only

// - data type: model-wide floating-point precision (float = single precision) //
#define Real float


// - stat definitions //
#define avgPrec	double				// averaging sub-type precision //

//#define FOURIER_SPECTRUM			// fourier U^2(k) spectrum

//#define RESTRICT_STATS_DUMP			// restrict statistics dump
#define COMPUTE_XT_AVERAGES				// -x, -time averaged velocity vector //


#define INCLUDE_PARTICLES				// include particles in simulation
//#define INCLUDE_PARTICLES_TRACKING		// include particles tracking in simulation

//#define SKIP_PTCL_DUMP_INIT				// skip dump read for particles
//#define SKIP_PTCL_TRACKING_DUMP_INIT	// skip dump read for particles tracking

//#define INCLUDE_VISUALRTL				// include direct visualization
// -------------------------------------------------------------------------------------------- //

// : arch type definition
//
#define memRUN memCPU
// ---------------------------------------------------------------
diff --git a/model-eq.hpp b/model-eq.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..492b9ac7349619c853adb887c11d8d65ef0c16c5
--- /dev/null
+++ b/model-eq.hpp
@@ -0,0 +1,666 @@
+#include "model-obj.h"
+#include "model-bc.h"
+
+#include "nse-generic3d.h"
+#include "nse-io3d.h"
+
+#include "nse3d.h"
+#include "pois3d.h"
+#ifdef SCHEME_X4
+#include "nse3d-x4.h"
+#include "pois3d-x4.h"
+#endif
+
+#include <stdio.h>
+
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::init_eq()
+{
+	// Initialize all model equations in order; on the first failure log it,
+	// release resources and abort. Progress is reported on the master rank only.
+	const bool is_master = (grid.mpi_com.rank == 0);
+
+	if (!init_nse_eq()) {
+		print_info(output.DATA_FILE,
+			"FAILURE!: ** Navier-Stokes eq. init **\n");
+
+		clear();
+		return false;
+	}
+	if (is_master)
+		printf("\t >> Navier-Stokes equation: OK!\n");
+
+#ifdef STRATIFICATION
+	if (!init_heat_eq()) {
+		print_info(output.DATA_FILE,
+			"FAILURE!: ** heat eq. init **\n");
+
+		clear();
+		return false;
+	}
+	if (is_master)
+		printf("\t >> Heat equation: OK!\n");
+#endif
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+
+// --------------------------- //
+// Init Navier-Stokes equation //
+// --------------------------- //
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::init_nse_eq()
+{
+	// Initialize Navier-Stokes time-advancement data.
+	// On restart, try to read the intermediate velocity (n-1 [, n-2] for AB3)
+	// and pressure-phi dumps; any field that fails to load is reconstructed
+	// from the current (n-th) state. Always returns true: a missing dump
+	// degrades to a warning, not a failure.
+	bool imp_status = false;	// velocity-im*(n-1) dump loaded
+	bool phi_status = false;	// pressure-phi dump loaded
+#if (AB_TYPE == 3)
+	bool impp_status = false;	// velocity-im*(n-2) dump loaded
+#endif
+
+	if (dump.restart) {
+		imp_status = read_binary_3d(dump.VELOCITY_IMP_FILE, dump.restart_index,
+			Uim_p, Vim_p, Wim_p, grid);
+		if (!imp_status)
+		{
+			if (grid.mpi_com.rank == 0) {
+				// fixed quoting: adjacent empty "" literals concatenated to nothing,
+				// so the file name was printed without the intended quotes
+				printf("\n >> WARNING! >> ** failed to read velocity-im*(n-1) dump file: "
+					"\"%s\" **\n", dump.VELOCITY_IMP_FILE.c_str());
+				printf("\t >> -- calculating with variables on n-th time step\n\n");
+			}
+		}
+
+#if (AB_TYPE == 3)
+		impp_status = read_binary_3d(dump.VELOCITY_IMPP_FILE, dump.restart_index,
+			Uim_pp, Vim_pp, Wim_pp, grid);
+		if (!impp_status)
+		{
+			if (grid.mpi_com.rank == 0) {
+				printf("\n >> WARNING! >> ** failed to read velocity-im*(n-2) dump file: "
+					"\"%s\" **\n", dump.VELOCITY_IMPP_FILE.c_str());
+				printf("\t >> -- calculating with variables on n-th time step\n\n");
+			}
+		}
+#endif
+
+		if (pois_opt.init_mode != isInitNull)
+		{
+			phi_status = read_binary_3d(dump.PHI_PRESSURE_FILE, dump.restart_index,
+				Phi, grid);
+
+			if (!phi_status) {
+				if (grid.mpi_com.rank == 0) {
+					printf("\n >> WARNING! >> ** failed to read pressure-phi dump file: "
+						"\"%s\" **\n", dump.PHI_PRESSURE_FILE.c_str());
+					printf("\t >> -- setting null value\n\n");
+				}
+			}
+#ifdef PRESSURE_MEAN_CTRL_IN_DUMP
+			else
+			{
+				// Phi is defined up to a constant: iterate mean removal to tolerance
+				T mean_value;
+				do {
+					mean_value = (T)grid.template average<double>(Phi);
+					update(Phi, -mean_value, grid.size);
+				} while (fabs(mean_value) > model_const::pressure_mean_eps);
+			}
+#endif
+		}
+	}
+
+	// divergence of the initial velocity field
+#ifdef SCHEME_X4
+	divergence_x4(Divergence, U, V, W, grid);
+#else
+	divergence(Divergence, U, V, W, grid);
+#endif
+
+	// fall back: rebuild the (n-1) intermediate fields from the current state
+	if (!imp_status) {
+#ifdef SCHEME_X4
+		u_advection_skew_x4(Uim_p, U, V, W, grid);
+		u_add_diffusion_x4(Uim_p, U, fluid.kinematic_viscosity, grid);
+
+		v_advection_skew_x4(Vim_p, U, V, W, grid);
+		v_add_diffusion_x4(Vim_p, V, fluid.kinematic_viscosity, grid);
+
+		w_advection_skew_x4(Wim_p, U, V, W, grid);
+		w_add_diffusion_x4(Wim_p, W, fluid.kinematic_viscosity, grid);
+#else
+		u_advection_skew(Uim_p, U, V, W, grid);
+		u_add_diffusion(Uim_p, U, fluid.kinematic_viscosity, grid);
+
+		v_advection_skew(Vim_p, U, V, W, grid);
+		v_add_diffusion(Vim_p, V, fluid.kinematic_viscosity, grid);
+
+		w_advection_skew(Wim_p, U, V, W, grid);
+		w_add_diffusion(Wim_p, W, fluid.kinematic_viscosity, grid);
+#endif
+	}
+
+	intermediate_bc(Uim_p, Vim_p, Wim_p, grid);
+
+#if ( AB_TYPE == 3 )
+	// (n-2) fields default to copies of the (n-1) fields
+	if (!impp_status) {
+		memcpy(Uim_pp, Uim_p, grid.size * sizeof(T));
+		memcpy(Vim_pp, Vim_p, grid.size * sizeof(T));
+		memcpy(Wim_pp, Wim_p, grid.size * sizeof(T));
+	}
+
+	intermediate_bc(Uim_pp, Vim_pp, Wim_pp, grid);
+#endif
+
+	if (!phi_status)
+		null(Phi, grid.size);
+	pressure_bc(Phi, pois_opt, grid);
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+
+// --------------------------------- //
+// Advance Navier-Stokes equations   //
+// --------------------------------- //
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::advance_nse_eq()
+{
+	// One projection-method time step for the Navier-Stokes equations:
+	//  (1) predict each intermediate velocity component (advection + diffusion,
+	//      Adams-Bashforth extrapolation, explicit pressure-gradient term),
+	//      overlapping each component's halo exchange with the next prediction;
+	//  (2) solve the Poisson equation for the pressure correction Phi;
+	//  (3) project velocity and recompute the divergence for diagnostics.
+	double begin_mark = omp_get_wtime();
+
+	MPI_Request mpi_req[12];	// exchange requests: U -> [0..3], V -> [4..7], W -> [8..11]
+
+	// - U velocity prediction //
+#ifdef SCHEME_X4
+	u_advection_skew_x4(Uim, U, V, W, grid);
+	u_add_diffusion_x4(Uim, U, fluid.kinematic_viscosity, grid);
+#else
+	u_advection_skew(Uim, U, V, W, grid);
+	u_add_diffusion(Uim, U, fluid.kinematic_viscosity, grid);
+
+#endif
+
+	// Adams-Bashforth extrapolation (3rd order only after two completed steps)
+#if ( AB_TYPE == 3 )
+	if (time_index < 2)
+		adams_bashforth_x2(Uim, Uim_p, grid);
+	else
+		adams_bashforth_x3(Uim, Uim_p, Uim_pp, grid);
+#else
+	adams_bashforth_x2(Uim, Uim_p, grid);
+#endif
+
+#ifdef SCHEME_X4
+	u_sub_gradient_x4(Uim, Pressure, (T) 1.0 / fluid.density, grid);
+#else
+	u_sub_gradient(Uim, Pressure, (T) 1.0 / fluid.density, grid);
+#endif
+
+	// start non-blocking halo exchange for Uim while V, W are predicted
+	push_u_intermediate_bc(Uim, grid, &mpi_req[0]);
+	// ------------------------------------------------------------------ //
+
+	// - V velocity prediction //
+#ifdef SCHEME_X4
+	v_advection_skew_x4(Vim, U, V, W, grid);
+	v_add_diffusion_x4(Vim, V, fluid.kinematic_viscosity, grid);
+#else
+	v_advection_skew(Vim, U, V, W, grid);
+	v_add_diffusion(Vim, V, fluid.kinematic_viscosity, grid);
+
+#endif
+
+#if ( AB_TYPE == 3 )
+	if (time_index < 2)
+		adams_bashforth_x2(Vim, Vim_p, grid);
+	else
+		adams_bashforth_x3(Vim, Vim_p, Vim_pp, grid);
+#else
+	adams_bashforth_x2(Vim, Vim_p, grid);
+#endif
+
+#ifdef SCHEME_X4
+	v_sub_gradient_x4(Vim, Pressure, (T) 1.0 / fluid.density, grid);
+#else
+	v_sub_gradient(Vim, Pressure, (T) 1.0 / fluid.density, grid);
+#endif
+
+	push_v_intermediate_bc(Vim, grid, &mpi_req[4]);
+	// ------------------------------------------------------------------ //
+
+	// - W velocity prediction //
+#ifdef SCHEME_X4
+	w_advection_skew_x4(Wim, U, V, W, grid);
+	w_add_diffusion_x4(Wim, W, fluid.kinematic_viscosity, grid);
+#else
+	w_advection_skew(Wim, U, V, W, grid);
+	w_add_diffusion(Wim, W, fluid.kinematic_viscosity, grid);
+
+#endif
+
+#if ( AB_TYPE == 3 )
+	if (time_index < 2)
+		adams_bashforth_x2(Wim, Wim_p, grid);
+	else
+		adams_bashforth_x3(Wim, Wim_p, Wim_pp, grid);
+#else
+	adams_bashforth_x2(Wim, Wim_p, grid);
+#endif
+
+#ifdef SCHEME_X4
+	w_sub_gradient_x4(Wim, Pressure, (T) 1.0 / fluid.density, grid);
+#else
+	w_sub_gradient(Wim, Pressure, (T) 1.0 / fluid.density, grid);
+#endif
+
+#ifdef STRATIFICATION
+	// buoyancy coefficient: blends Richardson_init -> Richardson in time via exp_damping
+	const T c_gravity_z = fluid.Richardson_init +
+		(fluid.Richardson - fluid.Richardson_init) *
+		exp_damping(current_time, fluid.T_gravity_init, fluid.T_gravity_period);
+
+#ifdef SCHEME_X4
+	w_buoyancy_x4(Wim, Tsh, c_gravity_z, grid);
+#else
+	w_buoyancy(Wim, Tsh, c_gravity_z, grid);
+#endif
+#endif
+
+	push_w_intermediate_bc(Wim, grid, &mpi_req[8]);
+	// ------------------------------------------------------------------ //
+
+	// complete all pending halo exchanges before assembling the Poisson RHS
+	pop_u_intermediate_bc(Uim, grid, &mpi_req[0]);
+	pop_v_intermediate_bc(Vim, grid, &mpi_req[4]);
+	pop_w_intermediate_bc(Wim, grid, &mpi_req[8]);
+
+#ifdef SCHEME_X4
+	poisson_rhs_x4(Rhs, Divergence,
+		Uim, Vim, Wim, grid, dt);
+#else
+	poisson_rhs(Rhs, Divergence,
+		Uim, Vim, Wim, grid, dt);
+#endif
+
+	// optionally restart the Poisson solver from a zero initial guess
+	if (pois_opt.init_mode == isInitNull)
+		null(Phi, grid.size);
+
+	double pois_begin_mark = omp_get_wtime();
+
+#ifdef SCHEME_X4
+	poisson_status = bicg_mg_sor_redblack_x4(Phi, Rhs, memory,
+		pois_opt, grid, mg_data, &poisson_norm);
+#else
+	poisson_status = bicg_mg_sor_redblack(Phi, Rhs, memory,
+		pois_opt, grid, mg_data, &poisson_norm);
+#endif
+
+	double pois_end_mark = omp_get_wtime();
+	cpu_time.pois += pois_end_mark - pois_begin_mark;
+
+	// negative status signals solver failure (printed as iteration count otherwise)
+	if (poisson_status < 0) {
+		if (grid.mpi_com.rank == 0)
+			printf("\n >> FAILURE! >> ** poisson solver **\n\n");
+		return false;
+	}
+
+#ifdef PRESSURE_MEAN_CTRL
+	// Phi is defined up to a constant: iterate mean removal until below tolerance
+	T mean_value;
+	do {
+		mean_value = (T)grid.template average<double>(Phi);
+		update(Phi, -mean_value, grid.size);
+	} while (fabs(mean_value) > model_const::pressure_mean_eps);
+#endif
+
+	pressure_bc(Phi, pois_opt, grid);
+	update(Pressure, fluid.density, Phi, grid.size);
+	// pressure_bc(Pressure, grid);
+	// no boundary conditions and exch. for Pressure //
+	//		OK in case:
+	//			b.c. for (Phi,Pressure) are consistent
+
+
+	// project intermediate velocity using the pressure correction Phi
+#ifdef SCHEME_X4
+	u_projection_x4(U, Uim, Phi, grid, dt);
+	v_projection_x4(V, Vim, Phi, grid, dt);
+	w_projection_x4(W, Wim, Phi, grid, dt);
+#else
+	u_projection(U, Uim, Phi, grid, dt);
+	v_projection(V, Vim, Phi, grid, dt);
+	w_projection(W, Wim, Phi, grid, dt);
+#endif
+
+	velocity_bc(U, V, W, fluid.Umax, grid);
+
+	// recompute divergence of the projected field (diagnostics & next-step RHS)
+#ifdef SCHEME_X4
+	divergence_x4(Divergence, U, V, W, grid);
+#else
+	divergence(Divergence, U, V, W, grid);
+#endif
+
+	double end_mark = omp_get_wtime();
+	cpu_time.nse_eq += end_mark - begin_mark;
+	cpu_time.run += end_mark - begin_mark;
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+
+#ifdef STRATIFICATION
+// --------------------------- //
+// Init Heat equation          //
+// --------------------------- //
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::init_heat_eq()
+{
+	// Initialize heat-equation time-advancement data: on restart, try to read
+	// the intermediate temperature dumps; anything that fails to load is
+	// reconstructed from the current state. Always returns true.
+	bool imp_status = false;	// temperature-im*(n-1) dump loaded
+#if (AB_TYPE == 3)
+	bool impp_status = false;	// temperature-im*(n-2) dump loaded
+#endif
+
+	if (dump.restart) {
+		imp_status = read_binary_3d(dump.TEMPERATURE_IMP_FILE, dump.restart_index, Tim_p, grid);
+		if (!imp_status)
+		{
+			if (grid.mpi_com.rank == 0) {
+				// fixed quoting: adjacent empty "" literals concatenated to nothing,
+				// so the file name was printed without the intended quotes
+				printf("\n >> WARNING! >> ** failed to read temperature-im*(n-1) dump file: "
+					"\"%s\" **\n", dump.TEMPERATURE_IMP_FILE.c_str());
+				printf("\t >> -- calculating with variables on n-th time step\n\n");
+			}
+		}
+#if (AB_TYPE == 3)
+		impp_status = read_binary_3d(dump.TEMPERATURE_IMPP_FILE, dump.restart_index, Tim_pp, grid);
+		if (!impp_status)
+		{
+			if (grid.mpi_com.rank == 0) {
+				printf("\n >> WARNING! >> ** failed to read temperature-im*(n-2) dump file: "
+					"\"%s\" **\n", dump.TEMPERATURE_IMPP_FILE.c_str());
+				printf("\t >> -- calculating with variables on n-th time step\n\n");
+			}
+		}
+#endif
+	}
+
+	// fall back: rebuild the (n-1) intermediate field from the current state
+	if (!imp_status) {
+#ifdef SCHEME_X4
+		c_advection_skew_x4(Tim_p, U, V, W, Tsh, grid);
+		c_advection_skew_vline_x4(Tim_p, U, V, W,
+			fluid.T0, fluid.TH, grid);		// add W, linear -z part //
+		c_add_diffusion_x4(Tim_p, Tsh, fluid.diffusivity, grid);
+#else
+		c_advection_skew(Tim_p, U, V, W, Tsh, grid);
+		c_advection_skew_vline(Tim_p, U, V, W,
+			fluid.T0, fluid.TH, grid);		// add W, linear -z part //
+		c_add_diffusion(Tim_p, Tsh, fluid.diffusivity, grid);
+#endif
+	}
+
+#if ( AB_TYPE == 3 )
+	// (n-2) field defaults to a copy of the (n-1) field
+	if (!impp_status)
+		memcpy(Tim_pp, Tim_p, grid.size * sizeof(T));
+#endif
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+
+// ------------------------------- //
+// Advance Heat equation           //
+// ------------------------------- //
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::advance_heat_eq()
+{
+	// One time step of the heat (temperature deviation) equation:
+	// advection + diffusion -> Adams-Bashforth -> field update -> b.c.
+	// -> restore the full temperature field.
+	const double t_start = omp_get_wtime();
+
+#ifdef SCHEME_X4
+	c_advection_skew_x4(Tim, U, V, W, Tsh, grid);
+	c_advection_skew_vline_x4(Tim, U, V, W,
+		fluid.T0, fluid.TH, grid);		// add W, linear -z part //
+	c_add_diffusion_x4(Tim, Tsh, fluid.diffusivity, grid);
+#else
+	c_advection_skew(Tim, U, V, W, Tsh, grid);
+	c_advection_skew_vline(Tim, U, V, W,
+		fluid.T0, fluid.TH, grid);		// add W, linear -z part //
+	c_add_diffusion(Tim, Tsh, fluid.diffusivity, grid);
+
+#endif
+
+	// Adams-Bashforth extrapolation (3rd order only after two completed steps)
+#if ( AB_TYPE == 3 )
+	if (time_index < 2)
+		adams_bashforth_x2(Tim, Tim_p, grid);
+	else
+		adams_bashforth_x3(Tim, Tim_p, Tim_pp, grid);
+#else
+	adams_bashforth_x2(Tim, Tim_p, grid);
+#endif
+
+	update(Tsh, dt, Tim, grid.size);
+	temperature_bc(Tsh, grid);
+
+	restore_temperature(Tx, Tsh, Tline, grid);
+
+	const double t_end = omp_get_wtime();
+	cpu_time.heat_eq += t_end - t_start;
+	cpu_time.run += t_end - t_start;
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+#endif
+
+#ifdef INCLUDE_PARTICLES
+// ------------------------------- //
+// Advance Particles               //
+// ------------------------------- //
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::advance_particles()
+{
+	// Release (once) and advect the particle set; a no-op until ptcl_opt.begin.
+	const double t_start = omp_get_wtime();
+
+	if (current_time >= ptcl_opt.begin) {
+		// one-time uniform seeding of ptcl_opt.n particles
+		if (!ptcl_opt.is_released) {
+			const long int seed = 1024;
+			ptcl.add_uniform(ptcl_opt.n, seed, U, V, W, grid);
+			ptcl_opt.is_released = true;
+		}
+
+		ptcl.update(U, V, W,
+			model_const::domain::period_x,
+			model_const::domain::period_y,
+			model_const::domain::period_z, grid, dt);
+	}
+
+	// NOTE(review): barrier is included in the particle timing below --
+	// presumably intentional to account for load imbalance; confirm
+	MPI_Barrier(grid.mpi_com.comm);
+
+	const double t_end = omp_get_wtime();
+	cpu_time.particles += t_end - t_start;
+	cpu_time.run += t_end - t_start;
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+#endif
+
+#ifdef INCLUDE_PARTICLES_TRACKING
+// ------------------------------- //
+// Advance Track Particles               //
+// ------------------------------- //
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::advance_track_particles()
+{
+	// Release (once), record trajectories and advect the tracked-particle set;
+	// a no-op until ptcl_track_opt.begin.
+	const double t_start = omp_get_wtime();
+
+	if (current_time >= ptcl_track_opt.begin) {
+		// one-time uniform seeding of ptcl_track_opt.n tracked particles
+		if (!ptcl_track_opt.is_released) {
+			const long int seed = 2048;
+			ptcl_track.add_uniform(ptcl_track_opt.n, seed, U, V, W, grid);
+			ptcl_track_opt.is_released = true;
+		}
+
+		// record current positions before moving the particles
+		if (!traj.update(ptcl_track, current_time, grid)) {
+			if (grid.mpi_com.rank == 0)
+				printf(" >> FAILURE! >> ** trajectory update **\n");
+			return false;
+		}
+
+		ptcl_track.update(U, V, W,
+			model_const::domain::period_x,
+			model_const::domain::period_y,
+			model_const::domain::period_z, grid, dt);
+	}
+
+	const double t_end = omp_get_wtime();
+	cpu_time.particles_tracking += t_end - t_start;
+	cpu_time.run += t_end - t_start;
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+#endif
+
+// ------------------------------- //
+// Time Advancement processing     //
+// ------------------------------- //
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::advance_time()
+{
+	// Post-step processing: advance model time, gather statistics, print
+	// periodic (every nscreen steps, master rank) diagnostics and push the
+	// on-line time series. Always returns true.
+	T cnorm_div, lnorm_div;			// -C, -L2 divergence norms
+	T u_max, v_max, w_max;			// max velocity components
+	T Urms_max, Vrms_max, Wrms_max;	// max velocity RMS values
+#ifdef STRATIFICATION
+	T Trms_max;						// max temperature RMS value
+#endif
+										// dynamic (friction) velocity //
+	T u_dynamic = (T)0;				 // average at z=0 & z=H //
+	T dz_visc_min = (T)0;			// spacing in viscous wall units
+
+	double begin_mark = omp_get_wtime();
+
+	// advance time  //
+	current_time += dt; time_index++;
+	// --------------------------- //
+
+	cnorm_div = mpi_cnorm(Divergence, grid.size, grid.mpi_com.comm);
+	lnorm_div = mpi_lnorm(Divergence, grid.size, grid.mpi_com.comm);
+
+
+	double stats_begin_mark = omp_get_wtime();
+
+	gather_nse_eq_statistics(&Urms_max, &Vrms_max, &Wrms_max);
+#ifdef STRATIFICATION
+	gather_heat_eq_statistics(&Trms_max);
+#endif
+	// statistics time index increment //
+	if (current_time >= stats.begin) stats.time_index++;
+
+	double stats_end_mark = omp_get_wtime();
+	cpu_time.stats += stats_end_mark - stats_begin_mark;
+
+
+#ifdef PRESSURE_MEAN_CTRL
+	T P_mean = (T)0;	// initialized defensively; set below on screen-output steps
+#endif
+
+	// additional calculations for debug output //
+	if (time_index % nscreen == 0) {
+		// max of velocity components //
+		velocity_abs_max(&u_max, &v_max, &w_max,
+			U, V, W, grid);
+
+		if (current_time >= stats.begin) {
+
+			// friction [dynamic] velocity //
+			u_dynamic = dynamic_velocity();
+			// Spacing in viscous units //
+			dz_visc_min = grid.dz_min * (u_dynamic / fluid.kinematic_viscosity);
+		}
+
+#ifdef PRESSURE_MEAN_CTRL
+		// estimating mean pressure //
+		P_mean = (T)grid.template average<double>(Pressure);
+#endif
+	}
+
+	if ((grid.mpi_com.rank == 0) && (time_index % nscreen == 0)) {
+		printf(" >> poisson eq.: ok! norm = %.8f, iters = %i\n", poisson_norm, poisson_status);
+		printf(" >> divergence [c-norm]: %.8f\n", cnorm_div);
+		printf(" >> divergence [l-norm]: %.8f\n", lnorm_div);
+
+#ifdef PRESSURE_MEAN_CTRL
+		printf(" >> P(mean) = %.7f\n", P_mean); // mean pressure //
+#endif
+		printf(" >> U(max) = %.4f, V(max) = %.4f, W(max) = %.4f\n", u_max, v_max, w_max);
+		printf(" >> U(RMS) = %.4f, V(RMS) = %.4f, W(RMS) = %.4f\n",
+			Urms_max, Vrms_max, Wrms_max);
+
+		if (current_time >= stats.begin) {
+			// some additional output in case we're gathering statistics ...
+			printf(" >> U* = %.4f, z+ = %.4f\n", u_dynamic, (T) 0.5 * dz_visc_min);
+		}
+
+		// crude ETA: linear extrapolation of elapsed run time to end_time
+		int est_sec = (int)(cpu_time.run * ((double)
+			((end_time - current_time) / (current_time - begin_time))));
+
+		int est_min = est_sec / 60; est_sec %= 60;
+		int est_hrs = est_min / 60; est_min %= 60;
+
+		// zero-pad minutes/seconds so e.g. 1:05:03 is not printed as 1:5:3
+		printf("\t >> time: %.7f [ETA: %i:%02i:%02i] [IC: %.4f s]\n", current_time,
+			est_hrs, est_min, est_sec,
+			cpu_time.run / time_index);
+	}
+
+	// on-line time series of the main diagnostics
+	nse_series.push(0, cnorm_div);
+	nse_series.push(1, lnorm_div);
+	nse_series.push(2, Urms_max);
+	nse_series.push(3, Vrms_max);
+	nse_series.push(4, Wrms_max);
+#ifdef STRATIFICATION
+	nse_series.push(5, Trms_max);
+#endif
+	nse_series.push_time(current_time);
+
+
+	user_post_processing();	// user-defined post processing
+
+
+	double end_mark = omp_get_wtime();
+	cpu_time.run += end_mark - begin_mark;
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::complete(const bool status)
+{
+	// Finalize the model run: flush the time series, record final particle
+	// positions (when tracking), write final output and report total
+	// wall-clock time on the master rank.
+	if (grid.mpi_com.rank == 0)
+		nse_series.write(output.NSE_SEQ_FILE);
+
+#ifdef INCLUDE_PARTICLES_TRACKING
+	// adding final particles positions ...
+	if (current_time >= ptcl_track_opt.begin) {
+		traj.update(ptcl_track, current_time, grid);
+	}
+#endif
+
+	write_final_output();
+
+	// status report (fixed: terminate "OK" with a newline and drop the stray
+	// leading space, matching the other print_info() messages)
+	if (status)
+		print_info(output.DATA_FILE, "OK\n");
+	else
+		print_info(output.DATA_FILE, "FAILURE!: ** advance eq. [nse] **\n");
+
+	double model_run_time = omp_get_wtime() - cpu_time.begin_mark;
+	if (grid.mpi_com.rank == 0) {
+		printf("\n >> Completed model run!\n");
+
+		int run_sec = (int)model_run_time;
+		int run_min = run_sec / 60; run_sec %= 60;
+		int run_hrs = run_min / 60; run_min %= 60;
+
+		// zero-pad minutes/seconds so e.g. 1:05:03 is not printed as 1:5:3
+		printf(" >> Total time:  %i:%02i:%02i\n\n", run_hrs, run_min, run_sec);
+	}
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/model-init.hpp b/model-init.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b5945bfa6fcea8e0c16da71318eae1810213a2a6
--- /dev/null
+++ b/model-init.hpp
@@ -0,0 +1,304 @@
+#include "model-obj.h"
+#include "model-bc.h"
+
+#include "nse-generic3d.h"
+#include "nse-io3d.h"
+
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::init_flow()
+{
+	// Initialize all primary model fields (velocity, pressure and -- when the
+	// corresponding features are compiled in -- temperature and particle data);
+	// on the first failure log it, release resources and abort.
+	const bool is_master = (grid.mpi_com.rank == 0);
+
+	if (!init_velocity(U, V, W, grid)) {
+		print_info(output.DATA_FILE,
+			"FAILURE!: ** velocity init **\n");
+		clear();
+		return false;
+	}
+	if (is_master)
+		printf("\t >> Velocity field: OK!\n");
+
+	if (!init_pressure(Pressure, grid)) {
+		print_info(output.DATA_FILE,
+			"FAILURE!: ** pressure init **\n");
+		clear();
+		return false;
+	}
+	if (is_master)
+		printf("\t >> Pressure field: OK!\n");
+
+#ifdef STRATIFICATION
+	if (!init_temperature(Tx, Tsh, Tline, grid)) {
+		print_info(output.DATA_FILE,
+			"FAILURE!: ** temperature init **\n");
+
+		clear();
+		return false;
+	}
+	if (is_master)
+		printf("\t >> Temperature field: OK!\n");
+#endif
+
+#ifdef INCLUDE_PARTICLES
+	if (!init_particles(grid)) {
+		print_info(output.DATA_FILE,
+			"FAILURE!: ** particles init **\n");
+
+		clear();
+		return false;
+	}
+	if (is_master)
+		printf("\t >> Particles vector: OK!\n");
+#endif
+
+#ifdef INCLUDE_PARTICLES_TRACKING
+	if (!init_particles_tracking(grid)) {
+		print_info(output.DATA_FILE,
+			"FAILURE!: ** particles tracking init **\n");
+
+		clear();
+		return false;
+	}
+	if (is_master)
+		printf("\t >> Particles tracking: OK!\n");
+#endif
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::init_velocity(T* U, T* V, T* W,
+	const wstGrid3d< T >& grid)
+{
+	// Initialize the velocity field:
+	//  - restart: read the velocity dump (hard failure if it cannot be read);
+	//  - startup file: read the init file (hard failure if it cannot be read);
+	//  - otherwise: linear U(z) profile spanning [-Umax/2, +Umax/2] plus
+	//    small random disturbances on all three components.
+	null(U, grid.size);
+	null(V, grid.size);
+	null(W, grid.size);
+
+	if (!dump.restart) {
+		if (startup.load_files) {
+			if (!read_binary_3d(startup.VELOCITY_FILE, U, V, W, grid)) {
+				if (grid.mpi_com.rank == 0)
+					// fixed quoting: adjacent empty "" literals collapsed to nothing
+					printf("\n >> FAILURE! >> ** reading velocity init file: \"%s\" **\n\n",
+						startup.VELOCITY_FILE.c_str());
+				return false;
+			}
+		}
+		else
+		{
+			int i, j, k, shidx, idx;
+			// assumes mpi_height is the full domain extent in -z -- TODO confirm
+			const T Uvalue = (T) 1.0 / grid.mpi_height;
+
+			// linear profile: U = Umax * (pz / H - 1/2) over the interior cells
+			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+			{
+				shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+				for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+				{
+					idx = shidx;
+					for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+						U[idx] = fluid.Umax * (-(T) 0.5 + grid.pz[k] * Uvalue);
+					}
+				}
+			}
+
+			// rank-dependent seeds so disturbances differ across MPI processes
+			int base_seed = (grid.mpi_com.rank + 1) * model_const::fluid::disturbance_seed;
+			add_disturbance(U, fluid.disturbance_amp * fluid.Umax, base_seed, grid);
+			add_disturbance(V, fluid.disturbance_amp * fluid.Umax,
+				base_seed + model_const::fluid::disturbance_seed, grid);
+			add_disturbance(W, fluid.disturbance_amp * fluid.Umax,
+				base_seed + 2 * model_const::fluid::disturbance_seed, grid);
+		}
+	}
+	else
+	{
+		if (!read_binary_3d(dump.VELOCITY_FILE, dump.restart_index, U, V, W, grid))
+		{
+			if (grid.mpi_com.rank == 0)
+				printf("\n >> FAILURE! >> ** reading velocity dump file: \"%s\" **\n\n",
+					dump.VELOCITY_FILE.c_str());
+			return false;
+		}
+	}
+
+	velocity_bc(U, V, W, fluid.Umax, grid);
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::init_pressure(T* Pressure,
+	const wstGrid3d< T >& grid)
+{
+	// Initialize the pressure field from a dump (restart), a startup file,
+	// or zero. Optional mean removal keeps the field (defined up to a
+	// constant) centered at zero.
+	null(Pressure, grid.size);
+
+	if (!dump.restart) {
+		if (startup.load_files) {
+			if (!read_binary_3d(startup.PRESSURE_FILE, Pressure, grid))
+			{
+				if (grid.mpi_com.rank == 0)
+					// fixed quoting: adjacent empty "" literals collapsed to nothing
+					printf("\n >> FAILURE! >> ** reading pressure init file: \"%s\" **\n\n",
+						startup.PRESSURE_FILE.c_str());
+				return false;
+			}
+
+#ifdef PRESSURE_MEAN_CTRL
+			// iterate mean removal until the residual mean is below tolerance
+			T mean_value;
+			do {
+				mean_value = (T)grid.template average<double>(Pressure);
+				update(Pressure, -mean_value, grid.size);
+			} while (fabs(mean_value) > model_const::pressure_mean_eps);
+#endif
+		}
+	}
+	else
+	{
+		if (!read_binary_3d(dump.PRESSURE_FILE, dump.restart_index, Pressure, grid))
+		{
+			if (grid.mpi_com.rank == 0)
+				printf("\n >> FAILURE! >> ** reading pressure dump file: \"%s\" **\n\n",
+					dump.PRESSURE_FILE.c_str());
+			return false;
+		}
+
+#ifdef PRESSURE_MEAN_CTRL_IN_DUMP
+		T mean_value;
+		do {
+			mean_value = (T)grid.template average<double>(Pressure);
+			update(Pressure, -mean_value, grid.size);
+		} while (fabs(mean_value) > model_const::pressure_mean_eps);
+#endif
+	}
+
+	pressure_bc(Pressure, pois_opt, grid);
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+
+#ifdef STRATIFICATION
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::init_temperature(T* Tx,
+	T* Tsh, T* Tline, const wstGrid3d< T >& grid)
+{
+	// Initialize temperature: deviation field Tsh (from a dump, startup file
+	// or zero), the background linear -z profile Tline, and the full field
+	// Tx = Tsh + Tline (via restore_temperature).
+	null(Tx, grid.size);
+	null(Tsh, grid.size);
+	null(Tline, grid.nz);
+
+	if (!dump.restart) {
+		if (startup.load_files) {
+			if (!read_binary_3d(startup.TEMPERATURE_FILE, Tsh, grid))
+			{
+				if (grid.mpi_com.rank == 0)
+					// fixed quoting: adjacent empty "" literals collapsed to nothing
+					printf("\n >> FAILURE! >> ** reading temperature init file: \"%s\" **\n\n",
+						startup.TEMPERATURE_FILE.c_str());
+				return false;
+			}
+		}
+	}
+	else
+	{
+		if (!read_binary_3d(dump.TEMPERATURE_FILE, dump.restart_index, Tsh, grid))
+		{
+			if (grid.mpi_com.rank == 0)
+				printf("\n >> FAILURE! >> ** reading temperature dump file: \"%s\" **\n\n",
+					dump.TEMPERATURE_FILE.c_str());
+			return false;
+		}
+	}
+	temperature_bc(Tsh, grid);
+
+	// background linear profile between T0 and TH across the domain height
+	const T Tvalue = ((fluid.TH - fluid.T0) / grid.mpi_height);
+	for (int k = grid.gcz; k < grid.nz - grid.gcz; k++)
+		Tline[k] = fluid.T0 + grid.pz[k] * Tvalue;
+
+	c_dirichlet_bc_z(Tline, fluid.T0, fluid.TH, grid);
+
+	restore_temperature(Tx, Tsh, Tline, grid);
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+void nse::modelObj<T, mem>::restore_temperature(T* Tx,
+	const T* Tsh, const T* Tline, const wstGrid3d< T >& grid)
+{
+	// Rebuild the full temperature field: Tx = Tsh (deviation) + Tline
+	// (background -z profile), over the interior extended by the scheme halo.
+#ifdef SCHEME_X4
+	const int hx = 3, hy = 3, hz = 1;
+#else
+	const int hx = 1, hy = 1, hz = 1;
+#endif
+
+	const int ib = grid.gcx - hx, ie = grid.nx - grid.gcx + hx;
+	const int jb = grid.gcy - hy, je = grid.ny - grid.gcy + hy;
+	const int kb = grid.gcz - hz, ke = grid.nz - grid.gcz + hz;
+
+	int i, j, k, row, idx;
+#pragma omp parallel for private(i, j, k, row, idx) shared( Tx )
+	for (i = ib; i < ie; i++) {
+		row = i * grid.nyz + jb * grid.nz + kb;
+		for (j = jb; j < je; j++, row += grid.nz) {
+			idx = row;
+			for (k = kb; k < ke; k++, idx++)
+				Tx[idx] = Tsh[idx] + Tline[k];
+		}
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+#endif
+
+
+#ifdef INCLUDE_PARTICLES
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::init_particles(const wstGrid3d< T >& grid)
+{
+	// On restart, reload the particle vector from its dump; initial release
+	// during the run is handled by advance_particles().
+#ifndef SKIP_PTCL_DUMP_INIT
+	if (dump.restart) {
+		if (!ptcl.read_binary(dump.PTCL_FILE, dump.restart_index, grid))
+		{
+			if (grid.mpi_com.rank == 0)
+				// fixed quoting: adjacent empty "" literals collapsed to nothing
+				printf("\n >> FAILURE! >> ** reading particles dump file: \"%s\" **\n\n",
+					dump.PTCL_FILE.c_str());
+			return false;
+		}
+	}
+#endif
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+#endif
+
+#ifdef INCLUDE_PARTICLES_TRACKING
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::init_particles_tracking(const wstGrid3d< T >& grid)
+{
+	// On restart, reload the tracked-particle vector and its trajectory
+	// accumulator from their dumps.
+#ifndef SKIP_PTCL_TRACKING_DUMP_INIT
+	if (dump.restart) {
+		if (!ptcl_track.read_binary(dump.PTCL_TRACK_FILE, dump.restart_index, grid))
+		{
+			if (grid.mpi_com.rank == 0)
+				// fixed quoting + message: this reads the *tracking* dump, not the
+				// plain particles dump
+				printf("\n >> FAILURE! >> ** reading particles tracking dump file: \"%s\" **\n\n",
+					dump.PTCL_TRACK_FILE.c_str());
+			return false;
+		}
+
+		if (!traj.read_binary(dump.PTCL_TRAJ_FILE, dump.restart_index, grid))
+		{
+			if (grid.mpi_com.rank == 0)
+				printf("\n >> FAILURE! >> ** reading particles trajectory dump file: \"%s\" **\n\n",
+					dump.PTCL_TRAJ_FILE.c_str());
+			return false;
+		}
+	}
+#endif
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+#endif
diff --git a/model-obj.cpp b/model-obj.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..27619696e40eed4b6185f78a1c21b2852e80470c
--- /dev/null
+++ b/model-obj.cpp
@@ -0,0 +1,12 @@
+#define _CRT_SECURE_NO_WARNINGS
+// single translation unit gathering all modelObj member-template implementations
+#include "model-setup.hpp"
+#include "model-init.hpp"
+#include "model-eq.hpp"
+#include "model-out.hpp"
+#include "model-stats.hpp"
+#include "model-user.hpp"
+
+// explicit instantiations for the configured precision (Real) and memory targets
+template struct nse::modelObj<Real, nse::memCPU>;
+#ifndef EXCLUDE_GPU_BRANCH
+template struct nse::modelObj<Real, nse::memGPU>;
+#endif
diff --git a/model-obj.h b/model-obj.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5d5bb1411a7c1a1487705d0f72c1ee8ae986f50
--- /dev/null
+++ b/model-obj.h
@@ -0,0 +1,475 @@
+#pragma once
+
+
+// [model-obj.h]: main model object
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-defines.h"
+
+#include "wstgrid3d.h"
+
+#include "pois-setup3d.h"
+#include "mg-data3d.h"
+
+#include "time-series.h"
+
+#include "config-parser.h"
+
+#include "nse-output.h"
+#include "nse-output-pf.h"
+#include "nse-dump.h"
+#include "nse-startup.h"
+#ifdef INCLUDE_VISUALRTL
+#include "nse-visual.h"
+#endif
+
+#include "stats-data.h"
+#include "nse-turb-vec.h"
+#ifdef FOURIER_SPECTRUM
+#include "nse-fourier-vec.h"
+#endif
+
+#ifdef INCLUDE_PARTICLES
+#include "ptcl-vec3d.h"
+#endif
+
+#ifdef INCLUDE_PARTICLES_TRACKING
+#include "ptcl-track-vec3d.h"
+#include "traj-accum3d.h"
+#endif
+// -------------------------------------------------------------------------------------------- //
+
+using namespace nse::nse_const3d;
+
+namespace nse
+{
+	// modelObj: top-level container for one model run — grid & time stepping,
+	// fluid parameters, Poisson solver data, I/O controllers, statistics and
+	// the main solver field arrays.
+	// [T]: floating-point type; [mem]: memory residency tag (CPU/GPU)
+	template< typename T, memType mem >
+	struct modelObj
+	{
+		// FUNCTIONS [public]:
+		// ------------------------------------------------------------------------------------------------ //
+	public:
+		// model setup
+		// ------------------------------------------------------------------------------------------------ //
+		modelObj();
+		~modelObj();
+
+		static bool init_parallel(int argc, char** argv, const int mpi_mode);
+		static void finalize_parallel();
+
+		bool setup(int argc, char** argv, const MPI_Comm comm);
+		bool setup(int argc, char** argv,
+			const char* config_filename, const MPI_Comm comm);
+
+		//
+		// cleanup MPI-data: call before MPI_Finalize()
+		void mpi_cleanup();
+		//
+		// destructor on request
+		void clear();
+		// ------------------------------------------------------------------------------------------------ //
+
+		// initial conditions
+		// ------------------------------------------------------------------------------------------------ //
+		bool init_flow();
+		// ------------------------------------------------------------------------------------------------ //
+
+		// equations handlers
+		// ------------------------------------------------------------------------------------------------ //
+		bool init_eq();		// initialize all equations
+
+		bool init_nse_eq();
+		bool advance_nse_eq();		// integration of nse eq.
+
+#ifdef STRATIFICATION
+		bool init_heat_eq();
+		bool advance_heat_eq();		// integration of heat eq.
+#endif
+
+#ifdef INCLUDE_PARTICLES
+		bool advance_particles();		// integration of particles eq.
+#endif
+#ifdef INCLUDE_PARTICLES_TRACKING
+		bool advance_track_particles();	// integration of track particles eq.
+#endif
+
+		bool advance_time();		// time advancement and post processing
+
+		bool complete(const bool status);	// model run completion and final output
+
+		bool is_active() const { return (current_time < end_time); }
+		// ------------------------------------------------------------------------------------------------ //
+
+		// statistics
+		// ------------------------------------------------------------------------------------------------ //
+		T dynamic_velocity();
+		// ------------------------------------------------------------------------------------------------ //
+
+		// output & dump
+		// ------------------------------------------------------------------------------------------------ //
+		bool advance_output();		// advance output writes [calling each time step]
+		bool advance_dump();		// advance dump writes [calling each time step]
+
+#ifdef INCLUDE_VISUALRTL
+		bool advance_visualization();	// advence visualization writes [calling each time step]
+#endif
+
+		bool print_dump_to_output();		// printing dump to output
+		void rewrite_restart_dump();		// rewriting model restart dump [index]->[-index]
+
+	protected:
+		// FUNCTIONS [protected]:
+		// ------------------------------------------------------------------------------------------------ //
+
+		// model setup
+		// ------------------------------------------------------------------------------------------------ //
+		bool process_args(int argc, char** argv);
+
+		bool set_working_paths(const MPI_Comm comm);
+		bool set_fluid(const MPI_Comm comm);
+		bool set_grid(const MPI_Comm comm);
+		bool set_time();
+		bool set_io_parameters();
+
+		bool allocate_fields();
+		bool allocate_base_fields();
+		bool allocate_ext_fields();
+
+		bool set_poisson_solver();
+
+		bool set_time_series();
+		bool set_statistics_block();
+
+#ifdef INCLUDE_PARTICLES
+		bool set_particles();
+#endif
+#ifdef INCLUDE_PARTICLES_TRACKING
+		bool set_track_particles();
+#endif
+
+#ifdef INCLUDE_VISUALRTL
+		bool set_visualization();
+#endif
+		// ------------------------------------------------------------------------------------------------ //
+
+		// initial conditions
+		// ------------------------------------------------------------------------------------------------ //
+		bool init_velocity(T* U, T* V, T* W, const wstGrid3d< T >& grid);
+		bool init_pressure(T* Pressure, const wstGrid3d< T >& grid);
+#ifdef STRATIFICATION
+		bool init_temperature(T* Tx,
+			T* Tsh, T* Tline, const wstGrid3d< T >& grid);
+		void restore_temperature(T* Tx,
+			const T* Tsh, const T* Tline, const wstGrid3d< T >& grid);
+#endif
+
+#ifdef INCLUDE_PARTICLES
+		bool init_particles(const wstGrid3d< T >& grid);
+#endif
+#ifdef INCLUDE_PARTICLES_TRACKING
+		bool init_particles_tracking(const wstGrid3d< T >& grid);
+#endif
+		// ------------------------------------------------------------------------------------------------ //
+
+		// statistics
+		// ------------------------------------------------------------------------------------------------ //
+		void gather_nse_eq_statistics(
+			T *U_rms_max, T *V_rms_max, T *W_rms_max);
+#ifdef STRATIFICATION
+		void gather_heat_eq_statistics(T *T_rms_max);
+#endif
+		void calculate_ext_statistics(nseTurbVec< T >& avg);
+		// ------------------------------------------------------------------------------------------------ //
+
+
+		// output & dump
+		// ------------------------------------------------------------------------------------------------ //
+		bool write_output(const int index);
+		bool write_final_output();
+
+		bool write_rapid_output(const int index);
+
+		void write_statistics_output(
+			const int index, nseTurbVec< T >& avg);
+
+		bool write_dump(const int index);
+
+		bool print_info(const std::string& filename, const char* msg_status);
+		bool print_info(const std::string& filename,
+			const int idx, const char* msg_status);
+		// ------------------------------------------------------------------------------------------------ //
+
+		// user-defined post processing
+		// ------------------------------------------------------------------------------------------------ //
+		void user_post_processing();
+		void user_output();
+		// ------------------------------------------------------------------------------------------------ //
+
+
+		// DATA [assuming private]:
+		// ------------------------------------------------------------------------------------------------ //
+	public:
+		// model config
+		// ------------------------------------------------------------------------------------------------ //
+		ConfigParser config;
+		bool use_config;
+		// ------------------------------------------------------------------------------------------------ //
+
+		// grid & time
+		// ------------------------------------------------------------------------------------------------ //
+		wstGrid3d< T > grid;
+
+		T begin_time, end_time;
+		T dt, CFL;
+
+		T current_time;
+		int time_index;
+		// ------------------------------------------------------------------------------------------------ //
+
+		// fluid parameters
+		// ------------------------------------------------------------------------------------------------ //
+		struct {
+			// [nse]
+			T Umax;
+
+			T disturbance_amp;
+
+			T density, viscosity, kinematic_viscosity;
+
+#ifdef STRATIFICATION
+			// [boussinesq]
+			T T0, TH;
+
+			T Prandtl;
+			T Richardson;
+			T Richardson_init;				// initial field Ri number //
+			T diffusivity;
+
+			T T_gravity_init, T_gravity_period;
+#endif
+		} fluid;
+		// ------------------------------------------------------------------------------------------------ //
+
+		// poisson solver
+		// ------------------------------------------------------------------------------------------------ //
+		poisOpt3d< T > pois_opt;
+		mg_mpi_poisson3d_data< T > mg_data;
+		struct {	// multigrid setup parameters
+			int down_iters, up_iters, direct_iters;
+			T smooth_up_omega, smooth_up_omega_fine;
+		} mg_opt;
+
+		T poisson_norm;
+		int poisson_status;
+		// ------------------------------------------------------------------------------------------------ //
+
+		// model main IO
+		// ------------------------------------------------------------------------------------------------ //
+		nseOutput< T > output;
+		nseOutputPf< T > pf_output;	// profile rapid output
+		nseDump< T > dump;
+		nseStartup< T > startup;
+#ifdef INCLUDE_VISUALRTL
+		nseVisual< T > visual;
+#endif
+
+		// on-screen print interval (presumably in time steps) — TODO confirm against setup code
+		int nscreen;
+		// ------------------------------------------------------------------------------------------------ //
+
+		// print dump mode
+		// ------------------------------------------------------------------------------------------------ //
+		struct printDumpMode {
+			bool active;			// activation flag, default: [false]
+			bool stats_range;		// use stats range flag, default: [false]
+			T stats_begin, stats_end;	// statistics range
+			int index;						// dump file additional index
+
+			printDumpMode() : active(false), stats_range(false) {}
+		} print_dump;	// print dump to output mode
+
+		bool is_print_dump_mode() const { return print_dump.active; }
+		// ------------------------------------------------------------------------------------------------ //
+
+		// time serieses
+		// ------------------------------------------------------------------------------------------------ //
+		timeSeries nse_series;
+		// in-memory buffer limit before the series is flushed to disk [records]
+		static const int c_seq_max_length = 100 * 1024;
+		// ------------------------------------------------------------------------------------------------ //
+
+		// statistics data
+		// ------------------------------------------------------------------------------------------------ //
+		statsData stats;
+
+		nseAvgVec<avgPrec> space_avg;		// spatial averages [in -> time averaging]
+		nseTurbVec<T> avg;				// spatial and time averages [averaging -> out]
+#ifdef FOURIER_SPECTRUM
+		nseFourierVec<T> spectrum;
+#endif
+		// ------------------------------------------------------------------------------------------------ //
+
+#ifdef INCLUDE_PARTICLES
+		// particles data
+		// ------------------------------------------------------------------------------------------------ //
+		ptclVec3d<T> ptcl;
+
+		struct {
+			int n;				// number of particles
+			T begin;			// begin time
+
+			bool is_released;	// release status
+
+		} ptcl_opt;
+		// ------------------------------------------------------------------------------------------------ //
+#endif
+
+#ifdef INCLUDE_PARTICLES_TRACKING
+		// particles tracking data
+		// ------------------------------------------------------------------------------------------------ //
+		ptclTrackVec3d< T > ptcl_track;
+		trajAccum3d< T > traj;
+
+		struct {
+			// particles release parameters
+			int n;					// number of particles 
+			T begin;				// begin time
+
+			bool is_released;		// release status
+
+			int group_max_size;		// maximum trajectory group size 		
+			int max_memory;			// maximum memory in bytes per process for handling trajectories
+
+		} ptcl_track_opt;
+		// ------------------------------------------------------------------------------------------------ //
+#endif
+
+		// main nse fields
+		// ------------------------------------------------------------------------------------------------ //
+		T *U, *V, *W, *Pressure;
+		T *Uim_p, *Vim_p, *Wim_p, *Uim, *Vim, *Wim;
+#if ( AB_TYPE == 3 )
+		T *Uim_pp, *Vim_pp, *Wim_pp;
+#endif
+		T *Phi, *Rhs, *Divergence;
+		T *memory[6];				// poisson solver additional memory
+
+#ifdef STRATIFICATION
+		T *Tsh, *Tx, *Tline;
+		T *Tim, *Tim_p;
+#if ( AB_TYPE == 3 )
+		T *Tim_pp;
+#endif
+#endif
+		// extended nse fields declaration
+		// ------------------------------------------------------------------------------------------------ //
+		T *U2_u, *V2_v, *W2_w,					// nodes: [U], [V], [W]
+			*U2_uw, *V2_vw, *W2_c;					// nodes: [UW], [VW], [C]
+		T *W2_u, *W2_v,							// nodes: [U, V]
+			*W2_uw, *W2_vw;							// nodes: [UW, VW]
+
+		T *UV, *UW, *VW,							// nodes: [UV], [UW], [VW]
+			*UV_uvw, *UW_uvw, *VW_uvw;				// nodes: [UVW], [UVW], [UVW]
+		T *PU, *PV, *PW;							// nodes: [U], [V], [W]
+
+		T *UW_bottom, *UW_top,					// nodes: [U (UW -- U)], [U (UW -- U)]
+			*VW_bottom, *VW_top,					// nodes: [V (VW -- V)], [V (VW -- V)]
+			*UW_bottom_uv, *UW_top_uv,				// nodes: [UV (UVW -- UV)], [UV (UVW -- UV)]
+			*VW_bottom_uv, *VW_top_uv,				// nodes: [UV (UVW -- UV)], [UV (UVW -- UV)]
+			*UW_bottom_uw, *UW_top_uw,				// nodes: [UW (U -- UW), [UW (U -- UW)]
+			*VW_bottom_vw, *VW_top_vw;				// nodes: [VW (V -- VW), [VW (V -- VW)]
+		T *UW_adv, *VW_adv;						// nodes: [UW], [VW]
+
+		T *U2W, *V2W, *W2W;						// nodes: [UW], [VW], [C]
+		T *UVW, *UWW, *VWW;						// nodes: [UVW], [U], [V]
+
+		T *U_diff, *V_diff, *W_diff;				// nodes: [U, V, W]
+		T *U_diss, *V_diss, *W_diss;				// nodes: [U], [V], [W]
+		T *UV_diss, *UW_diss, *VW_diss;			// nodes: [UV], [UW], [VW]
+
+		T *U_iso_diss, *V_iso_diss, *W_iso_diss;		// nodes: [U], [V], [W]
+		T *UV_iso_diss, *UW_iso_diss, *VW_iso_diss,	// nodes: [UV], [UW], [VW]
+			*iso_diss_x, *iso_diss_y, *iso_diss_z;		// nodes:
+														//		UV = [V, U, UVW]
+														//		UW = [W, UVW, U]
+														//		VW = [UVW, W, V]
+
+		T *PSuu, *PSvv, *PSww;					// nodes: [C], [C], [C]
+		T *P2Suv, *P2Suw, *P2Svw;				// nodes: [UV,UW,VW]
+
+
+#ifdef STRATIFICATION
+		T *T2_c, *T2_w;							// nodes: [C], [W]
+
+		T *TU, *TV, *TW,							// nodes: [U], [V], [W]
+			*TU_uw, *TV_vw,							// nodes: [UW, VW]
+			*TW_uw, *TW_vw;							// nodes: [UW, VW]
+		T *TP;									// nodes: [C]
+
+		T *TW_bottom, *TW_top,					// nodes: [C (W -- C)], [C (W -- C)]
+			*TW_bottom_u, *TW_top_u,				// nodes: [U (UW -- U)], [U (UW -- U)]
+			*TW_bottom_v, *TW_top_v,				// nodes: [V (VW -- V)], [V (VW -- V)]
+			*TW_bottom_w, *TW_top_w;				// nodes: [W (C -- W)], [W (C -- W)]
+		T *TW_adv;								// nodes: [W]
+
+		T *T2W;									// nodes: [W]
+		T *TUW, *TVW, *TWW;						// nodes: [UW, VW, C]
+
+		T *T_diff;								// nodes: [C]
+		T *T_diss;								// nodes: [C]
+		T *TU_diss, *TV_diss, *TW_diss;			// nodes: [U, V, W]
+
+		T *T_iso_diss;							// nodes: [C]
+
+		T *T_dPdx, *T_dPdy, *T_dPdz;				// nodes: [U,V,W]
+#endif
+
+
+
+		struct cpuTime {	// CPU timers
+			double begin_mark;
+
+			double run;
+			double nse_eq;
+#ifdef STRATIFICATION
+			double heat_eq;
+#endif
+#ifdef INCLUDE_PARTICLES
+			double particles;
+#endif
+#ifdef INCLUDE_PARTICLES_TRACKING
+			double particles_tracking;
+#endif
+			double pois;
+			double stats;
+
+			cpuTime() : begin_mark((double)0),
+				run((double)0), nse_eq((double)0),
+#ifdef STRATIFICATION
+				heat_eq((double)0),
+#endif
+#ifdef INCLUDE_PARTICLES
+				particles((double)0),
+#endif
+#ifdef INCLUDE_PARTICLES_TRACKING
+				particles_tracking((double)0),
+#endif
+				pois((double)0), stats((double)0) {}
+		} cpu_time;
+		// ------------------------------------------------------------------------------------------------ //
+
+		// Allocation marks for destructor
+		// ------------------------------------------------------------------------------------------------ //
+		struct allocationStatus
+		{
+			bool grid, base_fields, ext_fields,
+				poisson, multigrid;
+
+			allocationStatus() :
+				grid(false),
+				base_fields(false), ext_fields(false),
+				poisson(false), multigrid(false) {}
+
+		} allocation_status;
+		// ------------------------------------------------------------------------------------------------ //
+	};
+}
diff --git a/model-out.hpp b/model-out.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..44d8c5deea314eb3db69396a89232b674255ae3a
--- /dev/null
+++ b/model-out.hpp
@@ -0,0 +1,837 @@
+#define _CRT_SECURE_NO_WARNINGS
+#include "model-obj.h"
+#include "model-const.h"
+#include "model-bc.h"
+
+#include "nse-io3d.h"
+
+#ifdef INCLUDE_VISUALRTL
+#include "gl-wstgrid3d.h"
+#if defined(INCLUDE_PARTICLES) || defined(INCLUDE_PARTICLES_TRACKING)
+#include "gl-ptcl-vec3d.h"
+#endif
+#include "gl-render.h"
+#endif
+
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+// Per-step output driver: flushes the time-series buffer when full, writes
+// regular output (plus statistics and user output) at [output.mark] intervals,
+// and rapid profile output at [pf_output.mark] intervals. Always returns true.
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::advance_output()
+{
+	// flush the in-memory time series to disk once the buffer limit is reached;
+	// only rank 0 writes, but all ranks reset the buffer
+	if (nse_series.length() >= c_seq_max_length)
+	{
+		if (grid.mpi_com.rank == 0)
+			nse_series.write(output.NSE_SEQ_FILE);
+		nse_series.reset();
+	}
+
+	if (current_time >= output.mark)
+	{
+		if (grid.mpi_com.rank == 0)
+			printf("\n >> Writing model output... ");
+
+		write_output(output.index);
+		if (current_time >= stats.begin) {	// statistics //
+
+			// statistics post-processing time is accounted separately
+			double begin_mark = omp_get_wtime();
+
+			stats.get_averages(avg);
+			calculate_ext_statistics(avg);
+
+			double end_mark = omp_get_wtime();
+			cpu_time.stats += end_mark - begin_mark;
+
+			write_statistics_output(output.index, avg);
+		}
+
+		user_output();
+
+		if (grid.mpi_com.rank == 0)
+			printf("OK!\n\n");
+
+		// advance to the next regular output mark
+		output.mark += output.dt;
+		output.index++;
+	}
+
+
+	// rapid profile output runs on its own schedule, limited by [pf_output.end]
+	if ((current_time >= pf_output.mark) && (pf_output.mark <= pf_output.end))
+	{
+		write_rapid_output(pf_output.index);
+
+		pf_output.mark += pf_output.dt;
+		pf_output.index++;
+	}
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+// Per-step dump driver: writes a restart dump at [dump.mark] intervals;
+// in edge mode it also forces a one-time "limit" dump when the wall-clock
+// run time approaches [dump.max_run_time]. Always returns true.
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::advance_dump()
+{
+	// write the limit-dump once 99% of the allowed run time has elapsed
+	const double model_run_time_eps = (double)0.99;
+
+	if (current_time >= dump.mark) {
+
+		if (grid.mpi_com.rank == 0)
+			printf("\n >> Writing model dump... ");
+
+		write_dump(dump.index);
+		if (current_time >= stats.begin) {	// statistics //
+			stats.write_dump(dump.index, current_time, grid);
+		}
+
+		if (grid.mpi_com.rank == 0)
+			printf("OK!\n\n");
+
+		dump.mark += dump.dt;
+		dump.index++;
+	}
+	else
+	{
+		if (dump.edge_mode) {
+			double model_run_time = omp_get_wtime() - cpu_time.begin_mark;
+
+			// all ranks must agree on writing the limit-dump (the dump writes
+			// below are collective), hence the MPI_MAX reduction of the flag
+			int check_edge = (model_run_time >= model_run_time_eps * dump.max_run_time);
+			mpi_allreduce(&check_edge, MPI_MAX);
+
+			if (check_edge) {
+
+				if (grid.mpi_com.rank == 0)
+					printf("\n >> Writing model limit-dump... ");
+
+				write_dump(dump.index);
+				if (current_time >= stats.begin) {	// statistics //
+					stats.write_dump(dump.index, current_time, grid);
+				}
+
+				if (grid.mpi_com.rank == 0)
+					printf("OK!\n\n");
+
+				// disarm edge mode so the limit-dump is written only once
+				dump.edge_mode = false;
+				dump.index++;
+			}
+		}
+	}
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+
+#ifdef INCLUDE_VISUALRTL
+// Renders a U-velocity color-map frame to PNG at [visual.mark] intervals
+// (up to [visual.end]); all render calls are collective over grid.mpi_com.comm.
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::advance_visualization()
+{
+	// fixed camera placement and orientation for the rendered scene
+	const T camera_x = -(T)9.0;
+	const T camera_y = -(T)6.0;
+	const T camera_z = (T)3.0;
+
+	const T focal_x = (T)0.0;
+	const T focal_y = (T)0.0;
+	const T focal_z = (T)1.0;
+
+	const T view_up_x = (T)0.0;
+	const T view_up_y = (T)0.0;
+	const T view_up_z = (T)1.0;
+
+	// fixed color-map range for the U-velocity field
+	const T Umin = (T)-0.55, Umax = (T)0.55;
+
+	//double begin_mark = omp_get_wtime();
+
+	if ((current_time >= visual.mark) &&
+		(visual.mark <= visual.end))
+	{
+		// setting render properties
+		nse_visual::render::set_magnification(visual.png_resolution, grid.mpi_com.comm);
+		nse_visual::render::set_background_color((T)1.0, (T)1.0, (T)1.0, grid.mpi_com.comm);
+
+		// setting camera
+		nse_visual::render::set_camera_position(camera_x, camera_y, camera_z, grid.mpi_com.comm);
+		nse_visual::render::set_camera_focal_point(focal_x, focal_y, focal_z, grid.mpi_com.comm);
+		nse_visual::render::set_camera_view_up(view_up_x, view_up_y, view_up_z, grid.mpi_com.comm);
+
+
+		// rendering U image ...
+		nse_visual::colorMap::set_color_table(visualRTL::RAINBOW_BLUE_TO_RED, 32, grid.mpi_com.comm);
+		nse_visual::colorMap::set_bounding_box(grid.mpi_com.comm);
+		nse_visual::colorMap::set_contour_value((T)0.0, grid.mpi_com.comm);
+
+		nse_visual::colorMap::set_range(Umin, Umax, grid.mpi_com.comm);
+		nse_visual::colorMap::add(U,
+			visual.xmin, visual.xmax,
+			visual.ymin, visual.ymax,
+			visual.zmin, visual.zmax, nodeU, grid);
+
+		nse_visual::render::write_png(visual.U_VELOCITY_IMG, visual.index, grid.mpi_com.comm);
+
+		// clear renderer tape
+		nse_visual::render::remove_all(grid.mpi_com.comm);
+
+
+		visual.mark += visual.dt;
+		visual.index++;
+	}
+
+	//double end_mark = omp_get_wtime();
+	//cpu_time.visual += end_mark - begin_mark;
+	//cpu_time.run += end_mark - begin_mark;
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+#endif
+
+// Writes indexed regular output: Tecplot 3D fields (if regular_plt3d_cntrl)
+// and binary 3D fields (if regular_bin3d_cntrl), plus particle/trajectory
+// output where enabled. Always returns true — binary write failures are
+// reported as rank-0 warnings only.
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::write_output(const int index)
+{
+	// used only by the binary branch below; Tecplot writers return no status here
+	bool status;
+
+	if (output.regular_plt3d_cntrl)
+	{
+		write_tecplot_3d(output.VELOCITY_FILE, index,
+			U, V, W, "U", "V", "W",
+			output.xmin, output.xmax,
+			output.ymin, output.ymax,
+			output.zmin, output.zmax,
+			grid, current_time);
+
+		write_tecplot_3d(output.PRESSURE_FILE, index,
+			Pressure, "Pressure",
+			output.xmin, output.xmax,
+			output.ymin, output.ymax,
+			output.zmin, output.zmax,
+			nodeC, grid, current_time);
+#ifdef STRATIFICATION
+		write_tecplot_3d(output.TEMPERATURE_FILE, index,
+			Tx, "Temperature",
+			output.xmin, output.xmax,
+			output.ymin, output.ymax,
+			output.zmin, output.zmax,
+			nodeC, grid, current_time);
+#endif
+
+#ifdef INCLUDE_PARTICLES
+		// particles are written only after their release time
+		if (current_time >= ptcl_opt.begin)
+		{
+			ptcl.write_tecplot(output.PTCL_FILE.c_str(), index,
+				grid, current_time);
+		}
+#endif
+
+#ifdef INCLUDE_PARTICLES_TRACKING
+		if (current_time >= ptcl_track_opt.begin)
+		{
+			traj.write_tecplot(output.PTCL_TRAJ_FILE.c_str(), index,
+				grid);
+		}
+#endif
+	}
+
+	if (output.regular_bin3d_cntrl)
+	{
+		status = write_binary_3d(output.VELOCITY_BIN_FILE, index,
+			U, V, W, "U", "V", "W", grid, current_time);
+		if ((!status) && (grid.mpi_com.rank == 0)) {
+			printf("\n >> WARNING! >> ** failed to write velocity binary field: ""%s""\n",
+				output.VELOCITY_BIN_FILE.c_str());
+		}
+
+		status = write_binary_3d(output.PRESSURE_BIN_FILE, index,
+			Pressure, "Pressure", grid, current_time);
+		if ((!status) && (grid.mpi_com.rank == 0)) {
+			printf("\n >> WARNING! >> ** failed to write pressure binary field: ""%s""\n",
+				output.PRESSURE_BIN_FILE.c_str());
+		}
+
+#ifdef STRATIFICATION
+		status = write_binary_3d(output.TEMPERATURE_BIN_FILE, index,
+			Tx, "Temperature", grid, current_time);
+		if ((!status) && (grid.mpi_com.rank == 0)) {
+			printf("\n >> WARNING! >> ** failed to write temperature binary field: ""%s""\n",
+				output.TEMPERATURE_BIN_FILE.c_str());
+		}
+#endif
+	}
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+// Writes the final (un-indexed) output: binary 3D fields unconditionally,
+// particle/trajectory binaries after release time, and Tecplot fields if
+// final_plt3d_cntrl is set. Always returns true — failures are rank-0 warnings.
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::write_final_output()
+{
+	bool status;
+
+	status = write_binary_3d(output.VELOCITY_BIN_FILE,
+		U, V, W, "U", "V", "W", grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write velocity binary field: ""%s""\n",
+			output.VELOCITY_BIN_FILE.c_str());
+	}
+
+	status = write_binary_3d(output.PRESSURE_BIN_FILE,
+		Pressure, "Pressure", grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write pressure binary field: ""%s""\n",
+			output.PRESSURE_BIN_FILE.c_str());
+	}
+
+#ifdef STRATIFICATION
+	status = write_binary_3d(output.TEMPERATURE_BIN_FILE,
+		Tx, "Temperature", grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write temperature binary field: ""%s""\n",
+			output.TEMPERATURE_BIN_FILE.c_str());
+	}
+#endif
+
+#ifdef INCLUDE_PARTICLES
+	// particle data exists only after the release time
+	if (current_time >= ptcl_opt.begin) {
+		status = ptcl.write_binary(output.PTCL_BIN_FILE,
+			grid, current_time);
+		if ((!status) && (grid.mpi_com.rank == 0)) {
+			printf("\n >> WARNING! >> ** failed to write particles binary vector: ""%s""\n",
+				output.PTCL_BIN_FILE.c_str());
+		}
+	}
+#endif
+
+#ifdef INCLUDE_PARTICLES_TRACKING
+	if (current_time >= ptcl_track_opt.begin) {
+		status = traj.write_binary(output.PTCL_TRAJ_BIN_FILE, grid);
+		if ((!status) && (grid.mpi_com.rank == 0)) {
+			printf("\n >> WARNING! >> ** failed to write particles trajectories binary: ""%s""\n",
+				output.PTCL_TRAJ_BIN_FILE.c_str());
+		}
+	}
+#endif
+
+	if (output.final_plt3d_cntrl)
+	{
+		write_tecplot_3d(output.VELOCITY_FILE,
+			U, V, W, "U", "V", "W",
+			output.xmin, output.xmax,
+			output.ymin, output.ymax,
+			output.zmin, output.zmax,
+			grid, current_time);
+
+		write_tecplot_3d(output.PRESSURE_FILE,
+			Pressure, "Pressure",
+			output.xmin, output.xmax,
+			output.ymin, output.ymax,
+			output.zmin, output.zmax,
+			nodeC, grid, current_time);
+#ifdef STRATIFICATION
+		write_tecplot_3d(output.TEMPERATURE_FILE,
+			Tx, "Temperature",
+			output.xmin, output.xmax,
+			output.ymin, output.ymax,
+			output.zmin, output.zmax,
+			nodeC, grid, current_time);
+#endif
+
+#ifdef INCLUDE_PARTICLES
+		if (current_time >= ptcl_opt.begin) {
+			ptcl.write_tecplot(output.PTCL_FILE.c_str(),
+				grid, current_time);
+		}
+#endif
+
+#ifdef INCLUDE_PARTICLES_TRACKING
+		if (current_time >= ptcl_track_opt.begin) {
+			traj.write_tecplot(output.PTCL_TRAJ_FILE.c_str(),
+				grid);
+		}
+#endif
+	}
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+// Writes rapid 1D profile output: U/V/W profiles along -y extracted at a fixed
+// relative -x position and several relative -z levels, written as Tecplot 1D files.
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::write_rapid_output(const int index)
+{
+	const T xcoord = (T)0.5;			// -x relative coordinate
+	const T zcoord[nseOutputPf<T>::npy] =	// -z relative coordinates
+	{ (T)0.1, (T)0.25, (T)0.5, (T)0.75, (T)0.9 };
+	const char* zcoord_name[nseOutputPf<T>::npy] =
+	{ "z=0.1H", "z=0.25H", "z=0.5H", "z=0.75H", "z=0.9H" };
+
+
+	if (pf_output.plt_cntrl)
+	{
+		for (int k = 0; k < nseOutputPf<T>::npy; k++) {
+			grid.u_profile_at_xz(pf_output.Upy[k], U,
+				xcoord * grid.mpi_length, zcoord[k] * grid.mpi_height);
+			grid.v_profile_at_xz(pf_output.Vpy[k], V,
+				xcoord * grid.mpi_length, zcoord[k] * grid.mpi_height);
+			grid.w_profile_at_xz(pf_output.Wpy[k], W,
+				xcoord * grid.mpi_length, zcoord[k] * grid.mpi_height);
+
+			// -y halo exchange on the V profile only (V lives on nodeV, see
+			// the write below) — presumably needed to complete the staggered
+			// profile before output; confirm against wstGrid3d
+			grid.mpi_com.exchange_halo_y(pf_output.Vpy[k],
+				1, grid.ny, 1, 0, grid.gcy, 0, 0, 1, 0,
+				model_const::domain::period_y);
+		}
+
+		write_tecplot_1d(pf_output.U_VELOCITY_FILE, index,
+			pf_output.Upy, zcoord_name, nseOutputPf<T>::npy, axisY, nodeC, grid, current_time);
+		write_tecplot_1d(pf_output.V_VELOCITY_FILE, index,
+			pf_output.Vpy, zcoord_name, nseOutputPf<T>::npy, axisY, nodeV, grid, current_time);
+		write_tecplot_1d(pf_output.W_VELOCITY_FILE, index,
+			pf_output.Wpy, zcoord_name, nseOutputPf<T>::npy, axisY, nodeC, grid, current_time);
+	}
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+// Writes a full restart dump with the given [index]: time/statistics indexes
+// and CPU timers (binary stamp), the time series, all prognostic fields and
+// their intermediate (Adams-Bashforth) states, and particle data where enabled.
+// Always returns true — individual write failures are rank-0 warnings only.
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::write_dump(const int index)
+{
+	bool status;
+
+	binStamp< int > index_stamp;
+	binStamp< double > cpu_stamp;
+
+	index_stamp.push(time_index);
+	index_stamp.push(stats.time_index);
+
+	// NOTE: the push order below defines the stamp record layout on disk —
+	// presumably it must match the restart reader exactly; confirm before reordering
+	cpu_stamp.push(cpu_time.run);
+	cpu_stamp.push(cpu_time.nse_eq);
+	cpu_stamp.push(cpu_time.pois);
+	cpu_stamp.push(cpu_time.stats);
+	cpu_stamp.push(grid.mpi_com.cpu_time_exch);
+#ifdef STRATIFICATION
+	cpu_stamp.push(cpu_time.heat_eq);
+#endif
+	cpu_stamp.push(grid.mpi_com.cpu_time_exch_x);
+	cpu_stamp.push(grid.mpi_com.cpu_time_exch_y);
+	cpu_stamp.push(grid.mpi_com.cpu_time_exch_z);
+
+#ifdef INCLUDE_PARTICLES
+	cpu_stamp.push(cpu_time.particles);
+#endif
+#ifdef INCLUDE_PARTICLES_TRACKING
+	cpu_stamp.push(cpu_time.particles_tracking);
+#endif
+
+
+	status = write_binary_stamp(dump.NSE_STAMP_FILE, index,
+		index_stamp, cpu_stamp, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write binary stamp dump[%i]: ""%s""\n",
+			index, dump.NSE_STAMP_FILE.c_str());
+	}
+
+	// rank 0 persists the time series and copies it into the dump set;
+	// the in-memory buffer is reset on all ranks
+	if (grid.mpi_com.rank == 0) {
+		nse_series.write(output.NSE_SEQ_FILE);
+		copy_file(output.NSE_SEQ_FILE, dump.NSE_SEQ_FILE, index);
+	}
+	nse_series.reset();
+
+	status = write_binary_3d(dump.VELOCITY_FILE, index,
+		U, V, W, "U", "V", "W", grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write velocity dump[%i]: ""%s""\n",
+			index, dump.VELOCITY_FILE.c_str());
+	}
+
+#ifndef RESTRICT_3D_DUMP
+	// - setting boundary conditions on (Uim_p, Vim_p, Wim_p):
+	//	  needed only for consistency in dump files,  NO  influence on computations
+	//                                               ----
+	intermediate_bc(Uim_p, Vim_p, Wim_p, grid);
+
+	status = write_binary_3d(dump.VELOCITY_IMP_FILE, index,
+		Uim_p, Vim_p, Wim_p, "Uim_p", "Vim_p", "Wim_p", grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write velocity*(n-1) dump[%i]: ""%s""\n",
+			index, dump.VELOCITY_IMP_FILE.c_str());
+	}
+#if (AB_TYPE==3)
+	// - setting boundary conditions on (Uim_pp, Vim_pp, Wim_pp):
+	//	  needed only for consistency in dump files,  NO  influence on computations
+	//                                               ----
+	intermediate_bc(Uim_pp, Vim_pp, Wim_pp, grid);
+
+	status = write_binary_3d(dump.VELOCITY_IMPP_FILE, index,
+		Uim_pp, Vim_pp, Wim_pp, "Uim_pp", "Vim_pp", "Wim_pp", grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write velocity*(n-2) dump[%i]: ""%s""\n",
+			index, dump.VELOCITY_IMPP_FILE.c_str());
+	}
+#endif
+#endif
+
+	status = write_binary_3d(dump.PRESSURE_FILE, index,
+		Pressure, "Pressure", grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write pressure dump[%i]: ""%s""\n",
+			index, dump.PRESSURE_FILE.c_str());
+	}
+
+#ifndef RESTRICT_3D_DUMP
+	// Phi is dumped only when the Poisson solver uses a non-null initial guess
+	if (pois_opt.init_mode != isInitNull) {
+		status = write_binary_3d(dump.PHI_PRESSURE_FILE, index,
+			Phi, "Phi-Pressure", grid, current_time);
+		if ((!status) && (grid.mpi_com.rank == 0)) {
+			printf("\n >> WARNING! >> ** failed to write phi-pressure dump[%i]: ""%s""\n",
+				index, dump.PHI_PRESSURE_FILE.c_str());
+		}
+	}
+#endif
+
+#ifdef STRATIFICATION
+	status = write_binary_3d(dump.TEMPERATURE_FILE, index,
+		Tsh, "Temperature", grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write temperature dump[%i]: ""%s""\n",
+			index, dump.TEMPERATURE_FILE.c_str());
+	}
+
+#ifndef RESTRICT_3D_DUMP
+	status = write_binary_3d(dump.TEMPERATURE_IMP_FILE, index,
+		Tim_p, "Tim_p", grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write temperature*(n-1) dump[%i]: ""%s""\n",
+			index, dump.TEMPERATURE_IMP_FILE.c_str());
+	}
+#if (AB_TYPE==3)
+	status = write_binary_3d(dump.TEMPERATURE_IMPP_FILE, index,
+		Tim_pp, "Tim_pp", grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write temperature*(n-2) dump[%i]: ""%s""\n",
+			index, dump.TEMPERATURE_IMPP_FILE.c_str());
+	}
+#endif
+#endif
+#endif
+
+#ifdef INCLUDE_PARTICLES
+	status = ptcl.write_binary(dump.PTCL_FILE.c_str(), index,
+		grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write particles dump[%i]: ""%s""\n",
+			index, dump.PTCL_FILE.c_str());
+	}
+#endif
+
+#ifdef INCLUDE_PARTICLES_TRACKING
+	status = ptcl_track.write_binary(dump.PTCL_TRACK_FILE.c_str(), index,
+		grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write particles track dump[%i]: ""%s""\n",
+			index, dump.PTCL_TRACK_FILE.c_str());
+	}
+
+	status = traj.write_binary(dump.PTCL_TRAJ_FILE.c_str(), index, grid);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write particles trajectories dump[%i]: ""%s""\n",
+			index, dump.PTCL_TRAJ_FILE.c_str());
+	}
+#endif
+
+
+	// record a dump marker in the model info file
+	print_info(dump.DATA_FILE, index, "OK - DUMP");
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::print_dump_to_output()
+{
+	if (grid.mpi_com.rank == 0) {
+		printf("\n >> Printing dump");
+		if (print_dump.stats_range) {
+			printf(", time range = [%.4f - %.4f]",
+				print_dump.stats_begin, print_dump.stats_end);
+		}
+		printf("... ");
+	}
+
+	write_output(print_dump.index);
+
+
+	if (current_time >= stats.begin) {	// statistics //
+
+		if (print_dump.stats_range)
+		{
+			double begin_mark = omp_get_wtime();
+
+			stats.get_averages(avg, print_dump.stats_begin, print_dump.stats_end);
+			calculate_ext_statistics(avg);
+
+			double end_mark = omp_get_wtime();
+			cpu_time.stats += end_mark - begin_mark;
+		}
+		else
+		{
+			double begin_mark = omp_get_wtime();
+
+			stats.get_averages(avg);
+			calculate_ext_statistics(avg);
+
+			double end_mark = omp_get_wtime();
+			cpu_time.stats += end_mark - begin_mark;
+		}
+
+		write_statistics_output(print_dump.index, avg);
+	}
+
+	if (grid.mpi_com.rank == 0)
+		printf("OK!\n\n");
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+void nse::modelObj<T, mem>::rewrite_restart_dump()
+{
+	if (dump.restart) {
+		if (grid.mpi_com.rank == 0)
+			printf("\n >> Re-writing restart dump [%i]->[%i]... ",
+				dump.restart_index, -dump.restart_index);
+
+		write_dump(-dump.restart_index);
+		if (current_time >= stats.begin) {	// statistics //
+			stats.write_dump(
+				-dump.restart_index, current_time, grid);
+		}
+
+		if (grid.mpi_com.rank == 0) printf("OK!\n");
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::print_info(const std::string& filename,
+	const char* msg_status)
+{
+	T u_dynamic;	// dynamic (friction) velocity, average at z=0 & z=H //
+
+					// spacing in viscous wall units //
+	T dx_visc, dy_visc;
+	T dz_visc_min, dz_visc_max;
+	T dt_visc;
+
+	T Cf_dynamic;	// friction coefficient //
+
+	if (current_time >= stats.begin) {
+
+		// friction [dynamic] velocity //
+		u_dynamic = dynamic_velocity();
+
+		// Spacing in viscous units //
+		dx_visc = grid.dx * (u_dynamic / fluid.kinematic_viscosity);
+		dy_visc = grid.dy * (u_dynamic / fluid.kinematic_viscosity);
+		dz_visc_min = grid.dz_min * (u_dynamic / fluid.kinematic_viscosity);
+		dz_visc_max = grid.dz_max * (u_dynamic / fluid.kinematic_viscosity);
+		dt_visc = dt *
+			((u_dynamic * u_dynamic) / fluid.kinematic_viscosity);
+
+		// friction coefficient //
+		Cf_dynamic = ((T)2.0 * u_dynamic * u_dynamic) /
+			(((T)0.5*fluid.Umax) * ((T)0.5*fluid.Umax));
+	}
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+		FILE* ptr = fopen(filename.c_str(), "w");
+		if (ptr != NULL) {
+
+#ifndef SCHEME_X4
+			fprintf(ptr, " - model: DNS -x2 [Couette Flow]\n");
+#else
+			fprintf(ptr, " - model: DNS -x4 [Couette Flow]\n");
+#endif
+#ifdef STRATIFICATION
+			fprintf(ptr, " - density stratification: yes\n");
+#else
+			fprintf(ptr, " - density stratification: no\n");
+#endif
+
+			fprintf(ptr, " - status: %s\n", msg_status);
+			if (current_time >= stats.begin)
+			{
+				fprintf(ptr, " - statistics mode: on [%.4f - %.4f]\n", stats.begin, current_time);
+				fprintf(ptr, " \t stats dt = %.7f\n", stats.dt);
+				fprintf(ptr, " \t directory = %s\n\n", stats.output.DIR.c_str());
+			}
+			else
+				fprintf(ptr, " - statistics mode: off, current time = %.4f\n\n", current_time);
+
+			fprintf(ptr, " - grid: wall stretched 3D\n");
+			fprintf(ptr, " \t x = %.4f, y = %.4f, z = %.4f\n",
+				grid.mpi_x, grid.mpi_y, grid.mpi_z);
+			fprintf(ptr, " \t length = %.4f, width = %.4f, height = %.4f\n",
+				grid.mpi_length, grid.mpi_width, grid.mpi_height);
+			fprintf(ptr, " \t nx = %i, ny = %i, nz = %i, size = %i\n",
+				grid.mpi_nx, grid.mpi_ny, grid.mpi_nz, grid.mpi_size);
+			fprintf(ptr, " \t gcx = %i, gcy = %i, gcz = %i\n", grid.gcx, grid.gcy, grid.gcz);
+			fprintf(ptr, " \t dx = %.7f, dy = %.7f, dz(min) = %.7f, dz(max) = %.7f\n", grid.dx, grid.dy, grid.dz_min, grid.dz_max);
+			fprintf(ptr, " \t ksi(z) = %.7f\n", grid.ksi_z);
+			if (current_time >= stats.begin)
+				fprintf(ptr, " \t dx+ = %.6f, dy+ = %.6f, dz+(min) = %.6f, dz+(max) = %.6f\n",
+					dx_visc, dy_visc, dz_visc_min, dz_visc_max);
+			fprintf(ptr, "\n");
+
+			fprintf(ptr, " - time\n");
+			fprintf(ptr, " \t adams-bashforth order: %i\n", AB_TYPE);
+			fprintf(ptr, " \t begin = %.4f, end = %.4f\n", begin_time, end_time);
+			fprintf(ptr, " \t CFL = %.7f\n", CFL);
+			fprintf(ptr, " \t dt = %.7f\n", dt);
+			if (current_time >= stats.begin)
+				fprintf(ptr, " \t dt+ = %.6f\n", dt_visc);
+			fprintf(ptr, "\n");
+
+			if (dump.restart) {
+				fprintf(ptr, " - starting with dump[%i], directory = %s\n\n",
+					dump.restart_index, dump.DIR.c_str());
+			}
+			else
+			{
+				if (startup.load_files) {
+					fprintf(ptr, " - starting with initial field, directory = %s\n\n",
+						startup.DIR.c_str());
+				}
+				else
+				{
+					fprintf(ptr, " - starting with initial disturbance, amplitude = %.7f\n\n",
+						fluid.disturbance_amp * fluid.Umax);
+				}
+			}
+
+			fprintf(ptr, " - fluid\n");
+			fprintf(ptr, " \t wall velocity difference = %.7f\n", fluid.Umax);
+			if (current_time >= stats.begin) {
+				fprintf(ptr, " \t u*, dynamic velocity = %.7f\n", u_dynamic);
+				fprintf(ptr, " \t Cf, friction coefficient = %.7f\n", Cf_dynamic);
+				fprintf(ptr, " \t Re-tau, dynamic Reynolds number = %.7f\n",
+					(u_dynamic * grid.mpi_height) / fluid.kinematic_viscosity);
+			}
+			fprintf(ptr, " \t density = %.7f\n", fluid.density);
+			fprintf(ptr, " \t viscosity = %.7f\n", fluid.viscosity);
+			fprintf(ptr, " \t Reynolds number = %.7f\n",
+				(fluid.Umax * grid.mpi_height) / fluid.kinematic_viscosity);
+#ifdef STRATIFICATION
+			fprintf(ptr, " \t wall temperature difference = %.7f\n", fluid.TH - fluid.T0);
+			fprintf(ptr, " \t Prandtl number = %.7f\n", fluid.Prandtl);
+			fprintf(ptr, " \t Richardson number = %.7f [init = %.7f]\n", fluid.Richardson, fluid.Richardson_init);
+			fprintf(ptr, " \t diffusivity = %.7f\n", fluid.diffusivity);
+			fprintf(ptr, " \t gravity mode = [init = %.4f, period = %.4f]\n", fluid.T_gravity_init, fluid.T_gravity_period);
+#endif
+			fprintf(ptr, "\n");
+
+			fprintf(ptr, " - poisson eq. solver: cg muligrid sgs red-black [ %i ]\n",
+				mg_data.num_grids);
+#ifdef PRESSURE_MEAN_CTRL
+			fprintf(ptr, " \t removing pressure mean: yes [eps = %.7f]\n",
+				model_const::pressure_mean_eps);
+#else
+			fprintf(ptr, " \t removing pressure mean: no\n");
+#endif
+			if (pois_opt.norm_mode == isNormC)
+				fprintf(ptr, " \t poisson norm = [C]\n");
+			else
+				if (pois_opt.norm_mode == isNormL2)
+					fprintf(ptr, " \t poisson norm = [L2]\n");
+			fprintf(ptr, " \t relative tolerance = %.15f\n", pois_opt.retol);
+			fprintf(ptr, " \t absolute tolerance = %.15f\n", pois_opt.abstol);
+			fprintf(ptr, " \t min-max iterations = %i - %i\n", pois_opt.miniters, pois_opt.maxiters);
+			fprintf(ptr, " \t preconditioner iterations = %i\n", pois_opt.piters);
+			fprintf(ptr, " \t multigrid -V cycle iterations: [down = %i, up = %i, direct = %i]\n",
+				mg_opt.down_iters, mg_opt.up_iters, mg_opt.direct_iters);
+			fprintf(ptr, " \t multigrid omega relaxation = %.5f [fine = %.5f]\n\n",
+				mg_opt.smooth_up_omega, mg_opt.smooth_up_omega_fine);
+
+#ifdef INCLUDE_PARTICLES
+			fprintf(ptr, "- particles settings\n");
+			fprintf(ptr, " \t n = %i\n", ptcl_opt.n);
+			fprintf(ptr, " \t begin = %.4f\n", ptcl_opt.begin);
+#endif
+
+#ifdef INCLUDE_PARTICLES_TRACKING
+			fprintf(ptr, "- particles track settings\n");
+			fprintf(ptr, " \t n = %i\n", ptcl_track_opt.n);
+			fprintf(ptr, " \t begin = %.4f\n", ptcl_track_opt.begin);
+			fprintf(ptr, " \t group max size = %i\n",
+				ptcl_track_opt.group_max_size);
+			fprintf(ptr, " \t max memory (in bytes) = %i\n\n",
+				ptcl_track_opt.max_memory);
+#endif
+
+			fprintf(ptr, " - data type size: %i\n\n", (int) sizeof(T));
+
+			fprintf(ptr, " - openmp cores: %i [ of %i ]\n\n",
+				omp_get_max_threads(), omp_get_num_procs());
+
+			fprintf(ptr, " - mpi communicator: %i [%i - %i - %i]\n\n",
+				grid.mpi_com.size,
+				grid.mpi_com.size_x, grid.mpi_com.size_y, grid.mpi_com.size_z);
+
+			fprintf(ptr, " - cpu run time = %.5f\n", cpu_time.run);
+			fprintf(ptr, " \t nse equation = %.5f\n", cpu_time.nse_eq);
+			fprintf(ptr, " \t nse poisson solver = %.5f\n", cpu_time.pois);
+#ifdef STRATIFICATION
+			fprintf(ptr, " \t heat equation = %.5f\n", cpu_time.heat_eq);
+#endif
+#ifdef INCLUDE_PARTICLES
+			fprintf(ptr, " \t particles = %.5f\n", cpu_time.particles);
+#ifdef MEASURE_PARTICLE3D_TIME
+			fprintf(ptr, " \t\t grid location = %.5f\n", ptcl.time.locate);
+			fprintf(ptr, " \t\t interpolation = %.5f\n", ptcl.time.interpolate);
+			fprintf(ptr, " \t\t coordinates update = %.5f\n", ptcl.time.update);
+			fprintf(ptr, " \t\t mpi exchanges = %.5f\n", ptcl.time.mpi_exch);
+#endif
+#endif
+#ifdef INCLUDE_PARTICLES_TRACKING
+			fprintf(ptr, " \t particles tracking = %.5f\n", cpu_time.particles_tracking);
+#ifdef MEASURE_PARTICLE3D_TIME
+			fprintf(ptr, " \t\t grid location = %.5f\n", ptcl_track.time.locate);
+			fprintf(ptr, " \t\t interpolation = %.5f\n", ptcl_track.time.interpolate);
+			fprintf(ptr, " \t\t coordinates update = %.5f\n", ptcl_track.time.update);
+			fprintf(ptr, " \t\t mpi exchanges = %.5f\n", ptcl_track.time.mpi_exch);
+#endif
+#endif
+			fprintf(ptr, " \t stat gathering = %.5f\n", cpu_time.stats);
+			fprintf(ptr, " \t mpi exchange = %.5f\n", grid.mpi_com.cpu_time_exch);
+			fprintf(ptr, " \t\t mpi -x exchange = %.5f\n", grid.mpi_com.cpu_time_exch_x);
+			fprintf(ptr, " \t\t mpi -y exchange = %.5f\n", grid.mpi_com.cpu_time_exch_y);
+			fprintf(ptr, " \t\t mpi -z exchange = %.5f\n\n", grid.mpi_com.cpu_time_exch_z);
+
+#ifdef MEASURE_MG_RUN_TIME
+			double mg_run_time = (double)0;
+			for (int k = 0; k < mg_data.num_grids; k++)
+				mg_run_time += mg_data.run_time[k];
+			fprintf(ptr, " - multigrid(%i) run time = %.5f\n", mg_data.num_grids, mg_run_time);
+			for (int k = 0; k < mg_data.num_grids; k++) {
+				fprintf(ptr, " \t grid(%i) = %.5f\n", k, mg_data.run_time[k]);
+				fprintf(ptr, " \t\t smooth = %.5f, restrict = %.5f, prolongate = %.5f\n",
+					mg_data.smooth_time[k], mg_data.restrict_time[k], mg_data.prolongate_time[k]);
+			}
+			fprintf(ptr, "\n");
+#endif
+
+			status = 1;
+			fclose(ptr);
+		}
+	}
+
+	mpi_broadcast(&status, 1, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::print_info(
+	const std::string& filename, const int idx,
+	const char* msg_status)
+{
+	return print_info(append_index(filename, idx), msg_status);
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/model-setup.hpp b/model-setup.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5eb5420897456dc63bfa6fc41afb22ab0f645913
--- /dev/null
+++ b/model-setup.hpp
@@ -0,0 +1,1116 @@
+#include "model-obj.h"
+
+#include "model-const.h"
+#include "model-default.h"
+
+#include "str-com.h"
+#include "nse-io3d.h"
+
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
// Default construction: no configuration file is attached yet; setup()
// uses model_default values until setup(..., config_filename, ...) loads one.
template< typename T, nse::memType mem >
nse::modelObj<T, mem>::modelObj() : use_config(false) {}
+// -------------------------------------------------------------------------------------------- //
// Destructor releases all model resources via clear().
template< typename T, nse::memType mem >
nse::modelObj<T, mem>::~modelObj() { clear(); }
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::init_parallel(
+	int argc, char** argv, const int mpi_mode)
+{
+#ifdef SET_OPENMP_THREADS
+	omp_set_num_threads(SET_OPENMP_THREADS);
+#endif
+
+	int provided_mode;
+	int status = MPI_Init_thread(&argc, &argv, mpi_mode, &provided_mode);
+	if (status != MPI_SUCCESS) {
+		printf("\n >> FAILURE! >> ** MPI init **\n\n");
+		return false;
+	}
+
+	if (provided_mode != mpi_mode)
+	{
+		int mpi_rank;
+		MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+		if (mpi_rank == 0)
+			printf("\n >> FAILURE! >> ** MPI init - thread support **\n\n");
+
+		MPI_Finalize();
+		return false;
+	}
+
+	mpiCom3d::init();
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
// Tear down the model's 3D communicator layer, then finalize the MPI
// runtime; this must be the last MPI-related call of the program.
template< typename T, nse::memType mem >
void nse::modelObj<T, mem>::finalize_parallel()
{
	mpiCom3d::clear();
	MPI_Finalize();
}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::setup(int argc, char** argv,
+	const char* config_filename, const MPI_Comm comm)
+{
+	int mpi_rank;
+	MPI_Comm_rank(comm, &mpi_rank);
+
+	if (!config.mpi_run(config_filename, comm))
+	{
+		if (mpi_rank == 0)
+			printf("\n >> FAILURE! >> ** failed to read configure file **\n\n");
+		return false;
+	}
+
+	if (mpi_rank == 0)
+		printf(" \t >> Using configure file: \"%s\"\n", config_filename);
+
+	use_config = true;
+	return setup(argc, argv, comm);
+}
+// -------------------------------------------------------------------------------------------- //
+
// Main setup sequence: runs every initialization step in a fixed order
// (arguments -> paths -> fluid -> grid -> time -> I/O -> fields ->
// poisson solver -> time series -> statistics -> optional modules).
// Each step is collective across 'comm'; on the first failure, clear()
// releases whatever was already allocated and false is returned.
template< typename T, nse::memType mem >
bool nse::modelObj<T, mem>::setup(int argc, char** argv, const MPI_Comm comm)
{
	int mpi_rank;
	MPI_Comm_rank(comm, &mpi_rank);

	// wall-clock origin for cpu_time accounting //
	cpu_time.begin_mark = omp_get_wtime();


	if (!process_args(argc, argv)) {
		clear(); return false;
	}
	if (mpi_rank == 0) {
		printf(" \t >> Processing arguments: OK!\n");

		// echo the run modes selected on the command line //
		if (dump.restart)
			printf(" \t\t - restarting, dump index = %i\n", dump.restart_index);
		if (startup.load_files)
			printf(" \t\t - loading initial fields from files\n");
		if (dump.edge_mode)
			printf(" \t\t - run time limit specified: %.2f min\n", dump.max_run_time / (double)60.0);
		if (print_dump.active)
			printf(" \t\t - printing dump, index = %i\n", print_dump.index);
	}
	// -------------------------------------------------------------------------------------------- //

	if (!set_working_paths(comm)) {
		clear(); return false;
	}
	if (mpi_rank == 0)
		printf(" \t >> Setting working paths: OK!\n");
	// -------------------------------------------------------------------------------------------- //

	if (!set_fluid(comm)) {
		clear(); return false;
	}
	if (mpi_rank == 0)
		printf("\t >> Fluid: OK!\n");
	// -------------------------------------------------------------------------------------------- //

	if (!set_grid(comm)) {
		clear(); return false;
	}
	if (mpi_rank == 0)
		printf("\t >> Grid: OK!\n");
	// -------------------------------------------------------------------------------------------- //

	if (!set_time()) {
		clear(); return false;
	}
	if (mpi_rank == 0)
		printf("\t >> Time: OK!\n");
	// -------------------------------------------------------------------------------------------- //

	if (!set_io_parameters()) {
		clear(); return false;
	}
	if (mpi_rank == 0)
		printf("\t >> I/O parameters: OK!\n");
	// -------------------------------------------------------------------------------------------- //

	if (!allocate_fields()) {
		clear(); return false;
	}
	if (mpi_rank == 0)
		printf("\t >> Data allocation: OK!\n");
	// -------------------------------------------------------------------------------------------- //

	if (!set_poisson_solver()) {
		clear(); return false;
	}
	if (mpi_rank == 0)
		printf("\t >> Poisson solver: OK!\n");
	// -------------------------------------------------------------------------------------------- //

	if (!set_time_series()) {
		clear(); return false;
	}
	if (mpi_rank == 0)
		printf("\t >> Time series: OK!\n");
	// -------------------------------------------------------------------------------------------- //

	if (!set_statistics_block()) {
		clear(); return false;
	}
	if (mpi_rank == 0)
		printf(" \t >> Statistics block: OK!\n");
	// -------------------------------------------------------------------------------------------- //

#ifdef INCLUDE_PARTICLES
	if (!set_particles()) {
		clear(); return false;
	}
	if (mpi_rank == 0)
		printf(" \t >> Particles: OK!\n");
	// -------------------------------------------------------------------------------------------- //
#endif

#ifdef INCLUDE_PARTICLES_TRACKING
	if (!set_track_particles()) {
		clear(); return false;
	}
	if (mpi_rank == 0)
		printf(" \t >> Particles tracking: OK!\n");
	// -------------------------------------------------------------------------------------------- //
#endif

#ifdef INCLUDE_VISUALRTL
	if (!set_visualization()) {
		clear(); return false;
	}
	if (mpi_rank == 0)
		printf(" \t >> Visualization: OK!\n");
	// -------------------------------------------------------------------------------------------- //
#endif

	if (mpi_rank == 0) {
		printf("\n\t >> MPI Comm: %i [%i - %i - %i]\n",
			grid.mpi_com.size,
			grid.mpi_com.size_x, grid.mpi_com.size_y, grid.mpi_com.size_z);
		printf("\t >> OpenMP threads: %i [ of %i ]\n",
			omp_get_max_threads(), omp_get_num_procs());
	}
	// -------------------------------------------------------------------------------------------- //

	return true;
}
+// -------------------------------------------------------------------------------------------- //
+
// Parse command-line flags (argv[0] is the program name and is skipped):
//   -ltime <minutes>       : wall-clock run-time limit (enables edge_mode)
//   -udump <index>         : restart from dump [index]
//   -uinit                 : load initial fields from startup files
//   -wdump <ridx> <pidx> [-t <begin> <end>]
//                          : restart from dump [ridx] and re-print output
//                            with index [pidx], optionally restricting
//                            statistics averaging to [begin, end]
// Unknown arguments are skipped. 'parg' indexes the NEXT argument slot;
// the flag itself is argv[parg - 1], so bounds checks use (parg + k <= argc).
// Always returns true (parsing is best-effort).
template< typename T, nse::memType mem >
bool nse::modelObj<T, mem>::process_args(
	int argc, char** argv)
{
	int parg = 2;

	while (parg <= argc) {	// processing arguments //
		if ((!strcmp(argv[parg - 1], "-ltime")) &&
			(parg + 1 <= argc))	// time-limiter //
		{
			int run_limit = atoi(argv[parg]);	// in minutes //
			if (run_limit > 0) {
				dump.max_run_time = (double)60.0 * run_limit;	// in seconds //
				dump.edge_mode = true;
			}
			parg += 2;
			continue;
		}

		if ((!strcmp(argv[parg - 1], "-udump")) &&
			(parg + 1 <= argc))	// restart from dump //
		{
			dump.restart_index = atoi(argv[parg]);
			if (dump.restart_index >= 0)
				dump.restart = true;

			parg += 2;
			continue;
		}

		if (!strcmp(argv[parg - 1], "-uinit"))
		{
			startup.load_files = true;

			parg++;
			continue;
		}

		if ((!strcmp(argv[parg - 1], "-wdump")) &&
			(parg + 2 <= argc))	// write dump //
		{
			dump.restart_index = atoi(argv[parg]);
			print_dump.index = atoi(argv[parg + 1]);
			if (dump.restart_index >= 0) {
				dump.restart = true;
				print_dump.active = true;

				if (parg + 5 <= argc) {	// additional arguments
					// optional "-t begin end" averaging window //
					if (!strcmp(argv[parg + 2], "-t")) {
						print_dump.stats_begin = (T)atof(argv[parg + 3]);
						print_dump.stats_end = (T)atof(argv[parg + 4]);
						print_dump.stats_range = true;

						// +3 here, +3 below: 6 slots consumed in total //
						parg += 3;
					}
				}
			}

			parg += 3;
			continue;
		}
		parg++;
	}

	return true;
}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::set_working_paths(const MPI_Comm comm)
+{
+	bool status = true;
+
+	std::string OUTPUT_DIR = model_default::output::DIR;
+	std::string OUTPUT_PF_DIR = model_default::output::profiles::DIR;
+	std::string DUMP_DIR = model_default::dump::DIR;
+	std::string STARTUP_DIR = model_default::startup::DIR;
+#ifdef INCLUDE_VISUALRTL
+	std::string VISUAL_DIR = model_default::visual::DIR;
+#endif
+
+	std::string STATS_OUTPUT_DIR = model_default::stats::output::DIR;
+	std::string STATS_DUMP_DIR = model_default::stats::dump::DIR;
+
+	if (use_config) {
+		config.mpi_get_value("output.DIR", OUTPUT_DIR, comm);
+		config.mpi_get_value("output.profiles.DIR", OUTPUT_PF_DIR, comm);
+		config.mpi_get_value("dump.DIR", DUMP_DIR, comm);
+		config.mpi_get_value("startup.DIR", STARTUP_DIR, comm);
+#ifdef INCLUDE_VISUALRTL
+		config.mpi_get_value("visual.DIR", VISUAL_DIR, comm);
+#endif
+
+		config.mpi_get_value("stats.output.DIR", STATS_OUTPUT_DIR, comm);
+		config.mpi_get_value("stats.dump.DIR", STATS_DUMP_DIR, comm);
+	}
+
+	if (!output.set_filenames(OUTPUT_DIR)) status = false;
+	if (!pf_output.set_filenames(OUTPUT_PF_DIR)) status = false;
+	if (!dump.set_filenames(DUMP_DIR)) status = false;
+	if (!startup.set_filenames(STARTUP_DIR)) status = false;
+#ifdef INCLUDE_VISUALRTL
+	if (!visual.set_filenames(VISUAL_DIR)) status = false;
+#endif
+
+	if (!stats.output.set_filenames(STATS_OUTPUT_DIR)) status = false;
+	if (!stats.dump.set_filenames(STATS_DUMP_DIR)) status = false;
+
+	if (!status) {
+		int mpi_rank;
+		MPI_Comm_rank(comm, &mpi_rank);
+		if (mpi_rank == 0)
+			printf("\n >> FAILURE! >> ** incorrect working paths **\n\n");
+	}
+
+	return status;
+}
+// -------------------------------------------------------------------------------------------- //
+
// Initialize the fluid parameter block from model_default values, with
// optional overrides from the configuration file, then derive the
// secondary quantities (kinematic viscosity and, under STRATIFICATION,
// the thermal diffusivity from the Prandtl number).
// NOTE: each config.mpi_get_value() is collective on 'comm'; the read
// order must stay identical on all ranks.
// Always returns true.
template< typename T, nse::memType mem >
bool nse::modelObj<T, mem>::set_fluid(const MPI_Comm comm)
{
	fluid.Umax = model_default::fluid::Umax;

	fluid.disturbance_amp = model_default::fluid::disturbance_amp;

	fluid.density = model_default::fluid::density;
	fluid.viscosity = model_default::fluid::viscosity;

#ifdef STRATIFICATION
	fluid.T0 = model_default::fluid::T0;
	fluid.TH = model_default::fluid::TH;

	fluid.Prandtl = model_default::fluid::Prandtl;
	fluid.Richardson = model_default::fluid::Richardson;
	fluid.Richardson_init = model_default::fluid::Richardson_init;

	fluid.T_gravity_init = model_default::fluid::T_gravity_init;
	fluid.T_gravity_period = model_default::fluid::T_gravity_period;
#endif

	if (use_config) {
		config.mpi_get_value("fluid.Umax", &fluid.Umax, comm);

		config.mpi_get_value("fluid.disturbance_amp", &fluid.disturbance_amp, comm);

		config.mpi_get_value("fluid.density", &fluid.density, comm);
		config.mpi_get_value("fluid.viscosity", &fluid.viscosity, comm);

#ifdef STRATIFICATION
		config.mpi_get_value("fluid.T0", &fluid.T0, comm);
		config.mpi_get_value("fluid.TH", &fluid.TH, comm);

		config.mpi_get_value("fluid.Prandtl", &fluid.Prandtl, comm);
		config.mpi_get_value("fluid.Richardson", &fluid.Richardson, comm);
		config.mpi_get_value("fluid.Richardson_init", &fluid.Richardson_init, comm);

		config.mpi_get_value("fluid.T_gravity_init", &fluid.T_gravity_init, comm);
		config.mpi_get_value("fluid.T_gravity_period", &fluid.T_gravity_period, comm);
#endif
	}

	// derived quantities //
	fluid.kinematic_viscosity = fluid.viscosity / fluid.density;
#ifdef STRATIFICATION
	fluid.diffusivity = fluid.kinematic_viscosity * ((T) 1.0 / fluid.Prandtl);
#endif

	return true;
}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::set_grid(const MPI_Comm comm)
+{
+	T x = model_default::domain::x,
+		y = model_default::domain::y,
+		z = model_default::domain::z;
+	T length = model_default::domain::length,
+		width = model_default::domain::width,
+		height = model_default::domain::height;
+
+	int cx = model_default::grid::cx,
+		cy = model_default::grid::cy,
+		cz = model_default::grid::cz;
+	T ksi_z = model_default::grid::ksi_z;
+
+	int mpi_dimx = model_default::mpi_setup::dimx,
+		mpi_dimy = model_default::mpi_setup::dimy,
+		mpi_dimz = model_default::mpi_setup::dimz;
+
+
+	if (use_config) {
+		config.mpi_get_value("domain.x", &x, comm);
+		config.mpi_get_value("domain.y", &y, comm);
+		config.mpi_get_value("domain.z", &z, comm);
+
+		config.mpi_get_value("domain.length", &length, comm);
+		config.mpi_get_value("domain.width", &width, comm);
+		config.mpi_get_value("domain.height", &height, comm);
+
+		config.mpi_get_value("grid.cx", &cx, comm);
+		config.mpi_get_value("grid.cy", &cy, comm);
+		config.mpi_get_value("grid.cz", &cz, comm);
+		config.mpi_get_value("grid.ksi_z", &ksi_z, comm);
+
+		config.mpi_get_value("mpi_setup.dimx", &mpi_dimx, comm);
+		config.mpi_get_value("mpi_setup.dimy", &mpi_dimy, comm);
+		config.mpi_get_value("mpi_setup.dimz", &mpi_dimz, comm);
+	}
+
+	if (!grid.set(
+		x, y, z, length, width, height,
+
+		ksi_z,
+		cx, cy, cz,
+		model_const::grid::gcx, model_const::grid::gcy, model_const::grid::gcz,
+
+		mpi_dimx, mpi_dimy, mpi_dimz))
+	{
+		int mpi_rank;
+		MPI_Comm_rank(comm, &mpi_rank);
+		if (mpi_rank == 0)
+			printf("\n >> FAILURE! >> ** incorrect grid parameters **\n\n");
+		return false;
+	}
+
+	allocation_status.grid = true;
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
// Initialize model and statistics time stepping. dt is derived from the
// CFL number, grid spacing and maximum velocity. On restart, the dump
// stamp file restores current_time, the time indexes and the accumulated
// CPU timers; if the stamp cannot be read, the defaults set above are
// kept and a warning is printed.
// Always returns true (a missing stamp is not fatal).
template< typename T, nse::memType mem >
bool nse::modelObj<T, mem>::set_time()
{
	// model time 
	begin_time = model_default::time::begin;
	end_time = model_default::time::end;
	CFL = model_default::time::CFL;

	if (use_config) {
		config.mpi_get_value("time.begin", &begin_time, grid.mpi_com.comm);
		config.mpi_get_value("time.end", &end_time, grid.mpi_com.comm);
		config.mpi_get_value("time.CFL", &CFL, grid.mpi_com.comm);
	}

	// time step from the CFL condition on the -x spacing //
	dt = CFL * (grid.dx / fluid.Umax);

	current_time = begin_time;
	time_index = 0;

	// stats time
	stats.begin = model_default::stats::begin;
	stats.time_mod = model_default::stats::time_mod;

	if (use_config) {
		config.mpi_get_value("stats.begin", &stats.begin, grid.mpi_com.comm);
		config.mpi_get_value("stats.time_mod", &stats.time_mod, grid.mpi_com.comm);
	}

	// statistics are gathered every time_mod model steps //
	stats.dt = stats.time_mod * dt;
	stats.time_index = 0;


	if (dump.restart)
	{
		binStamp< int > index_stamp;
		binStamp< double > cpu_stamp;

		bool status = read_binary_stamp(dump.NSE_STAMP_FILE, dump.restart_index,
			index_stamp, cpu_stamp, grid, &current_time);

		if (status) {
			// index stamp layout: [0] = model step, [1] = statistics step //
			index_stamp.get(0, &time_index);
			index_stamp.get(1, &stats.time_index);

			// cpu stamp layout is fixed and depends on the build flags:
			// [0..4] run, nse_eq, pois, stats, mpi exchange; the -x/-y/-z
			// exchange slots shift by one when STRATIFICATION adds heat_eq //
			cpu_stamp.update(0, &cpu_time.run);
			cpu_stamp.update(1, &cpu_time.nse_eq);
			cpu_stamp.update(2, &cpu_time.pois);
			cpu_stamp.update(3, &cpu_time.stats);
			cpu_stamp.update(4, &grid.mpi_com.cpu_time_exch);
#ifdef STRATIFICATION
			cpu_stamp.update(5, &cpu_time.heat_eq);

			cpu_stamp.update(6, &grid.mpi_com.cpu_time_exch_x);
			cpu_stamp.update(7, &grid.mpi_com.cpu_time_exch_y);
			cpu_stamp.update(8, &grid.mpi_com.cpu_time_exch_z);
			int stamp_idx = 9;
#else
			cpu_stamp.update(5, &grid.mpi_com.cpu_time_exch_x);
			cpu_stamp.update(6, &grid.mpi_com.cpu_time_exch_y);
			cpu_stamp.update(7, &grid.mpi_com.cpu_time_exch_z);
			int stamp_idx = 8;
#endif
			// optional module timers occupy the trailing slots in order //
#ifdef INCLUDE_PARTICLES
			cpu_stamp.update(stamp_idx, &cpu_time.particles); stamp_idx++;
#endif
#ifdef INCLUDE_PARTICLES_TRACKING
			cpu_stamp.update(stamp_idx, &cpu_time.particles_tracking); stamp_idx++;
#endif
		}
		else
		{
			if (grid.mpi_com.rank == 0) {
				printf("\n >> WARNING! >> ** failed to read dump stamp: ""%s"" **\n",
					dump.NSE_STAMP_FILE.c_str());
				printf("\t >> -- setting default time stamp\n\n");
			}
		}
	}

	return true;
}
+// -------------------------------------------------------------------------------------------- //
+
// Configure the three output channels: regular 3D output, rapid profile
// output and dump writing. Each has defaults, optional config overrides,
// and a (mark, index) schedule; on restart the marks/indexes are advanced
// past current_time so no already-written files are duplicated.
// Always returns true.
template< typename T, nse::memType mem >
bool nse::modelObj<T, mem>::set_io_parameters()
{
	// - output setup //
	output.begin = model_default::output::begin;
	output.dt = model_default::output::dt;

	if (use_config) {
		config.mpi_get_value("output.begin", &output.begin, grid.mpi_com.comm);
		config.mpi_get_value("output.dt", &output.dt, grid.mpi_com.comm);
	}

	output.mark = output.begin;
	output.index = 1;
	if (dump.restart) {
		// fast-forward the schedule past the restored current_time //
		while (current_time >= output.mark) {
			output.mark += output.dt;
			output.index++;
		}
	}

	// default output sub-domain: the full MPI domain //
	output.xmin = grid.mpi_x;
	output.xmax = grid.mpi_x + grid.mpi_length;
	output.ymin = grid.mpi_y;
	output.ymax = grid.mpi_y + grid.mpi_width;
	output.zmin = grid.mpi_z;
	output.zmax = grid.mpi_z + grid.mpi_height;

	output.regular_plt3d_cntrl = model_default::output::regular_plt3d_cntrl;
	output.regular_bin3d_cntrl = model_default::output::regular_bin3d_cntrl;
	output.final_plt3d_cntrl = model_default::output::final_plt3d_cntrl;

	// screen reporting interval (in time steps) //
	nscreen = model_default::output::nscreen;

	if (use_config) {
		config.mpi_get_value("output.xmin", &output.xmin, grid.mpi_com.comm);
		config.mpi_get_value("output.xmax", &output.xmax, grid.mpi_com.comm);
		config.mpi_get_value("output.ymin", &output.ymin, grid.mpi_com.comm);
		config.mpi_get_value("output.ymax", &output.ymax, grid.mpi_com.comm);
		config.mpi_get_value("output.zmin", &output.zmin, grid.mpi_com.comm);
		config.mpi_get_value("output.zmax", &output.zmax, grid.mpi_com.comm);

		config.mpi_get_value("output.regular_plt3d_cntrl",
			&output.regular_plt3d_cntrl, grid.mpi_com.comm);
		config.mpi_get_value("output.regular_bin3d_cntrl",
			&output.regular_bin3d_cntrl, grid.mpi_com.comm);
		config.mpi_get_value("output.final_plt3d_cntrl",
			&output.final_plt3d_cntrl, grid.mpi_com.comm);

		config.mpi_get_value("output.nscreen", &nscreen, grid.mpi_com.comm);
	}


	// - output rapid setup //
	pf_output.begin = model_default::output::profiles::begin;
	pf_output.end = model_default::output::profiles::end;
	pf_output.dt = model_default::output::profiles::dt;

	pf_output.plt_cntrl = model_default::output::profiles::plt_cntrl;

	if (use_config) {
		config.mpi_get_value("output.profiles.begin",
			&pf_output.begin, grid.mpi_com.comm);
		config.mpi_get_value("output.profiles.end",
			&pf_output.end, grid.mpi_com.comm);
		config.mpi_get_value("output.profiles.dt",
			&pf_output.dt, grid.mpi_com.comm);

		config.mpi_get_value("output.profiles.plt_cntrl",
			&pf_output.plt_cntrl, grid.mpi_com.comm);
	}

	pf_output.mark = pf_output.begin;
	pf_output.index = 1;
	if (dump.restart) {
		// profile output has a finite window: stop advancing at .end //
		while ((current_time >= pf_output.mark) &&
			(pf_output.mark <= pf_output.end))
		{
			pf_output.mark += pf_output.dt;
			pf_output.index++;
		}
	}

	pf_output.allocate(grid);


	// - dump setup //
	dump.begin = model_default::dump::begin;
	dump.dt = model_default::dump::dt;

	if (use_config) {
		config.mpi_get_value("dump.begin", &dump.begin, grid.mpi_com.comm);
		config.mpi_get_value("dump.dt", &dump.dt, grid.mpi_com.comm);
	}

	dump.mark = dump.begin;
	dump.index = 1;
	if (dump.restart) {
		// only the mark advances here; the next dump index continues
		// directly after the dump we restarted from //
		while (current_time >= dump.mark) {
			dump.mark += dump.dt;
		}
		dump.index = dump.restart_index + 1;
	}


	return true;
}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+bool nse::modelObj<T, mem>::allocate_fields()
+{
+	if (!allocate_base_fields()) return false;
+	if (!allocate_ext_fields()) return false;
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
// Allocate (zero-initialized) the prognostic fields of the model:
// velocity and pressure, the Adams-Bashforth intermediate velocities
// (two history levels, three when AB_TYPE==3), the Poisson solver work
// arrays and, under STRATIFICATION, the temperature fields.
// Always returns true; sets allocation_status.base_fields on completion.
template< typename T, nse::memType mem >
bool nse::modelObj<T, mem>::allocate_base_fields()
{
	allocate_vnull(&U, &V, &W, grid.size);
	allocate_vnull(&Pressure, grid.size);

	// intermediate velocities for the Adams-Bashforth time scheme //
	allocate_vnull(&Uim, &Vim, &Wim, grid.size);
	allocate_vnull(&Uim_p, &Vim_p, &Wim_p, grid.size);
#if (AB_TYPE==3)
	allocate_vnull(&Uim_pp, &Vim_pp, &Wim_pp, grid.size);
#endif

	// Poisson equation work fields //
	allocate_vnull(&Phi, grid.size);
	allocate_vnull(&Rhs, grid.size);
	allocate_vnull(&Divergence, grid.size);

#ifdef STRATIFICATION
	allocate_vnull(&Tx, grid.size);
	allocate_vnull(&Tsh, grid.size);
	allocate_vnull(&Tline, grid.nz);	// temperature -z profile //

	allocate_vnull(&Tim, grid.size); allocate_vnull(&Tim_p, grid.size);
#if (AB_TYPE==3)
	allocate_vnull(&Tim_pp, grid.size);
#endif
#endif

	allocation_status.base_fields = true;
	return true;
}
+// -------------------------------------------------------------------------------------------- //
+
template< typename T, nse::memType mem >
bool nse::modelObj<T, mem>::allocate_ext_fields()
{
	// Allocate the extended (diagnostic/statistics) fields: velocity moments,
	// pressure-velocity correlations, dissipation terms etc. Naming suffixes
	// (_u, _uw, _c, ...) appear to denote the staggered-grid node the quantity
	// is evaluated at (see the node* tags used when these arrays are filled).

	// second moments of velocity at various nodes
	allocate_vnull(&U2_u, &V2_v, &W2_w, grid.size);
	allocate_vnull(&U2_uw, &V2_vw, &W2_c, grid.size);

	allocate_vnull(&W2_u, &W2_v, grid.size);
	allocate_vnull(&W2_uw, &W2_vw, grid.size);

	// mixed velocity products and pressure-velocity products
	allocate_vnull(&UV, &UW, &VW, grid.size);
	allocate_vnull(&UV_uvw, &UW_uvw, &VW_uvw, grid.size);
	allocate_vnull(&PU, &PV, &PW, grid.size);

	// partitioned Ui*W products (bottom/top splits) and advection terms
	allocate_vnull(&UW_bottom, &UW_top, grid.size);
	allocate_vnull(&VW_bottom, &VW_top, grid.size);
	allocate_vnull(&UW_bottom_uv, &UW_top_uv, grid.size);
	allocate_vnull(&VW_bottom_uv, &VW_top_uv, grid.size);
	allocate_vnull(&UW_bottom_uw, &UW_top_uw, grid.size);
	allocate_vnull(&VW_bottom_vw, &VW_top_vw, grid.size);
	allocate_vnull(&UW_adv, &VW_adv, grid.size);

	// third-order moments (turbulent transport terms)
	allocate_vnull(&U2W, &V2W, &W2W, grid.size);
	allocate_vnull(&UVW, &UWW, &VWW, grid.size);

	// diffusion and dissipation terms
	allocate_vnull(&U_diff, &V_diff, &W_diff, grid.size);
	allocate_vnull(&U_diss, &V_diss, &W_diss, grid.size);
	allocate_vnull(&UV_diss, &UW_diss, &VW_diss, grid.size);

	// isotropic dissipation terms and their per-direction components
	allocate_vnull(&U_iso_diss, &V_iso_diss, &W_iso_diss, grid.size);
	allocate_vnull(&UV_iso_diss, &UW_iso_diss, &VW_iso_diss, grid.size);
	allocate_vnull(&iso_diss_x, &iso_diss_y, &iso_diss_z, grid.size);

	// pressure-strain correlation terms
	allocate_vnull(&PSuu, &PSvv, &PSww, grid.size);
	allocate_vnull(&P2Suv, &P2Suw, &P2Svw, grid.size);


#ifdef STRATIFICATION
	// temperature statistics, only in stratified builds
	allocate_vnull(&T2_c, &T2_w, grid.size);

	allocate_vnull(&TU, &TV, &TW, grid.size);
	allocate_vnull(&TU_uw, &TV_vw, grid.size);
	allocate_vnull(&TW_uw, &TW_vw, grid.size);
	allocate_vnull(&TP, grid.size);

	allocate_vnull(&TW_bottom, &TW_top, grid.size);
	allocate_vnull(
		&TW_bottom_u, &TW_top_u,
		&TW_bottom_v, &TW_top_v,
		&TW_bottom_w, &TW_top_w, grid.size);
	allocate_vnull(&TW_adv, grid.size);

	allocate_vnull(&T2W, grid.size);
	allocate_vnull(&TUW, &TVW, &TWW, grid.size);

	allocate_vnull(&T_diff, grid.size);
	allocate_vnull(&T_diss, grid.size);
	allocate_vnull(&TU_diss, &TV_diss, &TW_diss, grid.size);

	allocate_vnull(&T_iso_diss, grid.size);

	allocate_vnull(&T_dPdx, &T_dPdy, &T_dPdz, grid.size);
#endif

	// record success so clear() knows these arrays need deallocation
	allocation_status.ext_fields = true;
	return true;
}
+// -------------------------------------------------------------------------------------------- //
+
template< typename T, nse::memType mem >
bool nse::modelObj<T, mem>::set_poisson_solver()
{
	// Configure the pressure-Poisson solver and its multigrid preconditioner:
	// compile-time defaults first, then optional overrides from the run-time
	// config (read collectively via mpi_get_value over the grid communicator).

	// convergence control: relative/absolute tolerances and iteration bounds
	pois_opt.retol = model_default::poisson::retol;
	pois_opt.abstol = model_default::poisson::abstol;

	pois_opt.miniters = model_default::poisson::miniters;
	pois_opt.maxiters = model_default::poisson::maxiters;

	// init_mode/norm_mode/bc_type are fixed at compile time (model_const)
	// and intentionally not exposed through the config file
	pois_opt.init_mode = model_const::poisson::init_mode;
	pois_opt.norm_mode = model_const::poisson::norm_mode;

	pois_opt.bc_type = model_const::poisson::bc_type;
	pois_opt.piters = model_default::poisson::piters;

	if (use_config) {
		config.mpi_get_value("poisson.retol", &pois_opt.retol, grid.mpi_com.comm);
		config.mpi_get_value("poisson.abstol", &pois_opt.abstol, grid.mpi_com.comm);

		config.mpi_get_value("poisson.miniters", &pois_opt.miniters, grid.mpi_com.comm);
		config.mpi_get_value("poisson.maxiters", &pois_opt.maxiters, grid.mpi_com.comm);

		config.mpi_get_value("poisson.piters", &pois_opt.piters, grid.mpi_com.comm);
	}

	for (int k = 0; k < 6; k++)	// poisson memory allocation
		allocate_vnull(&memory[k], grid.size);


	// poisson multigrid setup //
	int pois_ngrid = model_default::poisson::multigrid::ngrid;
	mg_opt.down_iters =
		model_default::poisson::multigrid::down_iters;
	mg_opt.up_iters =
		model_default::poisson::multigrid::up_iters;
	mg_opt.direct_iters =
		model_default::poisson::multigrid::direct_iters;
	mg_opt.smooth_up_omega =
		model_default::poisson::multigrid::smooth_up_omega;
	mg_opt.smooth_up_omega_fine =
		model_default::poisson::multigrid::smooth_up_omega_fine;

	if (use_config) {
		config.mpi_get_value("poisson.multigrid.ngrid",
			&pois_ngrid, grid.mpi_com.comm);

		config.mpi_get_value("poisson.multigrid.down_iters",
			&mg_opt.down_iters, grid.mpi_com.comm);
		config.mpi_get_value("poisson.multigrid.up_iters",
			&mg_opt.up_iters, grid.mpi_com.comm);
		config.mpi_get_value("poisson.multigrid.direct_iters",
			&mg_opt.direct_iters, grid.mpi_com.comm);

		config.mpi_get_value("poisson.multigrid.smooth_up_omega",
			&mg_opt.smooth_up_omega, grid.mpi_com.comm);
		config.mpi_get_value("poisson.multigrid.smooth_up_omega_fine",
			&mg_opt.smooth_up_omega_fine, grid.mpi_com.comm);
	}

	// ngrid == 0 requests automatic depth selection: add one multigrid level
	// per factor-of-8 coarsening (2x per dimension, hence np /= 8) until the
	// interior point count falls to auto_min. mpi_nx/ny/nz minus ghost layers
	// presumably give the global interior size — confirm against grid-id.h.
	if (pois_ngrid == 0) {

		pois_ngrid = 1;
		int np = (grid.mpi_nx - 2 * grid.gcx) * (grid.mpi_ny - 2 * grid.gcy) * (grid.mpi_nz - 2 * grid.gcz);
		while (np > model_const::poisson::mg::auto_min) { pois_ngrid++; np /= 8; }
	}

	mg_data.init(grid, pois_ngrid,
		mg_opt.down_iters, mg_opt.up_iters,
		mg_opt.direct_iters,
		mg_opt.smooth_up_omega, mg_opt.smooth_up_omega_fine,
		pois_opt.bc_type);


	// record success so clear()/mpi_cleanup() release these resources
	allocation_status.poisson = true;
	allocation_status.multigrid = true;
	return true;
}
+// -------------------------------------------------------------------------------------------- //
+
template< typename T, nse::memType mem >
bool nse::modelObj<T, mem>::set_time_series()
{
	// Set up the on-screen/on-disk time-series log: divergence norms and RMS
	// velocities (plus temperature RMS in stratified builds). The series file
	// itself is created/opened on rank 0 only; the resulting status flag is
	// then broadcast to all ranks so everyone agrees on success/failure.
#ifdef STRATIFICATION
	nse_series.set(6);
#else
	nse_series.set(5);
#endif
	nse_series.name_variable(0, "divergence [c-norm]");
	nse_series.name_variable(1, "divergence [l-norm]");
	nse_series.name_variable(2, "U[RMS]");
	nse_series.name_variable(3, "V[RMS]");
	nse_series.name_variable(4, "W[RMS]");
#ifdef STRATIFICATION
	nse_series.name_variable(5, "T[RMS]");
#endif

	bool status;
	if (grid.mpi_com.rank == 0)
	{
		if (!dump.restart) {
			// fresh run: start a new series file
			status = nse_series.init(output.NSE_SEQ_FILE);
		}
		else
		{
			// restart: carry over the series from the dump and append to it;
			// if the copy fails, fall back to a fresh series (with a warning)
			status = copy_file(dump.NSE_SEQ_FILE, dump.restart_index, output.NSE_SEQ_FILE);
			if (status)
				status = nse_series.init_append(output.NSE_SEQ_FILE);
			else
			{
				printf("\n >> WARNING! >> ** failed to copy dump series: ""%s""\n",
					dump.NSE_SEQ_FILE.c_str());
				printf("\t >> -- setting new time series data\n\n");
				status = nse_series.init(output.NSE_SEQ_FILE);
			}
		}
	}
	// non-root ranks receive rank 0's status here (their local 'status' is
	// uninitialized until this call and must not be read before it)
	mpi_broadcast(&status, 1, 0, grid.mpi_com.comm);

	if (!status) {
		if (grid.mpi_com.rank == 0)
			printf("\n >> FAILURE! >> ** init time series file: ""%s"" **\n\n",
				output.NSE_SEQ_FILE.c_str());
	}

	return status;
}
+// -------------------------------------------------------------------------------------------- //
+
template< typename T, nse::memType mem >
bool nse::modelObj<T, mem>::set_statistics_block()
{
	// Initialize the statistics accumulators. On a restart that is already
	// past the statistics start time, reload the accumulated statistics from
	// the dump (unless RESTRICT_STATS_DUMP disables that path); otherwise
	// start from an empty accumulator.
#ifndef RESTRICT_STATS_DUMP
	if ((dump.restart) && (current_time >= stats.begin))
	{
		stats.read_dump(dump.restart_index, grid);
	}
	else
#endif
	{
		stats.init(grid);
	}

	// time-average and horizontal(space)-average working buffers
	avg.init(grid);
	space_avg.init(grid);

#ifdef FOURIER_SPECTRUM
	spectrum.init(grid);
#endif

	return true;
}
+// ------------------------------------------------------------------------------------------------ //
+
#ifdef INCLUDE_PARTICLES
template< typename T, nse::memType mem >
bool nse::modelObj<T, mem>::set_particles()
{
	// Configure passive particle release: count (n) and release time (begin),
	// with optional config-file overrides.
	ptcl_opt.n = model_default::ptcl_opt::n;
	ptcl_opt.begin = model_default::ptcl_opt::begin;

	if (use_config) {
		config.mpi_get_value("ptcl_opt.n", &ptcl_opt.n, grid.mpi_com.comm);
		config.mpi_get_value("ptcl_opt.begin", &ptcl_opt.begin, grid.mpi_com.comm);
	}


	// released if the previous time level (current_time - dt) has already
	// reached the release time — presumably so the release step itself is
	// not skipped on restart; confirm against the particle advance code
	ptcl_opt.is_released = (current_time - dt >= ptcl_opt.begin);

	return true;
}
// ------------------------------------------------------------------------------------------------ //
#endif
+
#ifdef INCLUDE_PARTICLES_TRACKING
template< typename T, nse::memType mem >
bool nse::modelObj<T, mem>::set_track_particles()
{
	// Configure tracked-particle trajectories: count, release time, and the
	// trajectory storage limits (group size and memory cap), with optional
	// config-file overrides.
	ptcl_track_opt.n = model_default::ptcl_track_opt::n;
	ptcl_track_opt.begin = model_default::ptcl_track_opt::begin;

	ptcl_track_opt.group_max_size = model_default::ptcl_track_opt::group_max_size;
	ptcl_track_opt.max_memory = model_default::ptcl_track_opt::max_memory;

	if (use_config) {
		config.mpi_get_value("ptcl_track_opt.n", &ptcl_track_opt.n, grid.mpi_com.comm);
		config.mpi_get_value("ptcl_track_opt.begin", &ptcl_track_opt.begin, grid.mpi_com.comm);

		config.mpi_get_value("ptcl_track_opt.group_max_size",
			&ptcl_track_opt.group_max_size, grid.mpi_com.comm);
		config.mpi_get_value("ptcl_track_opt.max_memory",
			&ptcl_track_opt.max_memory, grid.mpi_com.comm);
	}

	// same previous-time-level release condition as set_particles()
	ptcl_track_opt.is_released = (current_time - dt >= ptcl_track_opt.begin);

	// apply the storage limits to the trajectory container
	traj.set_group_max_size(ptcl_track_opt.group_max_size);
	traj.set_max_memory_usage(ptcl_track_opt.max_memory);

	return true;
}
// ------------------------------------------------------------------------------------------------ //
#endif
+
#ifdef INCLUDE_VISUALRTL
template< typename T, nse::memType mem >
bool nse::modelObj<T, mem>::set_visualization()
{
	// Configure real-time visualization output: time window, output interval,
	// spatial sub-box and PNG resolution. NOTE: all config-file overrides in
	// this function are commented out, so only the compile-time defaults are
	// ever used here.
	visual.begin = model_default::visual::begin;
	visual.end = model_default::visual::end;
	visual.dt = model_default::visual::dt;

	//if (use_config) {
	//	config.mpi_get_value("visual.begin", &visual.begin, grid.mpi_com.comm);
	//	config.mpi_get_value("visual.end", &visual.end, grid.mpi_com.comm);
	//	config.mpi_get_value("visual.dt", &visual.dt, grid.mpi_com.comm);
	//}

	// on restart, advance the output mark/index past the current time so
	// already-produced frames are not regenerated
	visual.mark = visual.begin;
	visual.index = 1;
	if (dump.restart) {
		while ((current_time >= visual.mark) &&
			(visual.mark <= visual.end))
		{
			visual.mark += visual.dt;
			visual.index++;
		}
	}

	// visualization sub-box bounds and image resolution
	visual.xmin = model_default::visual::xmin;
	visual.xmax = model_default::visual::xmax;
	visual.ymin = model_default::visual::ymin;
	visual.ymax = model_default::visual::ymax;
	visual.zmin = model_default::visual::zmin;
	visual.zmax = model_default::visual::zmax;

	visual.png_resolution = model_default::visual::png_resolution;

	//if (use_config) {
	//	config.mpi_get_value("visual.xmin", &visual.xmin, grid.mpi_com.comm);
	//	config.mpi_get_value("visual.xmax", &visual.xmax, grid.mpi_com.comm);
	//	config.mpi_get_value("visual.ymin", &visual.ymin, grid.mpi_com.comm);
	//	config.mpi_get_value("visual.ymax", &visual.ymax, grid.mpi_com.comm);
	//	config.mpi_get_value("visual.zmin", &visual.zmin, grid.mpi_com.comm);
	//	config.mpi_get_value("visual.zmax", &visual.zmax, grid.mpi_com.comm);
	//
	//	config.mpi_get_value("visual.png_resolution", &visual.png_resolution, grid.mpi_com.comm);
	//}


	return true;
}
// ------------------------------------------------------------------------------------------------ //
#endif
+
template< typename T, nse::memType mem >
void nse::modelObj<T, mem>::mpi_cleanup()
{
	// Release MPI-related resources, guarded by allocation flags so the
	// function is safe to call more than once. Multigrid data is cleared
	// before the communicator is released — presumably mg_data holds
	// resources tied to the communicator; confirm before reordering.
	if (allocation_status.multigrid) {
		mg_data.clear();
		allocation_status.multigrid = false;
	}

	if (allocation_status.grid) {
		grid.mpi_com.cleanup();
		allocation_status.grid = false;
	}
}
+
template< typename T, nse::memType mem >
void nse::modelObj<T, mem>::clear()
{
	// Full teardown: trajectory data (if enabled), MPI resources, then every
	// field group allocated by allocate_base_fields(), allocate_ext_fields()
	// and set_poisson_solver(). Each group is guarded by its allocation flag,
	// which is reset after deallocation, so clear() is safe to call twice.
#ifdef INCLUDE_PARTICLES_TRACKING
	// cleaning trajectory data, working files & directories
	traj.cleanup(grid);
#endif

	mpi_cleanup();

	// mirror of allocate_base_fields()
	if (allocation_status.base_fields)
	{
		deallocate(U, V, W);
		deallocate(Pressure);

		deallocate(Uim, Vim, Wim);
		deallocate(Uim_p, Vim_p, Wim_p);
#if (AB_TYPE == 3)
		deallocate(Uim_pp, Vim_pp, Wim_pp);
#endif
		deallocate(Phi); deallocate(Rhs);
		deallocate(Divergence);

#ifdef STRATIFICATION
		deallocate(Tx); deallocate(Tsh); deallocate(Tline);
		deallocate(Tim); deallocate(Tim_p);
#if ( AB_TYPE == 3 )
		deallocate(Tim_pp);
#endif
#endif
		allocation_status.base_fields = false;
	}

	// mirror of allocate_ext_fields()
	if (allocation_status.ext_fields)
	{
		deallocate(U2_u, V2_v, W2_w);
		deallocate(U2_uw, V2_vw, W2_c);

		deallocate(W2_u, W2_v);
		deallocate(W2_uw, W2_vw);

		deallocate(UV, UW, VW);
		deallocate(UV_uvw, UW_uvw, VW_uvw);
		deallocate(PU, PV, PW);

		deallocate(UW_bottom, UW_top);
		deallocate(VW_bottom, VW_top);
		deallocate(UW_bottom_uv, UW_top_uv);
		deallocate(VW_bottom_uv, VW_top_uv);
		deallocate(UW_bottom_uw, UW_top_uw);
		deallocate(VW_bottom_vw, VW_top_vw);
		deallocate(UW_adv, VW_adv);

		deallocate(U2W, V2W, W2W);
		deallocate(UVW, UWW, VWW);

		deallocate(U_diff, V_diff, W_diff);
		deallocate(U_diss, V_diss, W_diss);
		deallocate(UV_diss, UW_diss, VW_diss);

		deallocate(U_iso_diss, V_iso_diss, W_iso_diss);
		deallocate(UV_iso_diss, UW_iso_diss, VW_iso_diss);
		deallocate(iso_diss_x, iso_diss_y, iso_diss_z);

		deallocate(PSuu, PSvv, PSww);
		deallocate(P2Suv, P2Suw, P2Svw);


#ifdef STRATIFICATION
		deallocate(T2_c, T2_w);

		deallocate(TU, TV, TW);
		deallocate(TU_uw, TV_vw);
		deallocate(TW_uw, TW_vw);
		deallocate(TP);

		deallocate(TW_bottom, TW_top);
		deallocate(
			TW_bottom_u, TW_top_u,
			TW_bottom_v, TW_top_v,
			TW_bottom_w, TW_top_w);
		deallocate(TW_adv);

		deallocate(T2W);
		deallocate(TUW, TVW, TWW);

		deallocate(T_diff);
		deallocate(T_diss);
		deallocate(TU_diss, TV_diss, TW_diss);

		deallocate(T_iso_diss);

		deallocate(T_dPdx, T_dPdy, T_dPdz);
#endif

		allocation_status.ext_fields = false;
	}

	// the 6 poisson work arrays allocated in set_poisson_solver()
	if (allocation_status.poisson)
	{
		for (int k = 0; k < 6; k++)
			deallocate(memory[k]);

		allocation_status.poisson = false;
	}
}
+// -------------------------------------------------------------------------------------------- //
diff --git a/model-stats.hpp b/model-stats.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7ed5f14b14b362d0ebb674df2217f0326b008063
--- /dev/null
+++ b/model-stats.hpp
@@ -0,0 +1,2442 @@
+#include "model-obj.h"
+#include "model-bc.h"
+
+#include "nse3d.h"
+#include "nse-fops3d-x2.h"
+#ifdef SCHEME_X4
+#include "nse3d-x4.h"
+#include "nse-fops3d-x4.h"
+#endif
+#include "nse-io3d.h"
+
+
+#include "bl-turb.h"
+#include "bl-turb-scalar.h"
+#include "bl-flux-def.h"
+#include "bl-scale-def.h"
+
+#ifdef FOURIER_SPECTRUM
+#include "my_fft.h"
+#endif
+
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+template< typename T, nse::memType mem >
+T nse::modelObj<T, mem>::dynamic_velocity()
+{
+	if (stats.U.slice_num != 0)
+		stats.U.average(avg.U);
+	else
+		grid.template average_by_xy<nodeU>(avg.U, U);
+
+	return ::dynamic_velocity(avg.U, fluid.Umax, fluid.kinematic_viscosity, grid);
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+void nse::modelObj<T, mem>::gather_nse_eq_statistics(
+	T *U_rms_max, T *V_rms_max, T *W_rms_max)
+{
+	const int stat_mode = (current_time >= stats.begin) &&
+		((stats.time_index % stats.time_mod) == 0);
+
+
+	// U-U^2 slices ...
+	grid.template average_by_xy<nodeU>(space_avg.U, U);				// --> [C](z) node
+	if (stat_mode) stats.U.push(space_avg.U, current_time);
+
+	u_square(U2_u, U, nodeU, grid);
+	grid.template average_by_xy<nodeU>(space_avg.U2_u, U2_u);		// --> [C](z) node
+	if (stat_mode) stats.U2_u.push(space_avg.U2_u, current_time);
+
+	// V-V^2 slices ...
+	grid.template average_by_xy<nodeV>(space_avg.V, V);				// --> [C](z) node
+	if (stat_mode) stats.V.push(space_avg.V, current_time);
+
+	v_square(V2_v, V, nodeV, grid);
+	grid.template average_by_xy<nodeV>(space_avg.V2_v, V2_v);		// --> [C](z) node
+	if (stat_mode) stats.V2_v.push(space_avg.V2_v, current_time);
+
+	// W-W^2 slices ...
+	grid.template average_by_xy<nodeW>(space_avg.W, W);				// --> [W](z) node
+	if (stat_mode) stats.W.push(space_avg.W, current_time);
+
+	w_square(W2_w, W, nodeW, grid);
+	grid.template average_by_xy<nodeW>(space_avg.W2_w, W2_w);		// --> [W](z) node
+	if (stat_mode) stats.W2_w.push(space_avg.W2_w, current_time);
+
+	// RMS ...
+	(*U_rms_max) = (T)mpi_max_deviation(space_avg.U, space_avg.U2_u,
+		grid.nz, grid.mpi_com.comm);
+	(*V_rms_max) = (T)mpi_max_deviation(space_avg.V, space_avg.V2_v,
+		grid.nz, grid.mpi_com.comm); // k = [gcz ... nz - gcz - 1]
+	(*W_rms_max) = (T)mpi_max_deviation(space_avg.W, space_avg.W2_w,
+		grid.nz, grid.mpi_com.comm); // k = [gcz ... nz - gcz] *: ok if (W = 0) at walls
+
+
+	if (stat_mode)	// gather statistics
+	{
+		// - Pressure ...
+		// ---------------------------------------------------------------------------------- //
+		pressure_bc_halo(Pressure, pois_opt, grid);		// pressure full halo bc's
+
+		grid.template average_by_xy<nodeC>(space_avg.P, Pressure);		// --> [C] node
+		stats.P.push(space_avg.P, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+
+		// - [Ui * Uj], [Ui * P], [Ui * d(Uj)/dz] ... 
+		// ---------------------------------------------------------------------------------- //
+
+		// --- Ui^2 with node shifts ...
+		u_square(U2_uw, U, nodeUW, grid);
+		grid.template average_by_xy<nodeUW>(space_avg.U2_uw, U2_uw);		// --> [W] node
+		stats.U2_uw.push(space_avg.U2_uw, current_time);
+
+		v_square(V2_vw, V, nodeVW, grid);
+		grid.template average_by_xy<nodeVW>(space_avg.V2_vw, V2_vw);		// --> [W] node
+		stats.V2_vw.push(space_avg.V2_vw, current_time);
+
+		w_square(W2_c, W, nodeC, grid);
+		grid.template average_by_xy<nodeC>(space_avg.W2_c, W2_c);		// --> [C] node
+		stats.W2_c.push(space_avg.W2_c, current_time);
+
+		// --- W^2 with node shifts (in: turbulence transport) ...
+#ifdef SCHEME_X4
+		w_square_x4(W2_u, W, nodeU, grid);
+		w_square_x4(W2_v, W, nodeV, grid);
+		w_square_x4(W2_uw, W, nodeUW, grid);
+		w_square_x4(W2_vw, W, nodeVW, grid);
+#else
+		w_square(W2_u, W, nodeU, grid);
+		w_square(W2_v, W, nodeV, grid);
+		w_square(W2_uw, W, nodeUW, grid);
+		w_square(W2_vw, W, nodeVW, grid);
+#endif
+		grid.template average_by_xy<nodeU>(space_avg.W2_u, W2_u);			// --> [C] node
+		grid.template average_by_xy<nodeV>(space_avg.W2_v, W2_v);			// --> [C] node
+		grid.template average_by_xy<nodeUW>(space_avg.W2_uw, W2_uw);			// --> [W] node
+		grid.template average_by_xy<nodeVW>(space_avg.W2_vw, W2_vw);			// --> [W] node
+
+		stats.W2_u.push(space_avg.W2_u, current_time);
+		stats.W2_v.push(space_avg.W2_v, current_time);
+		stats.W2_uw.push(space_avg.W2_uw, current_time);
+		stats.W2_vw.push(space_avg.W2_vw, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+		// --- mixed products ...
+#ifdef SCHEME_X4
+		uv_product_x4(UV, U, V, nodeUV, grid);
+#else
+		uv_product(UV, U, V, nodeUV, grid);
+#endif
+		grid.template average_by_xy<nodeUV>(space_avg.UV, UV);				// --> [C] node
+		stats.UV.push(space_avg.UV, current_time);
+
+#ifdef SCHEME_X4
+		uw_product_x4(UW, U, W, nodeUW, grid);
+#else
+		uw_product(UW, U, W, nodeUW, grid);
+#endif
+		grid.template average_by_xy<nodeUW>(space_avg.UW, UW);				// --> [W] node
+		stats.UW.push(space_avg.UW, current_time);
+
+#ifdef SCHEME_X4
+		vw_product_x4(VW, V, W, nodeVW, grid);
+#else
+		vw_product(VW, V, W, nodeVW, grid);
+#endif
+		grid.template average_by_xy<nodeVW>(space_avg.VW, VW);				// --> [W] node
+		stats.VW.push(space_avg.VW, current_time);
+
+#ifdef SCHEME_X4
+		uv_product_x4(UV_uvw, U, V, nodeUVW, grid);
+		uw_product_x4(UW_uvw, U, W, nodeUVW, grid);
+		vw_product_x4(VW_uvw, V, W, nodeUVW, grid);
+#else
+		uv_product(UV_uvw, U, V, nodeUVW, grid);
+		uw_product(UW_uvw, U, W, nodeUVW, grid);
+		vw_product(VW_uvw, V, W, nodeUVW, grid);
+#endif
+		grid.template average_by_xy<nodeUVW>(space_avg.UV_uvw, UV_uvw);		// --> [W] node
+		grid.template average_by_xy<nodeUVW>(space_avg.UW_uvw, UW_uvw);		// --> [W] node
+		grid.template average_by_xy<nodeUVW>(space_avg.VW_uvw, VW_uvw);		// --> [W] node
+
+		stats.UV_uvw.push(space_avg.UV_uvw, current_time);
+		stats.UW_uvw.push(space_avg.UW_uvw, current_time);
+		stats.VW_uvw.push(space_avg.VW_uvw, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+		// --- P * Ui ...
+#ifdef SCHEME_X4
+		cu_product_x4(PU, Pressure, U, nodeU, grid);
+		cv_product_x4(PV, Pressure, V, nodeV, grid);
+#else
+		cu_product(PU, Pressure, U, nodeU, grid);
+		cv_product(PV, Pressure, V, nodeV, grid);
+#endif
+		cw_product(PW, Pressure, W, nodeW, grid);
+
+		grid.template average_by_xy<nodeU>(space_avg.PU, PU);				// --> [C] node
+		grid.template average_by_xy<nodeV>(space_avg.PV, PV);				// --> [C] node
+		grid.template average_by_xy<nodeW>(space_avg.PW, PW);				// --> [W] node
+
+		stats.PU.push(space_avg.PU, current_time);
+		stats.PV.push(space_avg.PV, current_time);
+		stats.PW.push(space_avg.PW, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+		// --- special Ui * W products (in turbulence production)
+#ifdef SCHEME_X4
+		uw_product_partition_x4(UW_bottom, UW_top, U, W, nodeU, grid);
+		vw_product_partition_x4(VW_bottom, VW_top, V, W, nodeV, grid);
+
+		uw_product_partition_x4(UW_bottom_uv, UW_top_uv, U, W, nodeUV, grid);
+		vw_product_partition_x4(VW_bottom_uv, VW_top_uv, V, W, nodeUV, grid);
+
+		uw_product_partition_x4(UW_bottom_uw, UW_top_uw, U, W, nodeUW, grid);
+		vw_product_partition_x4(VW_bottom_vw, VW_top_vw, V, W, nodeVW, grid);
+#else
+		uw_product_partition(UW_bottom, UW_top, U, W, nodeU, grid);
+		vw_product_partition(VW_bottom, VW_top, V, W, nodeV, grid);
+
+		uw_product_partition(UW_bottom_uv, UW_top_uv, U, W, nodeUV, grid);
+		vw_product_partition(VW_bottom_uv, VW_top_uv, V, W, nodeUV, grid);
+
+		uw_product_partition(UW_bottom_uw, UW_top_uw, U, W, nodeUW, grid);
+		vw_product_partition(VW_bottom_vw, VW_top_vw, V, W, nodeVW, grid);
+#endif
+		grid.template average_by_xy<nodeU>(space_avg.UW_bottom, UW_bottom);			// --> ~(between)[C,W] nodes
+		grid.template average_by_xy<nodeV>(space_avg.VW_bottom, VW_bottom);			// --> ~(between)[C,W] nodes
+		grid.template average_by_xy<nodeU>(space_avg.UW_top, UW_top);				// --> ~(between)[C,W] nodes
+		grid.template average_by_xy<nodeV>(space_avg.VW_top, VW_top);				// --> ~(between)[C,W] nodes
+
+		grid.template average_by_xy<nodeUV>(space_avg.UW_bottom_uv, UW_bottom_uv);		// --> ~(between)[C,W] nodes
+		grid.template average_by_xy<nodeUV>(space_avg.VW_bottom_uv, VW_bottom_uv);		// --> ~(between)[C,W] nodes
+		grid.template average_by_xy<nodeUV>(space_avg.UW_top_uv, UW_top_uv);			// --> ~(between)[C,W] nodes
+		grid.template average_by_xy<nodeUV>(space_avg.VW_top_uv, VW_top_uv);			// --> ~(between)[C,W] nodes
+
+		grid.template average_by_xy<nodeUW>(space_avg.UW_bottom_uw, UW_bottom_uw);		// --> ~(between)[W,C] nodes
+		grid.template average_by_xy<nodeVW>(space_avg.VW_bottom_vw, VW_bottom_vw);		// --> ~(between)[W,C] nodes
+		grid.template average_by_xy<nodeUW>(space_avg.UW_top_uw, UW_top_uw);			// --> ~(between)[W,C] nodes
+		grid.template average_by_xy<nodeVW>(space_avg.VW_top_vw, VW_top_vw);			// --> ~(between)[W,C] nodes
+
+		stats.UW_bottom.push(space_avg.UW_bottom, current_time);
+		stats.VW_bottom.push(space_avg.VW_bottom, current_time);
+		stats.UW_top.push(space_avg.UW_top, current_time);
+		stats.VW_top.push(space_avg.VW_top, current_time);
+
+		stats.UW_bottom_uv.push(space_avg.UW_bottom_uv, current_time);
+		stats.VW_bottom_uv.push(space_avg.VW_bottom_uv, current_time);
+		stats.UW_top_uv.push(space_avg.UW_top_uv, current_time);
+		stats.VW_top_uv.push(space_avg.VW_top_uv, current_time);
+
+		stats.UW_bottom_uw.push(space_avg.UW_bottom_uw, current_time);
+		stats.VW_bottom_vw.push(space_avg.VW_bottom_vw, current_time);
+		stats.UW_top_uw.push(space_avg.UW_top_uw, current_time);
+		stats.VW_top_vw.push(space_avg.VW_top_vw, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+		// --- W * (dU/dz) and W * (dV/dz) (in turbulence transport)
+#ifdef SCHEME_X4
+		uw_advection_x4(UW_adv, U, W, nodeUW, grid);
+#else
+		uw_advection(UW_adv, U, W, nodeUW, grid);
+#endif
+		grid.template average_by_xy<nodeUW>(space_avg.UW_adv, UW_adv);			// --> [W] node
+		stats.UW_adv.push(space_avg.UW_adv, current_time);
+
+#ifdef SCHEME_X4
+		vw_advection_x4(VW_adv, V, W, nodeVW, grid);
+#else
+		vw_advection(VW_adv, V, W, nodeVW, grid);
+#endif
+		grid.template average_by_xy<nodeVW>(space_avg.VW_adv, VW_adv);			// --> [W] node
+		stats.VW_adv.push(space_avg.VW_adv, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+
+		// - [Ui * Uj * Uk] ... 
+		// ---------------------------------------------------------------------------------- //
+
+		// --- Ui * Ui * W products ...
+#ifdef SCHEME_X4
+		u2w_product_x4(U2W, U, W, nodeUW, grid);
+		v2w_product_x4(V2W, V, W, nodeVW, grid);
+#else
+		u2w_product(U2W, U, W, nodeUW, grid);
+		v2w_product(V2W, V, W, nodeVW, grid);
+#endif
+		w2w_product(W2W, W, nodeC, grid);
+
+		grid.template average_by_xy<nodeUW>(space_avg.U2W, U2W);			// --> [W] node
+		grid.template average_by_xy<nodeVW>(space_avg.V2W, V2W);			// --> [W] node
+		grid.template average_by_xy<nodeC>(space_avg.W2W, W2W);				// --> [C] node
+
+		stats.U2W.push(space_avg.U2W, current_time);
+		stats.V2W.push(space_avg.V2W, current_time);
+		stats.W2W.push(space_avg.W2W, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+		// --- Ui * Uj * W products ...
+#ifdef SCHEME_X4
+		uvw_product_x4(UVW, U, V, W, nodeUVW, grid);
+		uww_product_x4(UWW, U, W, nodeU, grid);
+		vww_product_x4(VWW, V, W, nodeV, grid);
+#else
+		uvw_product(UVW, U, V, W, nodeUVW, grid);
+		uww_product(UWW, U, W, nodeU, grid);
+		vww_product(VWW, V, W, nodeV, grid);
+#endif
+		grid.template average_by_xy<nodeUVW>(space_avg.UVW, UVW);			// --> [W] node
+		grid.template average_by_xy<nodeU>(space_avg.UWW, UWW);				// --> [C] node
+		grid.template average_by_xy<nodeV>(space_avg.VWW, VWW);				// --> [C] node
+		stats.UVW.push(space_avg.UVW, current_time);
+		stats.UWW.push(space_avg.UWW, current_time);
+		stats.VWW.push(space_avg.VWW, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+
+		// - Dissipation ...
+		// ---------------------------------------------------------------------------------- //
+
+		// --- laplace(U[i]) ...
+#ifdef SCHEME_X4
+		u_set_diffusion_x4(U_diff, U, fluid.kinematic_viscosity, grid);
+		v_set_diffusion_x4(V_diff, V, fluid.kinematic_viscosity, grid);
+		w_set_diffusion_x4(W_diff, W, fluid.kinematic_viscosity, grid);
+#else
+		u_set_diffusion(U_diff, U, fluid.kinematic_viscosity, grid);
+		v_set_diffusion(V_diff, V, fluid.kinematic_viscosity, grid);
+		w_set_diffusion(W_diff, W, fluid.kinematic_viscosity, grid);
+#endif
+		diffusion_exch(U_diff, V_diff, W_diff, grid);
+		// ---------------------------------------------------------------------------------- //
+
+		// --- U[i] * laplace(U[i]) ...
+		uu_product(U_diss, U, U_diff, nodeU, grid);
+#ifdef SCHEME_X4
+		u_iso_dissipation_x4(U_iso_diss, U, fluid.kinematic_viscosity, grid);
+#else
+		u_iso_dissipation(U_iso_diss, U, fluid.kinematic_viscosity, grid);
+#endif
+		grid.template average_by_xy<nodeU>(space_avg.U_diss, U_diss);			// --> [C] node
+		grid.template average_by_xy<nodeU>(space_avg.U_iso_diss, U_iso_diss);	// --> [C] node
+		stats.U_diss.push(space_avg.U_diss, current_time);
+		stats.U_iso_diss.push(space_avg.U_iso_diss, current_time);
+
+		vv_product(V_diss, V, V_diff, nodeV, grid);
+#ifdef SCHEME_X4
+		v_iso_dissipation_x4(V_iso_diss, V, fluid.kinematic_viscosity, grid);
+#else
+		v_iso_dissipation(V_iso_diss, V, fluid.kinematic_viscosity, grid);
+#endif
+		grid.template average_by_xy<nodeV>(space_avg.V_diss, V_diss);			// --> [C] node
+		grid.template average_by_xy<nodeV>(space_avg.V_iso_diss, V_iso_diss);	// --> [C] node
+		stats.V_diss.push(space_avg.V_diss, current_time);
+		stats.V_iso_diss.push(space_avg.V_iso_diss, current_time);
+
+		ww_product(W_diss, W, W_diff, nodeW, grid);
+#ifdef SCHEME_X4
+		w_iso_dissipation_x4(W_iso_diss, W, fluid.kinematic_viscosity, grid);
+#else
+		w_iso_dissipation(W_iso_diss, W, fluid.kinematic_viscosity, grid);
+#endif
+		grid.template average_by_xy<nodeW>(space_avg.W_diss, W_diss);			// --> [W] node
+		grid.template average_by_xy<nodeW>(space_avg.W_iso_diss, W_iso_diss);	// --> [W] node
+		stats.W_diss.push(space_avg.W_diss, current_time);
+		stats.W_iso_diss.push(space_avg.W_iso_diss, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+		// --- U[j] * laplace(U[i]) + U[i] * laplace(U[j]) ...
+#ifdef SCHEME_X4
+		uv_dissipation_x4(UV_diss, U, V, U_diff, V_diff, grid);
+		uw_dissipation_x4(UW_diss, U, W, U_diff, W_diff, grid);
+		vw_dissipation_x4(VW_diss, V, W, V_diff, W_diff, grid);
+#else
+		uv_dissipation(UV_diss, U, V, U_diff, V_diff, grid);
+		uw_dissipation(UW_diss, U, W, U_diff, W_diff, grid);
+		vw_dissipation(VW_diss, V, W, V_diff, W_diff, grid);
+#endif
+		grid.template average_by_xy<nodeUV>(space_avg.UV_diss, UV_diss);			// --> [C] node
+		grid.template average_by_xy<nodeUW>(space_avg.UW_diss, UW_diss);			// --> [W] node
+		grid.template average_by_xy<nodeVW>(space_avg.VW_diss, VW_diss);			// --> [W] node
+		stats.UV_diss.push(space_avg.UV_diss, current_time);
+		stats.UW_diss.push(space_avg.UW_diss, current_time);
+		stats.VW_diss.push(space_avg.VW_diss, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+		// --- Ui * Uj iso dissipation partition ...
+		//	--- [UV] ...
+#ifdef SCHEME_X4
+		uv_iso_dissipation_components_x4(iso_diss_x, iso_diss_y, iso_diss_z,
+			U, V, fluid.kinematic_viscosity, grid);
+#else
+		uv_iso_dissipation_components(iso_diss_x, iso_diss_y, iso_diss_z,
+			U, V, fluid.kinematic_viscosity, grid);
+#endif
+		iso_dissipation_exch(iso_diss_x, iso_diss_y, iso_diss_z, grid);
+#ifdef SCHEME_X4
+		uv_iso_dissipation_x4(UV_iso_diss,
+			iso_diss_x, iso_diss_y, iso_diss_z, grid);
+#else
+		uv_iso_dissipation(UV_iso_diss,
+			iso_diss_x, iso_diss_y, iso_diss_z, grid);
+#endif
+		//	--- [UW] ...
+#ifdef SCHEME_X4
+		uw_iso_dissipation_components_x4(iso_diss_x, iso_diss_y, iso_diss_z,
+			U, W, fluid.kinematic_viscosity, grid);
+#else
+		uw_iso_dissipation_components(iso_diss_x, iso_diss_y, iso_diss_z,
+			U, W, fluid.kinematic_viscosity, grid);
+#endif
+		iso_dissipation_exch(iso_diss_x, iso_diss_y, iso_diss_z, grid);
+#ifdef SCHEME_X4
+		uw_iso_dissipation_x4(UW_iso_diss,
+			iso_diss_x, iso_diss_y, iso_diss_z, grid);
+#else
+		uw_iso_dissipation(UW_iso_diss,
+			iso_diss_x, iso_diss_y, iso_diss_z, grid);
+#endif
+		//	--- [VW] ...
+#ifdef SCHEME_X4
+		vw_iso_dissipation_components_x4(iso_diss_x, iso_diss_y, iso_diss_z,
+			V, W, fluid.kinematic_viscosity, grid);
+#else
+		vw_iso_dissipation_components(iso_diss_x, iso_diss_y, iso_diss_z,
+			V, W, fluid.kinematic_viscosity, grid);
+#endif
+		iso_dissipation_exch(iso_diss_x, iso_diss_y, iso_diss_z, grid);
+#ifdef SCHEME_X4
+		vw_iso_dissipation_x4(VW_iso_diss,
+			iso_diss_x, iso_diss_y, iso_diss_z, grid);
+#else
+		vw_iso_dissipation(VW_iso_diss,
+			iso_diss_x, iso_diss_y, iso_diss_z, grid);
+#endif
+
+		grid.template average_by_xy<nodeUV>(space_avg.UV_iso_diss, UV_iso_diss);		// --> [C] node
+		grid.template average_by_xy<nodeUW>(space_avg.UW_iso_diss, UW_iso_diss);		// --> [W] node
+		grid.template average_by_xy<nodeVW>(space_avg.VW_iso_diss, VW_iso_diss);		// --> [W] node
+		stats.UV_iso_diss.push(space_avg.UV_iso_diss, current_time);
+		stats.UW_iso_diss.push(space_avg.UW_iso_diss, current_time);
+		stats.VW_iso_diss.push(space_avg.VW_iso_diss, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+
+		// - Pressure * Strain ...
+		// ---------------------------------------------------------------------------------- //
+
+#ifdef SCHEME_X4
+		pressure_strain_diag_x4(PSuu, PSvv, PSww,
+			Pressure, U, V, W, grid);
+#else
+		pressure_strain_diag(PSuu, PSvv, PSww,
+			Pressure, U, V, W, grid);
+#endif
+		grid.template average_by_xy<nodeC>(space_avg.PSuu, PSuu);		// --> [C] node
+		grid.template average_by_xy<nodeC>(space_avg.PSvv, PSvv);		// --> [C] node
+		grid.template average_by_xy<nodeC>(space_avg.PSww, PSww);		// --> [C] node
+		stats.PSuu.push(space_avg.PSuu, current_time);
+		stats.PSvv.push(space_avg.PSvv, current_time);
+		stats.PSww.push(space_avg.PSww, current_time);
+
+#ifdef SCHEME_X4
+		pressure_strain_uv_x4(P2Suv, Pressure, U, V, grid);
+		pressure_strain_uw_x4(P2Suw, Pressure, U, W, grid);
+		pressure_strain_vw_x4(P2Svw, Pressure, V, W, grid);
+#else
+		pressure_strain_uv(P2Suv, Pressure, U, V, grid);
+		pressure_strain_uw(P2Suw, Pressure, U, W, grid);
+		pressure_strain_vw(P2Svw, Pressure, V, W, grid);
+#endif
+		grid.template average_by_xy< nodeUV >(space_avg.P2Suv, P2Suv);
+		grid.template average_by_xy< nodeUW >(space_avg.P2Suw, P2Suw);
+		grid.template average_by_xy< nodeVW >(space_avg.P2Svw, P2Svw);
+		stats.P2Suv.push(space_avg.P2Suv, current_time);
+		stats.P2Suw.push(space_avg.P2Suw, current_time);
+		stats.P2Svw.push(space_avg.P2Svw, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+
+#ifdef COMPUTE_XT_AVERAGES
+		// - 2D averages ...
+		// ---------------------------------------------------------------------------------- //
+
+		// --- Ui ...
+		grid.template average_by_x<nodeU>(space_avg.Uyz, U);			// --> [C] node	
+		stats.Uyz.push(space_avg.Uyz, current_time);
+
+		grid.template average_by_x<nodeV>(space_avg.Vyz, V);			// --> [V] node
+		stats.Vyz.push(space_avg.Vyz, current_time);
+
+		grid.template average_by_x<nodeW>(space_avg.Wyz, W);			// --> [W] node
+		stats.Wyz.push(space_avg.Wyz, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+		// --- Ui * Ui ...
+		grid.template average_by_x<nodeU>(space_avg.U2yz, U2_u);		// --> [C] node
+		stats.U2yz.push(space_avg.U2yz, current_time);
+
+		grid.template average_by_x<nodeV>(space_avg.V2yz, V2_v);		// --> [V] node
+		stats.V2yz.push(space_avg.V2yz, current_time);
+
+		grid.template average_by_x<nodeW>(space_avg.W2yz, W2_w);		// --> [W] node
+		stats.W2yz.push(space_avg.W2yz, current_time);
+		// ---------------------------------------------------------------------------------- //
+
+		// --- Ui * Uj ...
+		grid.template average_by_x<nodeUV>(space_avg.UVyz, UV);		// --> [V] node
+		stats.UVyz.push(space_avg.UVyz, current_time);
+
+		grid.template average_by_x<nodeUW>(space_avg.UWyz, UW);		// --> [W] node
+		stats.UWyz.push(space_avg.UWyz, current_time);
+
+		grid.template average_by_x<nodeVW>(space_avg.VWyz, VW);		// --> [VW] node
+		stats.VWyz.push(space_avg.VWyz, current_time);
+		// ---------------------------------------------------------------------------------- //
+#endif
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+#ifdef STRATIFICATION
// gather_heat_eq_statistics(): accumulates -z profile statistics for the
// temperature (heat) equation under STRATIFICATION.
//
//   T_rms_max (out): maximum over the -z profile of the temperature r.m.s.
//       deviation, reduced over the grid MPI communicator. Computed on every
//       call, independently of whether statistics are pushed this step.
//
// All other results are horizontally (x,y)-averaged profiles (and, with
// COMPUTE_XT_AVERAGES, (y,z) slices averaged over -x) pushed into the
// time-series accumulators in [stats] -- but only when sampling is active
// (see [stat_mode] below).
//
// NOTE(review): assumes [Tx], [Tsh], the velocity fields and the velocity
// diffusion fields (U_diff, V_diff, W_diff) are up to date on entry -- these
// are prepared by the caller and not visible in this function.
template< typename T, nse::memType mem >
void nse::modelObj<T, mem>::gather_heat_eq_statistics(
	T *T_rms_max)
{
	// sampling gate: nonzero only when time averaging has begun
	// (current_time >= stats.begin) and the current step index hits the
	// sampling stride (stats.time_mod)
	const int stat_mode = (current_time >= stats.begin) &&
		((stats.time_index % stats.time_mod) == 0);


	// T-T^2 slices ...
	// [T] and [T^2] profiles are computed unconditionally: both are needed
	// below for the RMS estimate even when no statistics are pushed
	grid.template average_by_xy<nodeC>(space_avg.Tc, Tx);		// --> [C](z) node
	if (stat_mode) stats.Tc.push(space_avg.Tc, current_time);

	c_square(T2_c, Tx, nodeC, grid);	// T^2 at [C] nodes
	grid.template average_by_xy<nodeC>(space_avg.T2_c, T2_c);	// --> [C](z) node
	if (stat_mode) stats.T2_c.push(space_avg.T2_c, current_time);

	// RMS ...
	// max over -z of the deviation built from <T> and <T^2>, MPI-reduced
	(*T_rms_max) = (T)mpi_max_deviation(space_avg.Tc, space_avg.T2_c,
		grid.nz, grid.mpi_com.comm);

	if (stat_mode)
	{
		// - [T] full halo exchange for calculation of:
		//       [T * U * W], [T * V * W], [T * U [-uw node]], [T * V [-vw node]]
		// ---------------------------------------------------------------------------------- //
		temperature_halo_exch(Tx, grid);
		// ---------------------------------------------------------------------------------- //

		// - [T] with removed linear profile (extending precision in dissipation) ...
		// ---------------------------------------------------------------------------------- //
		grid.template average_by_xy<nodeC>(space_avg.Tsh, Tsh);		// --> [C] node
		stats.Tsh.push(space_avg.Tsh, current_time);
		// ---------------------------------------------------------------------------------- //


		// - [T^2] with node shift ...
		// ---------------------------------------------------------------------------------- //
		c_square(T2_w, Tx, nodeW, grid);
		grid.template average_by_xy<nodeW>(space_avg.T2_w, T2_w);	// --> [W] node
		stats.T2_w.push(space_avg.T2_w, current_time);
		// ---------------------------------------------------------------------------------- //


		// - [T * Ui], [T * P], [W * dT/dz] ...
		// ---------------------------------------------------------------------------------- //

		// --- [T * Ui] ...
		// (SCHEME_X4 selects the 4th-order-scheme variants throughout)
#ifdef SCHEME_X4
		cu_product_x4(TU, Tx, U, nodeU, grid);
		cv_product_x4(TV, Tx, V, nodeV, grid);
#else
		cu_product(TU, Tx, U, nodeU, grid);
		cv_product(TV, Tx, V, nodeV, grid);
#endif
		grid.template average_by_xy<nodeU>(space_avg.TU, TU);		// --> [C] node
		grid.template average_by_xy<nodeV>(space_avg.TV, TV);		// --> [C] node
		stats.TU.push(space_avg.TU, current_time);
		stats.TV.push(space_avg.TV, current_time);

		cw_product(TW, Tx, W, nodeW, grid);
		grid.template average_by_xy<nodeW>(space_avg.TW, TW);		// --> [W] node
		stats.TW.push(space_avg.TW, current_time);
		// ---------------------------------------------------------------------------------- //

		// --- [T * Ui] with node shifts ...
#ifdef SCHEME_X4
		cu_product_x4(TU_uw, Tx, U, nodeUW, grid);
		cv_product_x4(TV_vw, Tx, V, nodeVW, grid);
#else
		cu_product(TU_uw, Tx, U, nodeUW, grid);
		cv_product(TV_vw, Tx, V, nodeVW, grid);
#endif
		grid.template average_by_xy<nodeUW>(space_avg.TU_uw, TU_uw);			// --> [W] node
		grid.template average_by_xy<nodeVW>(space_avg.TV_vw, TV_vw);			// --> [W] node
		stats.TU_uw.push(space_avg.TU_uw, current_time);
		stats.TV_vw.push(space_avg.TV_vw, current_time);

#ifdef SCHEME_X4
		cw_product_x4(TW_uw, Tx, W, nodeUW, grid);
		cw_product_x4(TW_vw, Tx, W, nodeVW, grid);
#else
		cw_product(TW_uw, Tx, W, nodeUW, grid);
		cw_product(TW_vw, Tx, W, nodeVW, grid);
#endif
		grid.template average_by_xy<nodeUW>(space_avg.TW_uw, TW_uw);			// --> [W] node
		grid.template average_by_xy<nodeVW>(space_avg.TW_vw, TW_vw);			// --> [W] node
		stats.TW_uw.push(space_avg.TW_uw, current_time);
		stats.TW_vw.push(space_avg.TW_vw, current_time);
		// ---------------------------------------------------------------------------------- //

		// --- T * P ...
		cc_product(TP, Tx, Pressure, nodeC, grid);
		grid.template average_by_xy<nodeC>(space_avg.TP, TP);		// --> [C] node
		stats.TP.push(space_avg.TP, current_time);
		// ---------------------------------------------------------------------------------- //

		// --- special [T * W] approximations (in [T'^2] production)
		cw_product_partition(TW_bottom, TW_top, Tx, W, nodeC, grid);

		grid.template average_by_xy<nodeC>(space_avg.TW_bottom, TW_bottom);	// --> ~(between)[C,W] nodes
		stats.TW_bottom.push(space_avg.TW_bottom, current_time);

		grid.template average_by_xy<nodeC>(space_avg.TW_top, TW_top);		// --> ~(between)[C,W] nodes
		stats.TW_top.push(space_avg.TW_top, current_time);
		// ---------------------------------------------------------------------------------- //

		// --- special [T * W] approximations (in [T' * Ui'] flux production)
#ifdef SCHEME_X4
		cw_product_partition_x4(TW_bottom_u, TW_top_u, Tx, W, nodeU, grid);
		cw_product_partition_x4(TW_bottom_v, TW_top_v, Tx, W, nodeV, grid);
		cw_product_partition_x4(TW_bottom_w, TW_top_w, Tx, W, nodeW, grid);
#else
		cw_product_partition(TW_bottom_u, TW_top_u, Tx, W, nodeU, grid);
		cw_product_partition(TW_bottom_v, TW_top_v, Tx, W, nodeV, grid);
		cw_product_partition(TW_bottom_w, TW_top_w, Tx, W, nodeW, grid);
#endif

		grid.template average_by_xy<nodeU>(space_avg.TW_bottom_u, TW_bottom_u);	// --> ~(between)[C,W] nodes
		grid.template average_by_xy<nodeV>(space_avg.TW_bottom_v, TW_bottom_v);	// --> ~(between)[C,W] nodes
		grid.template average_by_xy<nodeW>(space_avg.TW_bottom_w, TW_bottom_w);	// --> ~(between)[W,C] nodes
		grid.template average_by_xy<nodeU>(space_avg.TW_top_u, TW_top_u);		// --> ~(between)[C,W] nodes
		grid.template average_by_xy<nodeV>(space_avg.TW_top_v, TW_top_v);		// --> ~(between)[C,W] nodes
		grid.template average_by_xy<nodeW>(space_avg.TW_top_w, TW_top_w);		// --> ~(between)[W,C] nodes

		stats.TW_bottom_u.push(space_avg.TW_bottom_u, current_time);
		stats.TW_bottom_v.push(space_avg.TW_bottom_v, current_time);
		stats.TW_bottom_w.push(space_avg.TW_bottom_w, current_time);
		stats.TW_top_u.push(space_avg.TW_top_u, current_time);
		stats.TW_top_v.push(space_avg.TW_top_v, current_time);
		stats.TW_top_w.push(space_avg.TW_top_w, current_time);
		// ---------------------------------------------------------------------------------- //

		// --- W * (dT/dz) (in [T'^2] diffusion)
		cw_advection(TW_adv, Tx, W, nodeW, grid);
		grid.template average_by_xy<nodeW>(space_avg.TW_adv, TW_adv);	// --> [W] node
		stats.TW_adv.push(space_avg.TW_adv, current_time);
		// ---------------------------------------------------------------------------------- //


		// - [T * Ui * Uj] ...
		// ---------------------------------------------------------------------------------- //

		// --- [T^2 * W] ...
		c2w_product(T2W, Tx, W, nodeW, grid);
		grid.template average_by_xy<nodeW>(space_avg.T2W, T2W);			// --> [W] node
		stats.T2W.push(space_avg.T2W, current_time);
		// ---------------------------------------------------------------------------------- //

		// --- [T * Ui * W] ...
#ifdef SCHEME_X4
		cuw_product_x4(TUW, Tx, U, W, nodeUW, grid);
		cvw_product_x4(TVW, Tx, V, W, nodeVW, grid);
#else
		cuw_product(TUW, Tx, U, W, nodeUW, grid);
		cvw_product(TVW, Tx, V, W, nodeVW, grid);
#endif
		cww_product(TWW, Tx, W, nodeC, grid);

		grid.template average_by_xy<nodeUW>(space_avg.TUW, TUW);		// --> [W] node
		grid.template average_by_xy<nodeVW>(space_avg.TVW, TVW);		// --> [W] node
		grid.template average_by_xy<nodeC>(space_avg.TWW, TWW);			// --> [C] node
		stats.TUW.push(space_avg.TUW, current_time);
		stats.TVW.push(space_avg.TVW, current_time);
		stats.TWW.push(space_avg.TWW, current_time);
		// ---------------------------------------------------------------------------------- //


		// - Dissipation ...
		// ---------------------------------------------------------------------------------- //
		// note*: using T with removed linear profile [Tsh]
		//

		// --- laplace(T) ...
		// (halo exchange on T_diff before it is used in product terms below)
#ifdef SCHEME_X4
		c_set_diffusion_x4(T_diff, Tsh, fluid.diffusivity, grid);
#else
		c_set_diffusion(T_diff, Tsh, fluid.diffusivity, grid);
#endif
		diffusion_exch(T_diff, grid);
		// ---------------------------------------------------------------------------------- //

		// --- T * laplace(T) ...
		cc_product(T_diss, Tsh, T_diff, nodeC, grid);
#ifdef SCHEME_X4
		c_iso_dissipation_x4(T_iso_diss, Tsh, fluid.diffusivity, grid);
#else
		c_iso_dissipation(T_iso_diss, Tsh, fluid.diffusivity, grid);
#endif
		grid.template average_by_xy<nodeC>(space_avg.T_diss, T_diss);			// --> [C] node
		grid.template average_by_xy<nodeC>(space_avg.T_iso_diss, T_iso_diss);	// --> [C] node
		stats.T_diss.push(space_avg.T_diss, current_time);
		stats.T_iso_diss.push(space_avg.T_iso_diss, current_time);
		// ---------------------------------------------------------------------------------- //

		// --- T * laplace(U[i]) + U[i] * laplace(T) ...
		// NOTE(review): relies on U_diff, V_diff, W_diff having been computed
		// earlier in the statistics pass -- confirm against caller
#ifdef SCHEME_X4
		cu_dissipation_x4(TU_diss, Tsh, U, T_diff, U_diff, grid);
		cv_dissipation_x4(TV_diss, Tsh, V, T_diff, V_diff, grid);
		cw_dissipation_x4(TW_diss, Tsh, W, T_diff, W_diff, grid);
#else
		cu_dissipation(TU_diss, Tsh, U, T_diff, U_diff, grid);
		cv_dissipation(TV_diss, Tsh, V, T_diff, V_diff, grid);
		cw_dissipation(TW_diss, Tsh, W, T_diff, W_diff, grid);
#endif
		grid.template average_by_xy<nodeU>(space_avg.TU_diss, TU_diss);			// --> [C] node
		grid.template average_by_xy<nodeV>(space_avg.TV_diss, TV_diss);			// --> [C] node
		grid.template average_by_xy<nodeW>(space_avg.TW_diss, TW_diss);			// --> [W] node
		stats.TU_diss.push(space_avg.TU_diss, current_time);
		stats.TV_diss.push(space_avg.TV_diss, current_time);
		stats.TW_diss.push(space_avg.TW_diss, current_time);
		// ---------------------------------------------------------------------------------- //


		// --- T * grad(Pressure) ...
		// ---------------------------------------------------------------------------------- //

#ifdef SCHEME_X4
		c_u_pressure_gradient_x4(T_dPdx, Tx, Pressure, grid);
		c_v_pressure_gradient_x4(T_dPdy, Tx, Pressure, grid);
		c_w_pressure_gradient_x4(T_dPdz, Tx, Pressure, grid);
#else
		c_u_pressure_gradient(T_dPdx, Tx, Pressure, grid);
		c_v_pressure_gradient(T_dPdy, Tx, Pressure, grid);
		c_w_pressure_gradient(T_dPdz, Tx, Pressure, grid);
#endif
		grid.template average_by_xy<nodeU>(space_avg.T_dPdx, T_dPdx);		// --> [C] node
		grid.template average_by_xy<nodeV>(space_avg.T_dPdy, T_dPdy);		// --> [C] node
		grid.template average_by_xy<nodeW>(space_avg.T_dPdz, T_dPdz);		// --> [W] node
		stats.T_dPdx.push(space_avg.T_dPdx, current_time);
		stats.T_dPdy.push(space_avg.T_dPdy, current_time);
		stats.T_dPdz.push(space_avg.T_dPdz, current_time);
		// ---------------------------------------------------------------------------------- //


#ifdef COMPUTE_XT_AVERAGES
		// - 2D averages: (y,z) slices, averaged over -x only ...
		// ---------------------------------------------------------------------------------- //

		// --- T ...
		grid.template average_by_x<nodeC>(space_avg.Tyz, Tx);		// --> [C] node
		stats.Tyz.push(space_avg.Tyz, current_time);
		// ---------------------------------------------------------------------------------- //

		// --- T^2 ... (reuses T2_c computed above at [C] nodes)
		grid.template average_by_x<nodeC>(space_avg.T2yz, T2_c);	// --> [C] node
		stats.T2yz.push(space_avg.T2yz, current_time);
		// ---------------------------------------------------------------------------------- //

		// --- T * W ... (reuses TW computed above at [W] nodes)
		grid.template average_by_x<nodeW>(space_avg.TWyz, TW);		// --> [W] node
		stats.TWyz.push(space_avg.TWyz, current_time);
		// ---------------------------------------------------------------------------------- //
#endif
	}
}
+// -------------------------------------------------------------------------------------------- //
+#endif
+
+template< typename T, nse::memType mem >
+void nse::modelObj<T, mem>::calculate_ext_statistics(nseTurbVec< T >& avg)
+{
+
+	// - boundary conditions for -z velocity profiles ...
+	// ---------------------------------------------------------------------------------- //
+
+	// --- U, V, W, P ...
+	c_dirichlet_bc_z(avg.U,
+		-(T)0.5 * fluid.Umax, (T)0.5 * fluid.Umax, grid);
+	c_dirichlet_bc_z(avg.V, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.W, (T)0, (T)0, grid);
+	c_neumann_bc_z(avg.P, (T)0, (T)0, grid);
+
+	// --- U^2, V^2, W^2 ...
+	c_dirichlet_bc_z(avg.U2_u,
+		(T)0.25 * fluid.Umax * fluid.Umax,
+		(T)0.25 * fluid.Umax * fluid.Umax, grid);
+	c_dirichlet_bc_z(avg.V2_v, (T)0, (T)0, grid);
+	c_dirichlet_bc_z(avg.W2_c, (T)0, (T)0, grid);
+
+	w_dirichlet_bc_z(avg.U2_uw,
+		(T)0.25 * fluid.Umax * fluid.Umax,
+		(T)0.25 * fluid.Umax * fluid.Umax, grid);
+	w_dirichlet_bc_z(avg.V2_vw, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.W2_w, (T)0, (T)0, grid);
+
+	c_dirichlet_bc_z(avg.W2_u, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.W2_uw, (T)0, (T)0, grid);
+
+	c_dirichlet_bc_z(avg.W2_v, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.W2_vw, (T)0, (T)0, grid);
+
+	// --- UV, UW, VW, PW ...
+	c_dirichlet_bc_z(avg.UV, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.UW, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.VW, (T)0, (T)0, grid);
+
+	w_dirichlet_bc_z(avg.UV_uvw, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.UW_uvw, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.VW_uvw, (T)0, (T)0, grid);
+
+	w_dirichlet_bc_z(avg.PW, (T)0, (T)0, grid);
+
+	// --- W * dU/dz, W * dV/dz ...
+	w_dirichlet_bc_z(avg.UW_adv, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.VW_adv, (T)0, (T)0, grid);
+
+	// --- U^2W, V^2W, W^2W ...
+	w_dirichlet_bc_z(avg.U2W, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.V2W, (T)0, (T)0, grid);
+	c_dirichlet_bc_z(avg.W2W, (T)0, (T)0, grid);
+
+	// --- UVW, UWW, VWW ...
+	w_dirichlet_bc_z(avg.UVW, (T)0, (T)0, grid);
+	c_dirichlet_bc_z(avg.UWW, (T)0, (T)0, grid);
+	c_dirichlet_bc_z(avg.VWW, (T)0, (T)0, grid);
+
+	// --- W * laplace(W) ...
+	w_dirichlet_bc_z(avg.W_diss, (T)0, (T)0, grid);
+
+	// --- W * laplace(U[i]) + U[i] * laplace(W) ...
+	w_dirichlet_bc_z(avg.UW_diss, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.VW_diss, (T)0, (T)0, grid);
+
+	// --- grad(W)*grad(W) ...
+	w_dirichlet_bc_z(avg.W_iso_diss, (T)0, (T)0, grid);
+
+	// --- grad(U)*grad(W), grad(V)*grad(W) ...
+	w_dirichlet_bc_z(avg.UW_iso_diss, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.VW_iso_diss, (T)0, (T)0, grid);
+	// ---------------------------------------------------------------------------------- //
+
+
+#ifdef STRATIFICATION
+	// - boundary conditions for -z temperature profiles ...
+	// ---------------------------------------------------------------------------------- //
+
+	// --- T ...
+	c_dirichlet_bc_z(avg.Tc, fluid.T0, fluid.TH, grid);
+	c_dirichlet_bc_z(avg.Tsh, (T)0, (T)0, grid);
+
+	// --- T^2 ...
+	c_dirichlet_bc_z(avg.T2_c,
+		fluid.T0 * fluid.T0, fluid.TH * fluid.TH, grid);
+	w_dirichlet_bc_z(avg.T2_w,
+		fluid.T0 * fluid.T0, fluid.TH * fluid.TH, grid);
+
+	// --- TU, TV, TW ...
+	c_dirichlet_bc_z(avg.TU,
+		-(T)0.5 * fluid.T0 * fluid.Umax,
+		(T)0.5 * fluid.TH * fluid.Umax, grid);
+	c_dirichlet_bc_z(avg.TV, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.TW, (T)0, (T)0, grid);
+
+	w_dirichlet_bc_z(avg.TU_uw,
+		-(T)0.5 * fluid.T0 * fluid.Umax,
+		(T)0.5 * fluid.TH * fluid.Umax, grid);
+	w_dirichlet_bc_z(avg.TV_vw, (T)0, (T)0, grid);
+
+	w_dirichlet_bc_z(avg.TW_uw, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.TW_vw, (T)0, (T)0, grid);
+
+	// --- W * dT/dz ...
+	w_dirichlet_bc_z(avg.TW_adv, (T)0, (T)0, grid);
+
+	// --- T^2W ...
+	w_dirichlet_bc_z(avg.T2W, (T)0, (T)0, grid);
+
+	// --- TUW, TVW, TWW ...
+	w_dirichlet_bc_z(avg.TUW, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.TVW, (T)0, (T)0, grid);
+	c_dirichlet_bc_z(avg.TWW, (T)0, (T)0, grid);
+
+	// --- W * laplace(T) + T * laplace(W) ...
+	w_dirichlet_bc_z(avg.TW_diss, (T)0, (T)0, grid);
+
+	// --- T * dP/dz ...
+	w_dirichlet_bc_z(avg.T_dPdz, (T)0, (T)0, grid);
+	// ---------------------------------------------------------------------------------- //
+#endif
+
+
+	// - deviations ...
+	// ---------------------------------------------------------------------------------- //
+	deviation(avg.U_deviation, avg.U, avg.U2_u, grid.nz);
+	deviation(avg.V_deviation, avg.V, avg.V2_v, grid.nz);
+	deviation(avg.W_deviation, avg.W, avg.W2_w, grid.nz);
+
+	// - turbulent fluxes (2nd order) ...
+	// ---------------------------------------------------------------------------------- //
+	uv_flux(avg.UV_flux, avg.UV, avg.U, avg.V, axisZ, grid);
+	uw_flux(avg.UW_flux, avg.UW, avg.U, avg.W, axisZ, grid);
+	vw_flux(avg.VW_flux, avg.VW, avg.V, avg.W, axisZ, grid);
+
+	pu_flux(avg.PU_flux, avg.PU, avg.P, avg.U, grid);
+	pv_flux(avg.PV_flux, avg.PV, avg.P, avg.V, grid);
+	pw_flux(avg.PW_flux, avg.PW, avg.P, avg.W, grid);
+
+	c_dirichlet_bc_z(avg.PU_flux, (T)0, (T)0, grid);
+	c_dirichlet_bc_z(avg.PV_flux, (T)0, (T)0, grid);
+
+	// - turbulent fluxes (3rd order): ui'ui'w' ...
+	// ---------------------------------------------------------------------------------- //
+	u2w_flux(avg.U2W_flux, avg.U2W, avg.U2_uw, avg.UW, avg.UW_adv, avg.U, avg.W, grid);
+	v2w_flux(avg.V2W_flux, avg.V2W, avg.V2_vw, avg.VW, avg.VW_adv, avg.V, avg.W, grid);
+	w2w_flux(avg.W2W_flux, avg.W2W, avg.W2_c, avg.W2_w, avg.W, grid);
+
+	c_dirichlet_bc_z(avg.W2W_flux, (T)0, (T)0, grid);
+
+	// - turbulent fluxes (3rd order): ui'uj'uk' ...
+	// ---------------------------------------------------------------------------------- //
+	uvw_flux(avg.UVW_flux, avg.UVW,
+		avg.UW, avg.VW, avg.UV_uvw, avg.UW_uvw, avg.VW_uvw,
+		avg.U, avg.V, avg.W, grid);
+
+	uww_flux(avg.UWW_flux, avg.UWW,
+		avg.W2_w, avg.W2_c, avg.W2_u, avg.W2_uw,
+		avg.UW, avg.UW_bottom_uw, avg.UW_top_uw,
+		avg.U, avg.W, grid);
+
+	vww_flux(avg.VWW_flux, avg.VWW,
+		avg.W2_w, avg.W2_c, avg.W2_v, avg.W2_vw,
+		avg.VW, avg.VW_bottom_vw, avg.VW_top_vw,
+		avg.V, avg.W, grid);
+
+	c_dirichlet_bc_z(avg.UWW_flux, (T)0, (T)0, grid);
+	c_dirichlet_bc_z(avg.VWW_flux, (T)0, (T)0, grid);
+
+	// - velocity gradients ...
+	// ---------------------------------------------------------------------------------- //
+	c_gradient_z(avg.U_grad, avg.U, axisZ, grid);
+	c_gradient_z(avg.V_grad, avg.V, axisZ, grid);
+	w_gradient_z(avg.W_grad, avg.W, axisZ, grid);
+
+
+#ifdef STRATIFICATION
+	// - deviations ...
+	// ---------------------------------------------------------------------------------- //
+	deviation(avg.T_deviation, avg.Tc, avg.T2_c, grid.nz);
+
+	// - turbulent fluxes (2nd order) ...
+	// ---------------------------------------------------------------------------------- //
+	cu_flux(avg.TU_flux, avg.TU, avg.Tc, avg.U, grid);
+	cv_flux(avg.TV_flux, avg.TV, avg.Tc, avg.V, grid);
+	cw_flux(avg.TW_flux, avg.TW, avg.Tc, avg.W, axisZ, grid);
+
+	cc_flux(avg.TP_flux, avg.TP, avg.Tc, avg.P, grid);
+	c_dirichlet_bc_z(avg.TP_flux, (T)0, (T)0, grid);
+
+	// - turbulent fluxes (3rd order): T'T'w' ...
+	// ---------------------------------------------------------------------------------- //
+	c2w_flux(avg.T2W_flux, avg.T2W, avg.T2_w, avg.TW, avg.TW_adv, avg.Tc, avg.W, grid);
+
+	// - turbulent fluxes (3rd order): T'ui'w' ...
+	// ---------------------------------------------------------------------------------- //
+	cuw_flux(avg.TUW_flux, avg.TUW,
+		avg.UW, avg.TU_uw, avg.TW, avg.TW_uw,
+		avg.Tc, avg.U, avg.W, grid);
+
+	cvw_flux(avg.TVW_flux, avg.TVW,
+		avg.VW, avg.TV_vw, avg.TW, avg.TW_vw,
+		avg.Tc, avg.V, avg.W, grid);
+
+	cww_flux(avg.TWW_flux, avg.TWW,
+		avg.W2_w, avg.W2_c,
+		avg.TW, avg.TW_bottom_w, avg.TW_top_w,
+		avg.Tc, avg.W, grid);
+
+	w_dirichlet_bc_z(avg.TUW_flux, (T)0, (T)0, grid);
+	w_dirichlet_bc_z(avg.TVW_flux, (T)0, (T)0, grid);
+	c_dirichlet_bc_z(avg.TWW_flux, (T)0, (T)0, grid);
+
+	// - temperature gradient ...
+	// ---------------------------------------------------------------------------------- //
+	c_gradient_z(avg.T_grad, avg.Tc, axisZ, grid);
+#endif
+
+
+#ifdef COMPUTE_XT_AVERAGES
+	// - boundary conditions for -yz velocity profiles ...
+	// ---------------------------------------------------------------------------------- //
+
+	// --- U(y,z), V(y,z), W(y,z) ...
+	c_dirichlet_bc_yz(avg.Uyz,
+		-(T)0.5 * fluid.Umax, (T)0.5 * fluid.Umax, grid);
+	c_dirichlet_bc_yz(avg.Vyz, (T)0, (T)0, grid);
+	w_dirichlet_bc_yz(avg.Wyz, (T)0, (T)0, grid);
+
+	// --- U^2(y,z), V^2(y,z), W^2(y,z) ...
+	c_dirichlet_bc_yz(avg.U2yz,
+		(T)0.25 * fluid.Umax * fluid.Umax,
+		(T)0.25 * fluid.Umax * fluid.Umax, grid);
+	c_dirichlet_bc_yz(avg.V2yz, (T)0, (T)0, grid);
+	w_dirichlet_bc_yz(avg.W2yz, (T)0, (T)0, grid);
+
+	// --- UV(y,z), UW(y,z), VW(y,z) ...
+	c_dirichlet_bc_yz(avg.UVyz, (T)0, (T)0, grid);
+	w_dirichlet_bc_yz(avg.UWyz, (T)0, (T)0, grid);
+	w_dirichlet_bc_yz(avg.VWyz, (T)0, (T)0, grid);
+	// ---------------------------------------------------------------------------------- //
+
+	// - deviations ...
+	// ---------------------------------------------------------------------------------- //
+	deviation(avg.Uyz_deviation, avg.Uyz, avg.U2yz, grid.nyz);
+	deviation(avg.Vyz_deviation, avg.Vyz, avg.V2yz, grid.nyz);
+	deviation(avg.Wyz_deviation, avg.Wyz, avg.W2yz, grid.nyz);
+
+	// - turbulent fluxes (2nd order) ...
+	// ---------------------------------------------------------------------------------- //
+	uv_flux(avg.UVyz_flux, avg.UVyz, avg.Uyz, avg.Vyz, axisYZ, grid);
+	uw_flux(avg.UWyz_flux, avg.UWyz, avg.Uyz, avg.Wyz, axisYZ, grid);
+	vw_flux(avg.VWyz_flux, avg.VWyz, avg.Vyz, avg.Wyz, axisYZ, grid);
+
+	// - velocity gradient ...
+	// ---------------------------------------------------------------------------------- //
+	c_gradient_z(avg.Uyz_grad, avg.Uyz, axisYZ, grid);
+	c_gradient_z(avg.Vyz_grad, avg.Vyz, axisYZ, grid);
+	w_gradient_z(avg.Wyz_grad, avg.Wyz, axisYZ, grid);
+
+	// - excluding [x,y,t] averages(z) ...
+	// ---------------------------------------------------------------------------------- //
+	int j, k, idx;
+
+	// --- U(y,z)[C-node] - U(z)[C-node]
+	for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
+		idx = j * grid.nz + grid.gcz;
+		for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+			avg.Uyz[idx] -= avg.U[k];
+		}
+	}
+	// --- V(y,z)[V-node] - V(z)[C-node]
+	for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
+		idx = j * grid.nz + grid.gcz;
+		for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+			avg.Vyz[idx] -= avg.V[k];
+		}
+	}
+	// --- W(y,z)[W-node] - W(z)[W-node]
+	for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
+		idx = j * grid.nz + grid.gcz;
+		for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+			avg.Wyz[idx] -= avg.W[k];
+		}
+	}
+	// ---------------------------------------------------------------------------------- //
+
+
+#ifdef STRATIFICATION
+	// - boundary conditions for -yz temperature profiles ...
+	// ---------------------------------------------------------------------------------- //
+
+	// --- T(y,z) ...
+	c_dirichlet_bc_yz(avg.Tyz, fluid.T0, fluid.TH, grid);
+
+	// --- T^2(y,z) ...
+	c_dirichlet_bc_yz(avg.T2yz,
+		fluid.T0 * fluid.T0, fluid.TH * fluid.TH, grid);
+
+	// --- TW(y,z) ...
+	w_dirichlet_bc_yz(avg.TWyz, (T)0, (T)0, grid);
+
+	// - deviations ...
+	//  *: using _abs as some near wall values are ~ -0E0
+	deviation_abs(avg.Tyz_deviation, avg.Tyz, avg.T2yz, grid.nyz);
+
+	// - turbulent fluxes (2nd order) ...
+	cw_flux(avg.TWyz_flux, avg.TWyz, avg.Tyz, avg.Wyz, axisYZ, grid);
+
+	// - temperature gradient ...
+	c_gradient_z(avg.Tyz_grad, avg.Tyz, axisYZ, grid);
+
+	// --- excluding [x,y,t] averages(z) ...
+	// ---------------------------------------------------------------------------------- //
+
+	// --- T(y,z)[C-node] - T(z)[C-node]
+	for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
+		idx = j * grid.nz + grid.gcz;
+		for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+			avg.Tyz[idx] -= avg.Tc[k];
+		}
+	}
+	// ---------------------------------------------------------------------------------- //
+#endif
+#endif
+
+
+	// - full momentum flux ...
+	// ---------------------------------------------------------------------------------- //
+	momentum_eq(avg.momentum_balance, avg.turbulent_momentum_flux, avg.viscous_stress,
+		avg.UW_flux, avg.U_grad, fluid.kinematic_viscosity, grid);
+
+
+	// - turbulent kinetic energy ...
+	// ---------------------------------------------------------------------------------- //
+
+	//    - structure ...
+	TKE_structure(avg.TKE,
+		avg.u_TKE, avg.v_TKE, avg.w_TKE,
+		avg.u_TKE_share, avg.v_TKE_share, avg.w_TKE_share,
+		avg.U2_u, avg.V2_v, avg.W2_c, avg.U, avg.V, avg.W, grid);
+
+	//    - production ...
+	u_TKE_production(avg.u_TKE_production,
+		avg.UW_bottom, avg.UW_top, avg.U, avg.W, grid);
+	v_TKE_production(avg.v_TKE_production,
+		avg.VW_bottom, avg.VW_top, avg.V, avg.W, grid);
+	w_TKE_production(avg.w_TKE_production,
+		avg.W2_c, avg.W2_w, avg.W, grid);
+
+	vsum(avg.TKE_production,
+		avg.u_TKE_production, avg.v_TKE_production, avg.w_TKE_production, grid.nz);
+
+	//    - diffusion [transport] ...
+	u_TKE_transport(avg.u_TKE_transport, avg.U2W_flux, grid);
+	v_TKE_transport(avg.v_TKE_transport, avg.V2W_flux, grid);
+	w_TKE_transport(avg.w_TKE_transport, avg.W2W_flux, grid);
+
+	vsum(avg.TKE_transport,
+		avg.u_TKE_transport, avg.v_TKE_transport, avg.w_TKE_transport, grid.nz);
+
+	//    - diffusion [pressure work] ...
+	null(avg.u_TKE_pressure_work, grid.nz);
+	null(avg.v_TKE_pressure_work, grid.nz);
+	w_TKE_pressure_work(avg.w_TKE_pressure_work, avg.PW_flux, grid);
+
+	vsum(avg.TKE_pressure_work,
+		avg.u_TKE_pressure_work, avg.v_TKE_pressure_work,
+		avg.w_TKE_pressure_work, grid.nz);
+
+	//    - diffusion ...
+	vsum(avg.u_TKE_diffusion, avg.u_TKE_transport, avg.u_TKE_pressure_work, grid.nz);
+	vsum(avg.v_TKE_diffusion, avg.v_TKE_transport, avg.v_TKE_pressure_work, grid.nz);
+	vsum(avg.w_TKE_diffusion, avg.w_TKE_transport, avg.w_TKE_pressure_work, grid.nz);
+
+	vsum(avg.TKE_diffusion, avg.TKE_transport, avg.TKE_pressure_work, grid.nz);
+
+	//	  - exchange: pressure-strain covariances ...
+	mcopy(avg.u_TKE_exchange, avg.PSuu, grid.nz);
+	mcopy(avg.v_TKE_exchange, avg.PSvv, grid.nz);
+
+	w_TKE_exchange(avg.w_TKE_exchange,
+		avg.PSww, avg.P, avg.W, grid);
+
+	vsum(avg.TKE_exchange_balance,
+		avg.u_TKE_exchange, avg.v_TKE_exchange, avg.w_TKE_exchange, grid.nz);
+
+#ifdef STRATIFICATION
+	//	  - heat flux ...
+	w_TKE_heat_flux(avg.w_TKE_heat_flux, avg.TW_flux,
+		fluid.Richardson, grid);
+
+	mcopy(avg.TKE_heat_flux, avg.w_TKE_heat_flux, grid.nz);
+#endif
+
+	//    - dissipation ...
+	u_TKE_dissipation(avg.u_TKE_dissipation, avg.U_diss, avg.U,
+		fluid.kinematic_viscosity, grid);
+	v_TKE_dissipation(avg.v_TKE_dissipation, avg.V_diss, avg.V,
+		fluid.kinematic_viscosity, grid);
+	w_TKE_dissipation(avg.w_TKE_dissipation, avg.W_diss, avg.W,
+		fluid.kinematic_viscosity, grid);
+
+	vsum(avg.TKE_dissipation,
+		avg.u_TKE_dissipation, avg.v_TKE_dissipation, avg.w_TKE_dissipation, grid.nz);
+
+	//    - iso-dissipation ...
+	u_TKE_iso_dissipation(avg.u_TKE_iso_dissipation, avg.U_iso_diss, avg.U,
+		fluid.kinematic_viscosity, grid);
+	v_TKE_iso_dissipation(avg.v_TKE_iso_dissipation, avg.V_iso_diss, avg.V,
+		fluid.kinematic_viscosity, grid);
+	w_TKE_iso_dissipation(avg.w_TKE_iso_dissipation, avg.W_iso_diss, avg.W,
+		fluid.kinematic_viscosity, grid);
+
+	vsum(avg.TKE_iso_dissipation,
+		avg.u_TKE_iso_dissipation, avg.v_TKE_iso_dissipation, avg.w_TKE_iso_dissipation, grid.nz);
+
+	//	  - balances ...
+	vsum(avg.u_TKE_balance,
+		avg.u_TKE_production, avg.u_TKE_diffusion,
+		avg.u_TKE_exchange, avg.u_TKE_dissipation,
+		grid.nz);
+
+	vsum(avg.v_TKE_balance,
+		avg.v_TKE_production, avg.v_TKE_diffusion,
+		avg.v_TKE_exchange, avg.v_TKE_dissipation,
+		grid.nz);
+
+	vsum(avg.w_TKE_balance,
+		avg.w_TKE_production, avg.w_TKE_diffusion,
+		avg.w_TKE_exchange,
+#ifdef STRATIFICATION
+		avg.w_TKE_heat_flux,
+#endif
+		avg.w_TKE_dissipation,
+		grid.nz);
+
+	vsum(avg.TKE_balance,
+		avg.TKE_production, avg.TKE_diffusion,
+		avg.TKE_exchange_balance,
+#ifdef STRATIFICATION
+		avg.TKE_heat_flux,
+#endif
+		avg.TKE_dissipation,
+		grid.nz);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// ui'uj' budgets ...
+	// ---------------------------------------------------------------------------------- //
+
+	//	  - [u'v'] production ...
+	uv_production_shearU(avg.uv_production_shearU,
+		avg.VW_bottom_uv, avg.VW_top_uv, avg.U, avg.V, avg.W, grid);
+	uv_production_shearV(avg.uv_production_shearV,
+		avg.UW_bottom_uv, avg.UW_top_uv, avg.U, avg.V, avg.W, grid);
+
+	vsum(avg.uv_production, avg.uv_production_shearU, avg.uv_production_shearV, grid.nz);
+
+	//	  - [u'w'] production ...
+	uw_production_shearU(avg.uw_production_shearU,
+		avg.W2_u, avg.W2_uw, avg.U, avg.W, grid);
+	uw_production_shearW(avg.uw_production_shearW,
+		avg.UW_bottom_uw, avg.UW_top_uw, avg.U, avg.W, grid);
+
+	vsum(avg.uw_production, avg.uw_production_shearU, avg.uw_production_shearW, grid.nz);
+
+	//	  - [v'w'] production ...
+	vw_production_shearV(avg.vw_production_shearV,
+		avg.W2_v, avg.W2_vw, avg.V, avg.W, grid);
+	vw_production_shearW(avg.vw_production_shearW,
+		avg.VW_bottom_vw, avg.VW_top_vw, avg.V, avg.W, grid);
+
+	vsum(avg.vw_production, avg.vw_production_shearV, avg.vw_production_shearW, grid.nz);
+
+	//	  - diffusion [transport] ...
+	uv_transport(avg.uv_transport, avg.UVW_flux, grid);
+	uw_transport(avg.uw_transport, avg.UWW_flux, grid);
+	vw_transport(avg.vw_transport, avg.VWW_flux, grid);
+
+	//	  - diffusion [pressure work] ...
+	null(avg.uv_pressure_work, grid.nz);
+	uw_pressure_work(avg.uw_pressure_work, avg.PU_flux, grid);
+	vw_pressure_work(avg.vw_pressure_work, avg.PV_flux, grid);
+
+	//	  - diffusion ...
+	vsum(avg.uv_diffusion, avg.uv_transport, avg.uv_pressure_work, grid.nz);
+	vsum(avg.uw_diffusion, avg.uw_transport, avg.uw_pressure_work, grid.nz);
+	vsum(avg.vw_diffusion, avg.vw_transport, avg.vw_pressure_work, grid.nz);
+
+	//	  - pressure-strain covariances ...
+	mcopy(avg.P2Suv_turb, avg.P2Suv, grid.nz);
+
+	uw_pressure_strain(avg.P2Suw_turb, avg.P2Suw_turb_c,	// (shifting [W] -> [C])
+		avg.P2Suw, avg.P, avg.U, grid);
+
+	vw_pressure_strain(avg.P2Svw_turb, avg.P2Svw_turb_c,	// (shifting [W] -> [C])
+		avg.P2Svw, avg.P, avg.V, grid);
+
+#ifdef STRATIFICATION
+	//	  - buoyancy ...
+	uw_buoyancy(avg.uw_buoyancy,
+		avg.TU_uw, avg.Tc, avg.U, fluid.Richardson, grid);
+
+	vw_buoyancy(avg.vw_buoyancy,
+		avg.TV_vw, avg.Tc, avg.V, fluid.Richardson, grid);
+#endif
+
+	//	  - [u'v'] dissipation ...
+	uv_dissipation(avg.uv_dissipation,
+		avg.UV_diss, avg.U, avg.V, fluid.kinematic_viscosity, grid);
+
+	uv_iso_dissipation(avg.uv_iso_dissipation,
+		avg.UV_iso_diss, avg.U, avg.V, fluid.kinematic_viscosity, grid);
+
+	//	  - [u'w'] dissipation ...
+	uw_dissipation(avg.uw_dissipation,
+		avg.UW_diss, avg.U, avg.W, fluid.kinematic_viscosity, grid);
+
+	uw_iso_dissipation(avg.uw_iso_dissipation,
+		avg.UW_iso_diss, avg.U, avg.W, fluid.kinematic_viscosity, grid);
+
+	//	  - [v'w'] dissipation ...
+	vw_dissipation(avg.vw_dissipation,
+		avg.VW_diss, avg.V, avg.W, fluid.kinematic_viscosity, grid);
+
+	vw_iso_dissipation(avg.vw_iso_dissipation,
+		avg.VW_iso_diss, avg.V, avg.W, fluid.kinematic_viscosity, grid);
+
+	//	  - balances ...
+	vsum(avg.uv_budget_balance,
+		avg.uv_production, avg.uv_diffusion, avg.P2Suv_turb,
+		avg.uv_dissipation,
+		grid.nz);
+
+	vsum(avg.uw_budget_balance,
+		avg.uw_production, avg.uw_diffusion, avg.P2Suw_turb,
+#ifdef STRATIFICATION
+		avg.uw_buoyancy,
+#endif
+		avg.uw_dissipation,
+		grid.nz);
+
+	vsum(avg.vw_budget_balance,
+		avg.vw_production, avg.vw_diffusion, avg.P2Svw_turb,
+#ifdef STRATIFICATION
+		avg.vw_buoyancy,
+#endif
+		avg.vw_dissipation,
+		grid.nz);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - TKE anisotropy tensor ...
+	// ---------------------------------------------------------------------------------- //
+	TKE_anisotropy(avg.TKE_aniso_uu, avg.TKE_aniso_vv, avg.TKE_aniso_ww,
+		avg.TKE_aniso_uv, avg.TKE_aniso_uw, avg.TKE_aniso_vw,
+		avg.TKE, avg.u_TKE, avg.v_TKE, avg.w_TKE,
+		avg.UV_flux, avg.UW_flux, avg.VW_flux, grid);
+
+
+	// - TKE Rotta "return-to-isotropy" model constants ...
+	// ---------------------------------------------------------------------------------- //
+	Rotta_model(avg.u_Rotta, avg.v_Rotta, avg.w_Rotta, avg.uw_Rotta,
+		avg.TKE, avg.TKE_iso_dissipation,
+		avg.u_TKE, avg.v_TKE, avg.w_TKE, avg.UW_flux,
+		avg.u_TKE_exchange, avg.v_TKE_exchange, avg.w_TKE_exchange,
+		avg.P2Suw_turb_c, grid);
+
+
+	// - TKE RDT "return-to-isotropy" model constants ...
+	// ---------------------------------------------------------------------------------- //
+	RDT_model(avg.u_RDT, avg.v_RDT, avg.w_RDT, avg.uw_RDT,
+		avg.TKE_production, avg.W2_w, avg.U, avg.W,
+		avg.u_TKE_exchange, avg.v_TKE_exchange, avg.w_TKE_exchange,
+		avg.P2Suw_turb_c, grid);
+
+
+	// - TKE Rotta-RDT "return-to-isotropy" model constants ...
+	// ---------------------------------------------------------------------------------- //
+	Rotta_RDT_model(avg.Rotta_RDT_e, avg.Rotta_RDT_p,
+		avg.TKE, avg.TKE_iso_dissipation, avg.TKE_production,
+		avg.u_TKE, avg.v_TKE, avg.w_TKE,
+		avg.u_TKE_exchange, avg.v_TKE_exchange, avg.w_TKE_exchange, grid);
+
+
+#ifdef STRATIFICATION
+	// - heat flux balance eq ...
+	// ---------------------------------------------------------------------------------- //
+	heat_eq(avg.heat_balance, avg.turbulent_heat_flux, avg.heat_stress,
+		avg.TW_flux, avg.T_grad, fluid.diffusivity, grid);
+
+
+	// - Temperature variance balance ...
+	// ---------------------------------------------------------------------------------- //
+
+	//    - production ...
+	SVA_production(avg.TVA_production,
+		avg.TW_bottom, avg.TW_top, avg.Tc, avg.W, grid);
+
+	//    - diffusion ...
+	SVA_transport(avg.TVA_transport, avg.T2W_flux, grid);
+
+	//    - dissipation ...
+	SVA_dissipation(avg.TVA_dissipation,
+		avg.T_diss, avg.Tsh, fluid.diffusivity, grid);
+
+	//    - iso dissipation ...
+	SVA_iso_dissipation(avg.TVA_iso_dissipation,
+		avg.T_iso_diss, avg.Tsh, fluid.diffusivity, grid);
+
+	//    - balance ...
+	vsum(avg.TVA_balance,
+		avg.TVA_production, avg.TVA_transport, avg.TVA_dissipation,
+		grid.nz);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - Potential energy ...
+	// ---------------------------------------------------------------------------------- //
+	TPE_structure(avg.TPE,
+		avg.TPE_heat_flux, avg.TPE_transport,
+		avg.TPE_dissipation, avg.TPE_iso_dissipation,
+
+		avg.TVA_production, avg.TVA_transport,
+		avg.TVA_dissipation, avg.TVA_iso_dissipation,
+
+		avg.T2_c, avg.Tc, avg.T_grad,
+		fluid.Richardson, grid);
+
+	vsum(avg.TPE_balance,
+		avg.TPE_heat_flux, avg.TPE_transport, avg.TPE_dissipation,
+		grid.nz);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - Energy structure ...
+	// ---------------------------------------------------------------------------------- //
+	energy_structure(avg.TKE_share, avg.TPE_share,
+		avg.TKE, avg.TPE, grid);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// T'ui' budgets ...
+	// ---------------------------------------------------------------------------------- //
+
+	//	  - [T'u'] production ...
+	cu_production_gradC(avg.Tu_production_gradT,
+		avg.UW_bottom, avg.UW_top, avg.Tc, avg.U, avg.W, grid);
+	cu_production_shear(avg.Tu_production_shear,
+		avg.TW_bottom_u, avg.TW_top_u, avg.Tc, avg.U, avg.W, grid);
+
+	vsum(avg.Tu_production, avg.Tu_production_gradT, avg.Tu_production_shear, grid.nz);
+
+	//	  - [T'v'] production ...
+	cv_production_gradC(avg.Tv_production_gradT,
+		avg.VW_bottom, avg.VW_top, avg.Tc, avg.V, avg.W, grid);
+	cv_production_shear(avg.Tv_production_shear,
+		avg.TW_bottom_v, avg.TW_top_v, avg.Tc, avg.V, avg.W, grid);
+
+	vsum(avg.Tv_production, avg.Tv_production_gradT, avg.Tv_production_shear, grid.nz);
+
+	//	  - [T'w'] production ...
+	cw_production_gradC(avg.Tw_production_gradT,
+		avg.W2_w, avg.W2_c, avg.Tc, avg.W, grid);
+	cw_production_shear(avg.Tw_production_shear,
+		avg.TW_bottom_w, avg.TW_top_w, avg.Tc, avg.W, grid);
+
+	vsum(avg.Tw_production, avg.Tw_production_gradT, avg.Tw_production_shear, grid.nz);
+
+	//	  - diffusion [transport] ...
+	cu_transport(avg.Tu_transport, avg.TUW_flux, grid);
+	cv_transport(avg.Tv_transport, avg.TVW_flux, grid);
+	cw_transport(avg.Tw_transport, avg.TWW_flux, grid);
+
+	//	  - diffusion [pressure work] ...
+	null(avg.Tu_pressure_work, grid.nz);
+	null(avg.Tv_pressure_work, grid.nz);
+	cw_pressure_work(avg.Tw_pressure_work, avg.TP_flux, grid);
+
+	//	  - diffusion ...
+	vsum(avg.Tu_diffusion, avg.Tu_transport, avg.Tu_pressure_work, grid.nz);
+	vsum(avg.Tv_diffusion, avg.Tv_transport, avg.Tv_pressure_work, grid.nz);
+	vsum(avg.Tw_diffusion, avg.Tw_transport, avg.Tw_pressure_work, grid.nz);
+
+	//	  - T * grad(P) covariance ...
+	mcopy(avg.T_dpdx_turb, avg.T_dPdx, grid.nz);
+	mcopy(avg.T_dpdy_turb, avg.T_dPdy, grid.nz);
+
+	c_w_pressure_gradient_turb(avg.T_dpdz_turb,
+		avg.T_dPdz, avg.Tc, avg.P, grid);
+
+	//	  - P * grad(T) covariance ...
+	cu_pressure_gradc(avg.Tu_pressure_gradT,
+		avg.T_dpdx_turb, grid);
+
+	cv_pressure_gradc(avg.Tv_pressure_gradT,
+		avg.T_dpdy_turb, grid);
+
+	cw_pressure_gradc(avg.Tw_pressure_gradT,
+		avg.Tw_pressure_work, avg.T_dpdz_turb, grid);
+
+	//	  - buoyancy ...
+	cw_buoyancy(avg.Tw_buoyancy,
+		avg.T2_c, avg.T2_w, avg.Tc, fluid.Richardson, grid);
+
+	//	  - dissipation ...
+	cu_dissipation(avg.Tu_dissipation,
+		avg.TU_diss, avg.Tsh, avg.U,
+		fluid.diffusivity, fluid.kinematic_viscosity, grid);
+
+	cv_dissipation(avg.Tv_dissipation,
+		avg.TV_diss, avg.Tsh, avg.V,
+		fluid.diffusivity, fluid.kinematic_viscosity, grid);
+
+	cw_dissipation(avg.Tw_dissipation,
+		avg.TW_diss, avg.Tsh, avg.W,
+		fluid.diffusivity, fluid.kinematic_viscosity, grid);
+
+	//	  - balance ...
+	vsum(avg.Tu_budget_balance,
+		avg.Tu_production, avg.Tu_diffusion, avg.Tu_pressure_gradT,
+		avg.Tu_dissipation,
+		grid.nz);
+
+	vsum(avg.Tv_budget_balance,
+		avg.Tv_production, avg.Tv_diffusion, avg.Tv_pressure_gradT,
+		avg.Tv_dissipation,
+		grid.nz);
+
+	vsum(avg.Tw_budget_balance,
+		avg.Tw_production, avg.Tw_diffusion, avg.Tw_pressure_gradT,
+		avg.Tw_buoyancy, avg.Tw_dissipation,
+		grid.nz);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - TKE Rotta-buoyancy "return-to-isotropy" model constants ...
+	// ---------------------------------------------------------------------------------- //
+	Rotta_buoyancy_model(avg.Rotta_buoyancy_e, avg.Rotta_buoyancy_b,
+		avg.TKE, avg.TKE_iso_dissipation, avg.TKE_heat_flux,
+		avg.u_TKE, avg.v_TKE, avg.w_TKE,
+		avg.u_TKE_exchange, avg.v_TKE_exchange, avg.w_TKE_exchange,
+		fluid.Richardson, grid);
+
+
+	// - TKE RDT-buoyancy "return-to-isotropy" model constants ...
+	// ---------------------------------------------------------------------------------- //
+	RDT_buoyancy_model(avg.RDT_buoyancy_p, avg.RDT_buoyancy_b,
+		avg.TKE_production, avg.TKE_heat_flux,
+		avg.u_TKE_exchange, avg.v_TKE_exchange, avg.w_TKE_exchange,
+		fluid.Richardson, grid);
+
+
+	// - TKE Rotta-TPE "return-to-isotropy" model constants ...
+	//  *: return to isotropy using TKE-TPE ...
+	// ---------------------------------------------------------------------------------- //
+	Rotta_TPE_model(avg.u_Rotta_TPE, avg.v_Rotta_TPE, avg.w_Rotta_TPE,
+		avg.TKE, avg.TPE,
+		avg.TKE_iso_dissipation, avg.TPE_iso_dissipation,
+		avg.u_TKE, avg.v_TKE, avg.w_TKE,
+		avg.u_TKE_exchange, avg.v_TKE_exchange, avg.w_TKE_exchange, grid);
+#endif
+
+
+	// - turbulent (TKE) time scale ...
+	// ---------------------------------------------------------------------------------- //
+	time_scale_turbulent(avg.time_scale_turbulent,
+		avg.TKE, avg.TKE_iso_dissipation, grid);
+
+
+	// - turbulence mixing length scale ...
+	// ---------------------------------------------------------------------------------- //
+	length_scale_mixing(avg.length_scale_mixing,
+		avg.U2_uw, avg.V2_vw, avg.W2_w, avg.U, avg.V, avg.W,
+		avg.U_grad, grid);
+
+
+	// - Kolmogorov length scale ...
+	// ---------------------------------------------------------------------------------- //
+	length_scale_kolmogorov(avg.length_scale_kolmogorov,
+		avg.TKE_iso_dissipation, fluid.kinematic_viscosity, grid);
+
+
+#ifdef STRATIFICATION
+	// - turbulent Prandtl number ...
+	// ---------------------------------------------------------------------------------- //
+	prandtl_turbulent(avg.Prandtl_turbulent,
+		avg.UW_flux, avg.U_grad, avg.TW_flux, avg.T_grad, grid);
+
+
+	// - gradient Richardson number ...
+	// ---------------------------------------------------------------------------------- //
+	richardson_gradient(avg.Richardson_gradient,
+		avg.U_grad, avg.T_grad, fluid.Richardson, axisZ, grid);
+
+
+	// - gradient Richardson number (on -yz plane) ...
+	// ---------------------------------------------------------------------------------- //
+#ifdef COMPUTE_XT_AVERAGES
+	richardson_gradient(avg.Richardson_gradient_yz,
+		avg.Uyz_grad, avg.Tyz_grad, fluid.Richardson, axisYZ, grid);
+#endif
+
+
+	// - flux Richardson number ...
+	// ---------------------------------------------------------------------------------- //
+	richardson_flux(avg.Richardson_flux,
+		avg.UW_flux, avg.U_grad, avg.TW_flux,
+		fluid.Richardson, grid);
+
+
+	// - Reynolds buoyancy number ...
+	// ---------------------------------------------------------------------------------- //
+	reynolds_buoyancy(avg.Reynolds_buoyancy,
+		avg.TKE_iso_dissipation, avg.T_grad,
+		fluid.Richardson, fluid.kinematic_viscosity, grid);
+
+
+	// - Froude horizontal number ...
+	// ---------------------------------------------------------------------------------- //
+	froude_horizontal(avg.Froude_horizontal,
+		avg.u_TKE, avg.v_TKE,
+		avg.TKE_iso_dissipation, avg.T_grad,
+		fluid.Richardson, grid);
+
+
+	// - temperature variance time scale ...
+	// ---------------------------------------------------------------------------------- //
+	time_scale_svariance(avg.time_scale_Tvariance,
+		avg.T2_c, avg.Tc, avg.TVA_iso_dissipation, grid);
+
+
+	// - Ellison (overturning) length scale ...
+	// ---------------------------------------------------------------------------------- //
+	length_scale_ellison(avg.length_scale_ellison,
+		avg.T2_w, avg.Tc, avg.T_grad, grid);
+
+
+	// - Ozmidov length scale ...
+	// ---------------------------------------------------------------------------------- //
+	length_scale_ozmidov(avg.length_scale_ozmidov,
+		avg.TKE_iso_dissipation, avg.T_grad,
+		fluid.Richardson, grid);
+
+
+	// - Obukhov length scale ...
+	// ---------------------------------------------------------------------------------- //
+	length_scale_obukhov(avg.length_scale_obukhov,
+		avg.UW_flux, avg.TW_flux, fluid.Richardson, grid);
+
+
+	// - instantaneous mixing efficiency ...
+	// ---------------------------------------------------------------------------------- //
+	mixing_efficiency(avg.mixing_efficiency,
+		avg.TKE_iso_dissipation, avg.TVA_iso_dissipation, avg.T_grad,
+		fluid.Richardson, grid);
+
+
+	// - turbulence (TKE-to-TVA) production ratio ...
+	// ---------------------------------------------------------------------------------- //
+	turbulence_production_ratio(avg.turb_production_ratio,
+		avg.TKE_production, avg.TVA_production, grid);
+#endif
+
+
+#ifdef FOURIER_SPECTRUM
+	// - Fourier spectrum
+	// ---------------------------------------------------------------------------------- //
+
+	const T zpos = (T)0.22;
+
+	velocity_energy_spectrum_x(spectrum.DFT_U2_x[0],
+		spectrum.DFT_wavenumber_x, U, zpos, grid);
+	velocity_energy_spectrum_x(spectrum.DFT_U2_x[1],
+		spectrum.DFT_wavenumber_x, V, zpos, grid);
+	velocity_energy_spectrum_x(spectrum.DFT_U2_x[2],
+		spectrum.DFT_wavenumber_x, W, zpos, grid);
+
+	for (int i = 0; i < grid.nx; i++) {
+		spectrum.DFT_U2_x[3][i] = (T)0.5 * (
+			spectrum.DFT_U2_x[0][i] +
+			spectrum.DFT_U2_x[1][i] +
+			spectrum.DFT_U2_x[2][i]);
+	}
+	// ---------------------------------------------------------------------------------- //
+#endif
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T, nse::memType mem >
+void nse::modelObj<T, mem>::write_statistics_output(
+	const int index, nseTurbVec< T >& avg)
+{
+	T u_dynamic;     // dynamic (friction) velocity
+	T z_visc;        // -z viscous spacing
+
+	// - calculating dynamic (friction) velocity for normalization ...
+	u_dynamic = ::dynamic_velocity(avg.U, fluid.Umax, fluid.kinematic_viscosity, grid);
+	z_visc = fluid.kinematic_viscosity / u_dynamic;
+
+
+	// - output [U(z), V(z), W(z)] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.U_AVERAGE_PZ_FILE, index,
+		avg.U, "U", axisZ, nodeC, grid, current_time);
+	write_tecplot_1d(stats.output.V_AVERAGE_PZ_FILE, index,
+		avg.V, "V", axisZ, nodeC, grid, current_time);
+	write_tecplot_1d(stats.output.W_AVERAGE_PZ_FILE, index,
+		avg.W, "W", axisZ, nodeW, grid, current_time);
+
+	write_tecplot_1d(stats.output.U_AVERAGE_VISC_PZ_FILE, index,
+		avg.U, "U+(z+)", u_dynamic, z_visc, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [sqrt(ui'ui')] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.U_DEVIATION_PZ_FILE, index,
+		avg.U_deviation, "(u'u')^1/2", axisZ, nodeC, grid, current_time);
+	write_tecplot_1d(stats.output.V_DEVIATION_PZ_FILE, index,
+		avg.V_deviation, "(v'v')^1/2", axisZ, nodeC, grid, current_time);
+	write_tecplot_1d(stats.output.W_DEVIATION_PZ_FILE, index,
+		avg.W_deviation, "(w'w')^1/2", axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [ui'uj'] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.UV_FLUX_PZ_FILE, index,
+		avg.UV_flux, "u'v'", axisZ, nodeC, grid, current_time);
+	write_tecplot_1d(stats.output.UW_FLUX_PZ_FILE, index,
+		avg.UW_flux, "u'w'", axisZ, nodeW, grid, current_time);
+	write_tecplot_1d(stats.output.VW_FLUX_PZ_FILE, index,
+		avg.VW_flux, "v'w'", axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [dU/dz] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.U_GRAD_PZ_FILE, index,
+		avg.U_grad, "dU/dz", axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+#ifdef COMPUTE_XT_AVERAGES
+	// - output [U(y,z), V(y,z), W(y,z)] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_2d(stats.output.U_AVERAGE_PYZ_FILE, index,
+		avg.Uyz, "U(y,z)", axisYZ, nodeC, grid, current_time);
+	write_tecplot_2d(stats.output.V_W_AVERAGE_PYZ_FILE, index,
+		avg.Vyz, avg.Wyz, "V(y,z)", "W(y,z)", axisYZ, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [sqrt(ui'ui') on -yz plane] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_2d(stats.output.U_DEVIATION_PYZ_FILE, index,
+		avg.Uyz_deviation, "(u'u')^1/2[y,z]", axisYZ, nodeC, grid, current_time);
+	write_tecplot_2d(stats.output.V_W_DEVIATION_PYZ_FILE, index,
+		avg.Vyz_deviation, avg.Wyz_deviation,
+		"(v'v')^1/2[y,z]", "(w'w')^1/2[y,z]", axisYZ, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [ui'uj' on -yz plane] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_2d(stats.output.UV_FLUX_PYZ_FILE, index,
+		avg.UVyz_flux, "u'v'[y,z]", axisYZ, nodeV, grid, current_time);
+	write_tecplot_2d(stats.output.UW_FLUX_PYZ_FILE, index,
+		avg.UWyz_flux, "u'w'[y,z]", axisYZ, nodeW, grid, current_time);
+	write_tecplot_2d(stats.output.VW_FLUX_PYZ_FILE, index,
+		avg.VWyz_flux, "v'w'[y,z]", axisYZ, nodeVW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [dU/dz on -yz plane] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_2d(stats.output.U_GRAD_PYZ_FILE, index,
+		avg.Uyz_grad, "dU/dz[y,z]", axisYZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+#endif
+
+
+	// - output [momentum balance] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* momentum_name[] = { "momentum balance",
+		"turbulent momentum flux", "viscous stress" };
+	T *momentum_vec[3];
+	momentum_vec[0] = avg.momentum_balance;
+	momentum_vec[1] = avg.turbulent_momentum_flux;
+	momentum_vec[2] = avg.viscous_stress;
+
+	write_tecplot_1d(stats.output.MOMENTUM_BALANCE_PZ_FILE, index,
+		momentum_vec, momentum_name, 3, axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - output [-u TKE balance] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* u_TKE_balance_name[] = { "[u'u'/2] balance",
+		"[u'u'/2] production", "[u'u'/2] diffusion", "[u'u'/2] exchange",
+		"[u'u'/2] dissipation", "[u'u'/2] iso-dissipation",
+		"[u'u'/2] transport", "[u'u'/2] pressure work" };
+	T* u_TKE_balance_vec[8];
+	u_TKE_balance_vec[0] = avg.u_TKE_balance;
+	u_TKE_balance_vec[1] = avg.u_TKE_production;
+	u_TKE_balance_vec[2] = avg.u_TKE_diffusion;
+	u_TKE_balance_vec[3] = avg.u_TKE_exchange;
+	u_TKE_balance_vec[4] = avg.u_TKE_dissipation;
+	u_TKE_balance_vec[5] = avg.u_TKE_iso_dissipation;
+	u_TKE_balance_vec[6] = avg.u_TKE_transport;
+	u_TKE_balance_vec[7] = avg.u_TKE_pressure_work;
+
+	write_tecplot_1d(stats.output.U_TKE_BALANCE_PZ_FILE, index,
+		u_TKE_balance_vec, u_TKE_balance_name, 8, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [-v TKE balance] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* v_TKE_balance_name[] = { "[v'v'/2] balance",
+		"[v'v'/2] production", "[v'v'/2] diffusion", "[v'v'/2] exchange",
+		"[v'v'/2] dissipation", "[v'v'/2] iso-dissipation",
+		"[v'v'/2] transport", "[v'v'/2] pressure work" };
+	T* v_TKE_balance_vec[8];
+	v_TKE_balance_vec[0] = avg.v_TKE_balance;
+	v_TKE_balance_vec[1] = avg.v_TKE_production;
+	v_TKE_balance_vec[2] = avg.v_TKE_diffusion;
+	v_TKE_balance_vec[3] = avg.v_TKE_exchange;
+	v_TKE_balance_vec[4] = avg.v_TKE_dissipation;
+	v_TKE_balance_vec[5] = avg.v_TKE_iso_dissipation;
+	v_TKE_balance_vec[6] = avg.v_TKE_transport;
+	v_TKE_balance_vec[7] = avg.v_TKE_pressure_work;
+
+	write_tecplot_1d(stats.output.V_TKE_BALANCE_PZ_FILE, index,
+		v_TKE_balance_vec, v_TKE_balance_name, 8, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#ifdef STRATIFICATION
+	// - output [-w TKE balance] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* w_TKE_balance_name[] = { "[w'w'/2] balance",
+		"[w'w'/2] production", "[w'w'/2] diffusion", "[w'w'/2] exchange",
+		"[w'w'/2] heat flux",
+		"[w'w'/2] dissipation", "[w'w'/2] iso-dissipation",
+		"[w'w'/2] transport", "[w'w'/2] pressure work" };
+	T* w_TKE_balance_vec[9];
+	w_TKE_balance_vec[0] = avg.w_TKE_balance;
+	w_TKE_balance_vec[1] = avg.w_TKE_production;
+	w_TKE_balance_vec[2] = avg.w_TKE_diffusion;
+	w_TKE_balance_vec[3] = avg.w_TKE_exchange;
+	w_TKE_balance_vec[4] = avg.w_TKE_heat_flux;
+	w_TKE_balance_vec[5] = avg.w_TKE_dissipation;
+	w_TKE_balance_vec[6] = avg.w_TKE_iso_dissipation;
+	w_TKE_balance_vec[7] = avg.w_TKE_transport;
+	w_TKE_balance_vec[8] = avg.w_TKE_pressure_work;
+
+	write_tecplot_1d(stats.output.W_TKE_BALANCE_PZ_FILE, index,
+		w_TKE_balance_vec, w_TKE_balance_name, 9, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#else
+
+	// - output [-w TKE balance] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* w_TKE_balance_name[] = { "[w'w'/2] balance",
+		"[w'w'/2] production", "[w'w'/2] diffusion", "[w'w'/2] exchange",
+		"[w'w'/2] dissipation", "[w'w'/2] iso-dissipation",
+		"[w'w'/2] transport", "[w'w'/2] pressure work" };
+	T* w_TKE_balance_vec[8];
+	w_TKE_balance_vec[0] = avg.w_TKE_balance;
+	w_TKE_balance_vec[1] = avg.w_TKE_production;
+	w_TKE_balance_vec[2] = avg.w_TKE_diffusion;
+	w_TKE_balance_vec[3] = avg.w_TKE_exchange;
+	w_TKE_balance_vec[4] = avg.w_TKE_dissipation;
+	w_TKE_balance_vec[5] = avg.w_TKE_iso_dissipation;
+	w_TKE_balance_vec[6] = avg.w_TKE_transport;
+	w_TKE_balance_vec[7] = avg.w_TKE_pressure_work;
+
+	write_tecplot_1d(stats.output.W_TKE_BALANCE_PZ_FILE, index,
+		w_TKE_balance_vec, w_TKE_balance_name, 8, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#endif
+
+#ifdef STRATIFICATION
+	// - output [TKE balance] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* TKE_balance_name[] = { "TKE balance",
+		"TKE production", "TKE diffusion", "TKE exchange balance",
+		"TKE heat flux",
+		"TKE dissipation", "TKE iso-dissipation",
+		"TKE transport", "TKE pressure work" };
+	T* TKE_balance_vec[9];
+	TKE_balance_vec[0] = avg.TKE_balance;
+	TKE_balance_vec[1] = avg.TKE_production;
+	TKE_balance_vec[2] = avg.TKE_diffusion;
+	TKE_balance_vec[3] = avg.TKE_exchange_balance;
+	TKE_balance_vec[4] = avg.TKE_heat_flux;
+	TKE_balance_vec[5] = avg.TKE_dissipation;
+	TKE_balance_vec[6] = avg.TKE_iso_dissipation;
+	TKE_balance_vec[7] = avg.TKE_transport;
+	TKE_balance_vec[8] = avg.TKE_pressure_work;
+
+	write_tecplot_1d(stats.output.TKE_BALANCE_PZ_FILE, index,
+		TKE_balance_vec, TKE_balance_name, 9, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#else
+
+	// - output [TKE balance] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* TKE_balance_name[] = { "TKE balance",
+		"TKE production", "TKE diffusion", "TKE exchange balance",
+		"TKE dissipation", "TKE iso-dissipation",
+		"TKE transport", "TKE pressure work" };
+	T* TKE_balance_vec[8];
+	TKE_balance_vec[0] = avg.TKE_balance;
+	TKE_balance_vec[1] = avg.TKE_production;
+	TKE_balance_vec[2] = avg.TKE_diffusion;
+	TKE_balance_vec[3] = avg.TKE_exchange_balance;
+	TKE_balance_vec[4] = avg.TKE_dissipation;
+	TKE_balance_vec[5] = avg.TKE_iso_dissipation;
+	TKE_balance_vec[6] = avg.TKE_transport;
+	TKE_balance_vec[7] = avg.TKE_pressure_work;
+
+	write_tecplot_1d(stats.output.TKE_BALANCE_PZ_FILE, index,
+		TKE_balance_vec, TKE_balance_name, 8, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#endif
+
+#ifdef STRATIFICATION
+	// - output [energy structure] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* energy_structure_name[] = {
+		"TKE", "TPE", "TKE, share", "TPE, share",
+		"u-TKE", "v-TKE", "w-TKE",
+		"u-TKE, share", "v-TKE, share", "w-TKE, share" };
+	T *energy_structure_vec[10];
+	energy_structure_vec[0] = avg.TKE;
+	energy_structure_vec[1] = avg.TPE;
+	energy_structure_vec[2] = avg.TKE_share;
+	energy_structure_vec[3] = avg.TPE_share;
+	energy_structure_vec[4] = avg.u_TKE;
+	energy_structure_vec[5] = avg.v_TKE;
+	energy_structure_vec[6] = avg.w_TKE;
+	energy_structure_vec[7] = avg.u_TKE_share;
+	energy_structure_vec[8] = avg.v_TKE_share;
+	energy_structure_vec[9] = avg.w_TKE_share;
+
+	write_tecplot_1d(stats.output.ENERGY_STRUCTURE_PZ_FILE, index,
+		energy_structure_vec, energy_structure_name, 10, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#else
+
+	// - output [energy structure] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* energy_structure_name[] = { "TKE",
+		"u-TKE", "v-TKE", "w-TKE",
+		"u-TKE, share", "v-TKE, share", "w-TKE, share" };
+	T *energy_structure_vec[7];
+	energy_structure_vec[0] = avg.TKE;
+	energy_structure_vec[1] = avg.u_TKE;
+	energy_structure_vec[2] = avg.v_TKE;
+	energy_structure_vec[3] = avg.w_TKE;
+	energy_structure_vec[4] = avg.u_TKE_share;
+	energy_structure_vec[5] = avg.v_TKE_share;
+	energy_structure_vec[6] = avg.w_TKE_share;
+
+	write_tecplot_1d(stats.output.ENERGY_STRUCTURE_PZ_FILE, index,
+		energy_structure_vec, energy_structure_name, 7, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#endif
+
+	// - output [TKE anisotropy] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* TKE_anisotropy_name[] = {
+		"uu TKE-anisotropy", "vv TKE-anisotropy", "ww TKE-anisotropy",
+		"uv TKE-anisotropy", "uw TKE-anisotropy", "vw TKE-anisotropy" };
+	T *TKE_anisotropy_vec[6];
+	TKE_anisotropy_vec[0] = avg.TKE_aniso_uu;
+	TKE_anisotropy_vec[1] = avg.TKE_aniso_vv;
+	TKE_anisotropy_vec[2] = avg.TKE_aniso_ww;
+	TKE_anisotropy_vec[3] = avg.TKE_aniso_uv;
+	TKE_anisotropy_vec[4] = avg.TKE_aniso_uw;
+	TKE_anisotropy_vec[5] = avg.TKE_aniso_vw;
+
+	write_tecplot_1d(stats.output.TKE_ANISOTROPY_PZ_FILE, index,
+		TKE_anisotropy_vec, TKE_anisotropy_name, 6, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [TKE exchange - pressure-strain] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* TKE_exchange_name[] = {
+		"TKE exchange - balance",
+		"u-TKE exchange [p'du'/dx]",
+		"v-TKE exchange [p'dv'/dy]",
+		"w-TKE exchange [p'dw'/dz]",
+		"uv-pressure-strain [2*p'S'(uv)]",
+		"uw-pressure-strain [2*p'S'(uw)]",
+		"vw-pressure-strain [2*p'S'(vw)]" };
+	T *TKE_exchange_vec[7];
+	TKE_exchange_vec[0] = avg.TKE_exchange_balance;
+	TKE_exchange_vec[1] = avg.u_TKE_exchange;
+	TKE_exchange_vec[2] = avg.v_TKE_exchange;
+	TKE_exchange_vec[3] = avg.w_TKE_exchange;
+	TKE_exchange_vec[4] = avg.P2Suv_turb;
+	TKE_exchange_vec[5] = avg.P2Suw_turb_c;
+	TKE_exchange_vec[6] = avg.P2Svw_turb_c;
+
+	write_tecplot_1d(stats.output.TKE_EXCHANGE_PZ_FILE, index,
+		TKE_exchange_vec, TKE_exchange_name, 7, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - output [u'v' budget] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* UV_flux_budget_name[] = { "[u'v'] balance",
+		"[u'v'] production", "[u'v'] diffusion", "[u'v'] pressure-strain",
+		"[u'v'] dissipation", "[u'v'] iso-dissipation",
+		"[u'v'] transport", "[u'v'] pressure work",
+		"[u'v'] production (dU/dz)", "[u'v'] production (dV/dz)" };
+	T *UV_flux_budget_vec[10];
+	UV_flux_budget_vec[0] = avg.uv_budget_balance;
+	UV_flux_budget_vec[1] = avg.uv_production;
+	UV_flux_budget_vec[2] = avg.uv_diffusion;
+	UV_flux_budget_vec[3] = avg.P2Suv_turb;
+	UV_flux_budget_vec[4] = avg.uv_dissipation;
+	UV_flux_budget_vec[5] = avg.uv_iso_dissipation;
+	UV_flux_budget_vec[6] = avg.uv_transport;
+	UV_flux_budget_vec[7] = avg.uv_pressure_work;
+	UV_flux_budget_vec[8] = avg.uv_production_shearU;
+	UV_flux_budget_vec[9] = avg.uv_production_shearV;
+
+	write_tecplot_1d(stats.output.UV_FLUX_BUDGET_PZ_FILE, index,
+		UV_flux_budget_vec, UV_flux_budget_name, 10, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#ifdef STRATIFICATION
+	// - output [u'w' budget] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* UW_flux_budget_name[] = { "[u'w'] balance",
+		"[u'w'] production", "[u'w'] diffusion", "[u'w'] pressure-strain",
+		"[u'w'] buoyancy",
+		"[u'w'] dissipation", "[u'w'] iso-dissipation",
+		"[u'w'] transport", "[u'w'] pressure work",
+		"[u'w'] production (dU/dz)", "[u'w'] production (dW/dz)" };
+	T *UW_flux_budget_vec[11];
+	UW_flux_budget_vec[0] = avg.uw_budget_balance;
+	UW_flux_budget_vec[1] = avg.uw_production;
+	UW_flux_budget_vec[2] = avg.uw_diffusion;
+	UW_flux_budget_vec[3] = avg.P2Suw_turb;
+	UW_flux_budget_vec[4] = avg.uw_buoyancy;
+	UW_flux_budget_vec[5] = avg.uw_dissipation;
+	UW_flux_budget_vec[6] = avg.uw_iso_dissipation;
+	UW_flux_budget_vec[7] = avg.uw_transport;
+	UW_flux_budget_vec[8] = avg.uw_pressure_work;
+	UW_flux_budget_vec[9] = avg.uw_production_shearU;
+	UW_flux_budget_vec[10] = avg.uw_production_shearW;
+
+	write_tecplot_1d(stats.output.UW_FLUX_BUDGET_PZ_FILE, index,
+		UW_flux_budget_vec, UW_flux_budget_name, 11, axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [v'w' budget] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* VW_flux_budget_name[] = { "[v'w'] balance",
+		"[v'w'] production", "[v'w'] diffusion", "[v'w'] pressure-strain",
+		"[v'w'] buoyancy",
+		"[v'w'] dissipation", "[v'w'] iso-dissipation",
+		"[v'w'] transport", "[v'w'] pressure work",
+		"[v'w'] production (dV/dz)", "[v'w'] production (dW/dz)" };
+	T *VW_flux_budget_vec[11];
+	VW_flux_budget_vec[0] = avg.vw_budget_balance;
+	VW_flux_budget_vec[1] = avg.vw_production;
+	VW_flux_budget_vec[2] = avg.vw_diffusion;
+	VW_flux_budget_vec[3] = avg.P2Svw_turb;
+	VW_flux_budget_vec[4] = avg.vw_buoyancy;
+	VW_flux_budget_vec[5] = avg.vw_dissipation;
+	VW_flux_budget_vec[6] = avg.vw_iso_dissipation;
+	VW_flux_budget_vec[7] = avg.vw_transport;
+	VW_flux_budget_vec[8] = avg.vw_pressure_work;
+	VW_flux_budget_vec[9] = avg.vw_production_shearV;
+	VW_flux_budget_vec[10] = avg.vw_production_shearW;
+
+	write_tecplot_1d(stats.output.VW_FLUX_BUDGET_PZ_FILE, index,
+		VW_flux_budget_vec, VW_flux_budget_name, 11, axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#else
+
+	// - output [u'w' budget] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* UW_flux_budget_name[] = { "[u'w'] balance",
+		"[u'w'] production", "[u'w'] diffusion", "[u'w'] pressure-strain",
+		"[u'w'] dissipation", "[u'w'] iso-dissipation",
+		"[u'w'] transport", "[u'w'] pressure work",
+		"[u'w'] production (dU/dz)", "[u'w'] production (dW/dz)" };
+	T *UW_flux_budget_vec[10];
+	UW_flux_budget_vec[0] = avg.uw_budget_balance;
+	UW_flux_budget_vec[1] = avg.uw_production;
+	UW_flux_budget_vec[2] = avg.uw_diffusion;
+	UW_flux_budget_vec[3] = avg.P2Suw_turb;
+	UW_flux_budget_vec[4] = avg.uw_dissipation;
+	UW_flux_budget_vec[5] = avg.uw_iso_dissipation;
+	UW_flux_budget_vec[6] = avg.uw_transport;
+	UW_flux_budget_vec[7] = avg.uw_pressure_work;
+	UW_flux_budget_vec[8] = avg.uw_production_shearU;
+	UW_flux_budget_vec[9] = avg.uw_production_shearW;
+
+	write_tecplot_1d(stats.output.UW_FLUX_BUDGET_PZ_FILE, index,
+		UW_flux_budget_vec, UW_flux_budget_name, 10, axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [v'w' budget] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* VW_flux_budget_name[] = { "[v'w'] balance",
+		"[v'w'] production", "[v'w'] diffusion", "[v'w'] pressure-strain",
+		"[v'w'] dissipation", "[v'w'] iso-dissipation",
+		"[v'w'] transport", "[v'w'] pressure work",
+		"[v'w'] production (dV/dz)", "[v'w'] production (dW/dz)" };
+	T *VW_flux_budget_vec[10];
+	VW_flux_budget_vec[0] = avg.vw_budget_balance;
+	VW_flux_budget_vec[1] = avg.vw_production;
+	VW_flux_budget_vec[2] = avg.vw_diffusion;
+	VW_flux_budget_vec[3] = avg.P2Svw_turb;
+	VW_flux_budget_vec[4] = avg.vw_dissipation;
+	VW_flux_budget_vec[5] = avg.vw_iso_dissipation;
+	VW_flux_budget_vec[6] = avg.vw_transport;
+	VW_flux_budget_vec[7] = avg.vw_pressure_work;
+	VW_flux_budget_vec[8] = avg.vw_production_shearV;
+	VW_flux_budget_vec[9] = avg.vw_production_shearW;
+
+	write_tecplot_1d(stats.output.VW_FLUX_BUDGET_PZ_FILE, index,
+		VW_flux_budget_vec, VW_flux_budget_name, 10, axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#endif
+
+
+#ifdef STRATIFICATION
+	// - output [Rotta model] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* Rotta_model_name[] = {
+		"u-Rotta const", "v-Rotta const", "w-Rotta const", "uw-Rotta const",
+		"u-RDT const", "v-RDT const", "w-RDT const", "uw-RDT const",
+		"Rotta-RDT E-const", "Rotta-RDT P-const",
+		"Rotta-buoyancy E-const", "Rotta-buoyancy B-const",
+		"RDT-buoyancy P-const", "RDT-buoyancy B-const",
+		"u-Rotta-TPE const", "v-Rotta-TPE const", "w-Rotta-TPE const" };
+	T *Rotta_model_vec[17];
+	Rotta_model_vec[0] = avg.u_Rotta;
+	Rotta_model_vec[1] = avg.v_Rotta;
+	Rotta_model_vec[2] = avg.w_Rotta;
+	Rotta_model_vec[3] = avg.uw_Rotta;
+	Rotta_model_vec[4] = avg.u_RDT;
+	Rotta_model_vec[5] = avg.v_RDT;
+	Rotta_model_vec[6] = avg.w_RDT;
+	Rotta_model_vec[7] = avg.uw_RDT;
+	Rotta_model_vec[8] = avg.Rotta_RDT_e;
+	Rotta_model_vec[9] = avg.Rotta_RDT_p;
+	Rotta_model_vec[10] = avg.Rotta_buoyancy_e;
+	Rotta_model_vec[11] = avg.Rotta_buoyancy_b;
+	Rotta_model_vec[12] = avg.RDT_buoyancy_p;
+	Rotta_model_vec[13] = avg.RDT_buoyancy_b;
+	Rotta_model_vec[14] = avg.u_Rotta_TPE;
+	Rotta_model_vec[15] = avg.v_Rotta_TPE;
+	Rotta_model_vec[16] = avg.w_Rotta_TPE;
+
+	write_tecplot_1d(stats.output.ROTTA_MODEL_PZ_FILE, index,
+		Rotta_model_vec, Rotta_model_name, 17, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#else
+	// - output [Rotta model] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* Rotta_model_name[] = {
+		"u-Rotta const", "v-Rotta const", "w-Rotta const", "uw-Rotta const",
+		"u-RDT const", "v-RDT const", "w-RDT const", "uw-RDT const",
+		"Rotta-RDT E-const", "Rotta-RDT P-const" };
+	T *Rotta_model_vec[10];
+	Rotta_model_vec[0] = avg.u_Rotta;
+	Rotta_model_vec[1] = avg.v_Rotta;
+	Rotta_model_vec[2] = avg.w_Rotta;
+	Rotta_model_vec[3] = avg.uw_Rotta;
+	Rotta_model_vec[4] = avg.u_RDT;
+	Rotta_model_vec[5] = avg.v_RDT;
+	Rotta_model_vec[6] = avg.w_RDT;
+	Rotta_model_vec[7] = avg.uw_RDT;
+	Rotta_model_vec[8] = avg.Rotta_RDT_e;
+	Rotta_model_vec[9] = avg.Rotta_RDT_p;
+
+	write_tecplot_1d(stats.output.ROTTA_MODEL_PZ_FILE, index,
+		Rotta_model_vec, Rotta_model_name, 10, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#endif
+
+
+#ifdef STRATIFICATION
+	// - output [T(z)] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.T_AVERAGE_PZ_FILE, index,
+		avg.Tc, "T", axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [sqrt(T'T')] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.T_DEVIATION_PZ_FILE, index,
+		avg.T_deviation, "(T'T')^1/2", axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [T'ui'] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.TU_FLUX_PZ_FILE, index,
+		avg.TU_flux, "T'u'", axisZ, nodeC, grid, current_time);
+	write_tecplot_1d(stats.output.TV_FLUX_PZ_FILE, index,
+		avg.TV_flux, "T'v'", axisZ, nodeC, grid, current_time);
+	write_tecplot_1d(stats.output.TW_FLUX_PZ_FILE, index,
+		avg.TW_flux, "T'w'", axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [dT/dz] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.T_GRAD_PZ_FILE, index,
+		avg.T_grad, "dT/dz", axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [T'dp'/dxi] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.T_PRESSURE_GRADIENT_U_PZ_FILE, index,
+		avg.T_dpdx_turb, "T'*dp'/dx", axisZ, nodeC, grid, current_time);
+	write_tecplot_1d(stats.output.T_PRESSURE_GRADIENT_V_PZ_FILE, index,
+		avg.T_dpdy_turb, "T'*dp'/dy", axisZ, nodeC, grid, current_time);
+	write_tecplot_1d(stats.output.T_PRESSURE_GRADIENT_W_PZ_FILE, index,
+		avg.T_dpdz_turb, "T'*dp'/dz", axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+#ifdef COMPUTE_XT_AVERAGES
+	// - output [T(y,z)] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_2d(stats.output.T_AVERAGE_PYZ_FILE, index,
+		avg.Tyz, "T(y,z)", axisYZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [sqrt(T'T') on -yz plane] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_2d(stats.output.T_DEVIATION_PYZ_FILE, index,
+		avg.Tyz_deviation, "(T'T')^1/2[y,z]", axisYZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [T'w' on -yz plane] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_2d(stats.output.TW_FLUX_PYZ_FILE, index,
+		avg.TWyz_flux, "T'w'[y,z]", axisYZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [dT/dz on -yz plane] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_2d(stats.output.T_GRAD_PYZ_FILE, index,
+		avg.Tyz_grad, "dT/dz[y,z]", axisYZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#endif
+
+
+	// - output [heat flux balance] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* heat_balance_name[] = { "heat balance",
+		"turbulent heat flux", "heat stress" };
+	T *heat_balance_vec[3];
+	heat_balance_vec[0] = avg.heat_balance;
+	heat_balance_vec[1] = avg.turbulent_heat_flux;
+	heat_balance_vec[2] = avg.heat_stress;
+
+	write_tecplot_1d(stats.output.HEAT_BALANCE_PZ_FILE, index,
+		heat_balance_vec, heat_balance_name, 3, axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - output [temperature variance balance] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* TVA_balance_name[] = { "TVA balance",
+		"TVA production", "TVA transport",
+		"TVA dissipation", "TVA iso-dissipation" };
+	T* TVA_balance_vec[5];
+	TVA_balance_vec[0] = avg.TVA_balance;
+	TVA_balance_vec[1] = avg.TVA_production;
+	TVA_balance_vec[2] = avg.TVA_transport;
+	TVA_balance_vec[3] = avg.TVA_dissipation;
+	TVA_balance_vec[4] = avg.TVA_iso_dissipation;
+
+	write_tecplot_1d(stats.output.TVA_BALANCE_PZ_FILE, index,
+		TVA_balance_vec, TVA_balance_name, 5, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - output [TPE balance] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* TPE_balance_name[] = { "TPE balance",
+		"TPE heat flux", "TPE transport",
+		"TPE dissipation", "TPE iso-dissipation" };
+	T *TPE_balance_vec[5];
+	TPE_balance_vec[0] = avg.TPE_balance;
+	TPE_balance_vec[1] = avg.TPE_heat_flux;
+	TPE_balance_vec[2] = avg.TPE_transport;
+	TPE_balance_vec[3] = avg.TPE_dissipation;
+	TPE_balance_vec[4] = avg.TPE_iso_dissipation;
+
+	write_tecplot_1d(stats.output.TPE_BALANCE_PZ_FILE, index,
+		TPE_balance_vec, TPE_balance_name, 5, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - output [T'u' balance] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* TU_flux_budget_name[] = { "[T'u'] balance",
+		"[T'u'] production", "[T'u'] diffusion", "[T'u'] pressure-grad(T)",
+		"[T'u'] dissipation",
+		"[T'u'] transport", "[T'u'] pressure work",
+		"[T'u'] production [dU/dz]", "[T'u'] production [dT/dz]" };
+	T *TU_flux_budget_vec[9];
+	TU_flux_budget_vec[0] = avg.Tu_budget_balance;
+	TU_flux_budget_vec[1] = avg.Tu_production;
+	TU_flux_budget_vec[2] = avg.Tu_diffusion;
+	TU_flux_budget_vec[3] = avg.Tu_pressure_gradT;
+	TU_flux_budget_vec[4] = avg.Tu_dissipation;
+	TU_flux_budget_vec[5] = avg.Tu_transport;
+	TU_flux_budget_vec[6] = avg.Tu_pressure_work;
+	TU_flux_budget_vec[7] = avg.Tu_production_shear;
+	TU_flux_budget_vec[8] = avg.Tu_production_gradT;
+
+	write_tecplot_1d(stats.output.TU_FLUX_BUDGET_PZ_FILE, index,
+		TU_flux_budget_vec, TU_flux_budget_name, 9, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - output [T'v' balance] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* TV_flux_budget_name[] = { "[T'v'] balance",
+		"[T'v'] production", "[T'v'] diffusion", "[T'v'] pressure-grad(T)",
+		"[T'v'] dissipation",
+		"[T'v'] transport", "[T'v'] pressure work",
+		"[T'v'] production [dV/dz]", "[T'v'] production [dT/dz]" };
+	T *TV_flux_budget_vec[9];
+	TV_flux_budget_vec[0] = avg.Tv_budget_balance;
+	TV_flux_budget_vec[1] = avg.Tv_production;
+	TV_flux_budget_vec[2] = avg.Tv_diffusion;
+	TV_flux_budget_vec[3] = avg.Tv_pressure_gradT;
+	TV_flux_budget_vec[4] = avg.Tv_dissipation;
+	TV_flux_budget_vec[5] = avg.Tv_transport;
+	TV_flux_budget_vec[6] = avg.Tv_pressure_work;
+	TV_flux_budget_vec[7] = avg.Tv_production_shear;
+	TV_flux_budget_vec[8] = avg.Tv_production_gradT;
+
+	write_tecplot_1d(stats.output.TV_FLUX_BUDGET_PZ_FILE, index,
+		TV_flux_budget_vec, TV_flux_budget_name, 9, axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - output [T'w' balance] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* TW_flux_budget_name[] = { "[T'w'] balance",
+		"[T'w'] production", "[T'w'] diffusion", "[T'w'] pressure-grad(T)",
+		"[T'w'] buoyancy",
+		"[T'w'] dissipation",
+		"[T'w'] transport", "[T'w'] pressure work",
+		"[T'w'] production [dW/dz]", "[T'w'] production [dT/dz]" };
+	T *TW_flux_budget_vec[10];
+	TW_flux_budget_vec[0] = avg.Tw_budget_balance;
+	TW_flux_budget_vec[1] = avg.Tw_production;
+	TW_flux_budget_vec[2] = avg.Tw_diffusion;
+	TW_flux_budget_vec[3] = avg.Tw_pressure_gradT;
+	TW_flux_budget_vec[4] = avg.Tw_buoyancy;
+	TW_flux_budget_vec[5] = avg.Tw_dissipation;
+	TW_flux_budget_vec[6] = avg.Tw_transport;
+	TW_flux_budget_vec[7] = avg.Tw_pressure_work;
+	TW_flux_budget_vec[8] = avg.Tw_production_shear;
+	TW_flux_budget_vec[9] = avg.Tw_production_gradT;
+
+	write_tecplot_1d(stats.output.TW_FLUX_BUDGET_PZ_FILE, index,
+		TW_flux_budget_vec, TW_flux_budget_name, 10, axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#endif
+
+
+	// - output [turbulent time scale] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.TIME_SCALE_TURBULENT_PZ_FILE, index,
+		avg.time_scale_turbulent, "T, turbulent time scale", axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+	// - output [mixing, Kolmogorov length scales] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.LENGTH_SCALE_MIXING_PZ_FILE, index,
+		avg.length_scale_mixing, "L, mixing length scale", axisZ, nodeW, grid, current_time);
+
+	write_tecplot_1d(stats.output.LENGTH_SCALE_KOLMOGOROV_PZ_FILE, index,
+		avg.length_scale_kolmogorov, "L, Kolmogorov length scale", axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+#ifdef STRATIFICATION
+	// - output [Pr-turbulent, Ri-gradient, Ri-flux, Re-buoyancy, Fr-horizontal] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.PRANDTL_TURBULENT_PZ_FILE, index,
+		avg.Prandtl_turbulent, "Pr-turbulent", axisZ, nodeW, grid, current_time);
+
+	write_tecplot_1d(stats.output.RICHARDSON_GRADIENT_PZ_FILE, index,
+		avg.Richardson_gradient, "Ri-gradient", axisZ, nodeW, grid, current_time);
+#ifdef COMPUTE_XT_AVERAGES
+	write_tecplot_2d(stats.output.RICHARDSON_GRADIENT_PYZ_FILE, index,
+		avg.Richardson_gradient_yz, "Ri-gradient(y,z)", axisYZ, nodeW, grid, current_time);
+#endif
+
+	write_tecplot_1d(stats.output.RICHARDSON_FLUX_PZ_FILE, index,
+		avg.Richardson_flux, "Ri-flux", axisZ, nodeW, grid, current_time);
+
+	write_tecplot_1d(stats.output.REYNOLDS_BUOYANCY_PZ_FILE, index,
+		avg.Reynolds_buoyancy, "Re-buoyancy", axisZ, nodeC, grid, current_time);
+
+	write_tecplot_1d(stats.output.FROUDE_HORIZONTAL_PZ_FILE, index,
+		avg.Froude_horizontal, "Fr-horizontal", axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - output [temperature variance time scale] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.TIME_SCALE_TVARIANCE_PZ_FILE, index,
+		avg.time_scale_Tvariance, "T, T-variance time scale", axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - output [Ellison, Ozmidov, Obukhov length scales] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.LENGTH_SCALE_ELLISON_PZ_FILE, index,
+		avg.length_scale_ellison, "L, Ellison length scale", axisZ, nodeW, grid, current_time);
+
+	write_tecplot_1d(stats.output.LENGTH_SCALE_OZMIDOV_PZ_FILE, index,
+		avg.length_scale_ozmidov, "L, Ozmidov length scale", axisZ, nodeC, grid, current_time);
+
+	write_tecplot_1d(stats.output.LENGTH_SCALE_OBUKHOV_PZ_FILE, index,
+		avg.length_scale_obukhov, "L, Obukhov length scale", axisZ, nodeW, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - output [mixing efficiency] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.MIXING_EFFICIENCY_PZ_FILE, index,
+		avg.mixing_efficiency, "mixing efficiency", axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+
+	// - output [turbulence production ratio] ...
+	// ---------------------------------------------------------------------------------- //
+	write_tecplot_1d(stats.output.TURB_PRODUCTION_RATIO_PZ_FILE, index,
+		avg.turb_production_ratio, "TKE-TVA production ratio", axisZ, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+
+#endif
+
+
+#ifdef FOURIER_SPECTRUM
+	// - output [energy spectrum] ...
+	// ---------------------------------------------------------------------------------- //
+	const char* DFT_name[] = { "U^2 power spectrum", "V^2 power spectrum", "W^2 power spectrum",
+		"E power spectrum", "wavenumber" };
+	T *DFT_vec[5];
+	DFT_vec[0] = spectrum.DFT_U2_x[0];
+	DFT_vec[1] = spectrum.DFT_U2_x[1];
+	DFT_vec[2] = spectrum.DFT_U2_x[2];
+	DFT_vec[3] = spectrum.DFT_U2_x[3];
+	DFT_vec[4] = spectrum.DFT_wavenumber_x;
+
+	write_tecplot_1d(stats.output.FOURIER_FILE, output.index,
+		DFT_vec, DFT_name, 5, axisX, nodeC, grid, current_time);
+	// ---------------------------------------------------------------------------------- //
+#endif
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/model-user.hpp b/model-user.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6d2b9956ead3bc542dcf816a35bc78cd2577258f
--- /dev/null
+++ b/model-user.hpp
@@ -0,0 +1,55 @@
+#include "model-obj.h"
+
+// ------------------------------- //
+// User post processing            //
+// ------------------------------- //
+
+template< typename T, nse::memType mem >
+void nse::modelObj<T, mem>::user_post_processing()
+{
+	// User hook: invoked at the end of every time step for custom flow processing.
+	// The default implementation is intentionally empty.
+	// ----------------------------------------------------------------------------------------
+	// >
+	// > Grid (see [grid3d.h, wstgrid3d.h] for the grid data declaration):
+	// >	grid.nx, grid.ny, grid.nz -- MPI-local grid dimensions
+	// >	grid.gcx, grid.gcy, grid.gcz -- ghost-cell counts
+	// >		MPI-local inner cells per direction = grid.nx[y,z] - 2 * grid.gcx[y,z]
+	// >	grid.dx, grid.dy -- grid steps in -x, -y directions
+	// >	grid.dz -- grid steps in -z direction (array of size = grid.nz)
+	// >
+	// > Time:
+	// >	current_time -- current integration time mark
+	// >	dt -- time step
+	// >
+	// > Arrays (see [model-obj.h] for model data declarations):
+	// >	main model arrays have size grid.size = (grid.nx * grid.ny * grid.nz)
+	// >	e.g.: U, V, W, Pressure -- velocity components and pressure fields
+	// >	e.g.: Tx -- temperature field
+	// >	array operations on the grid are implemented in [nse3d.cpp]
+	// >
+	// > Flow:
+	// >	flow parameters are declared in struct [fluid] in [model-obj.h]
+	// >
+	// > I/O:
+	// >	input-output subroutines are declared in [nse-io3d.h]
+	// >
+	// ----------------------------------------------------------------------------------------
+}
+
+template< typename T, nse::memType mem >
+void nse::modelObj<T, mem>::user_output()
+{
+	// User hook: extend the regular model output with custom fields.
+	// The default implementation is intentionally empty.
+	// ----------------------------------------------------------------------------------------
+	// >
+	// > Output setup is controlled by the configuration file.
+	// > Example -- writing a 3D array defined at grid-cell centers [nodeC] in tecplot format:
+	// >	write_tecplot_3d("some_file_name.plt", some_index,
+	// >		some_array, "some_array",
+	// >		nodeC, grid, current_time);
+	// >
+	// ----------------------------------------------------------------------------------------
+}
diff --git a/mpi-com.h b/mpi-com.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbed09c459e10a69b6433d287d01050e24376153
--- /dev/null
+++ b/mpi-com.h
@@ -0,0 +1,791 @@
+#pragma once
+
+#include <mpi.h>
+#include <omp.h>
+
+#include "nse-alloc.h"
+#include "mem-stx.h"
+
+namespace nse
+{
+	// * MPI reduction operators * //
+	// single-element all-reduce: every rank receives the reduced value
+	// (overloads without [comm] reduce over the global communicator)
+	template< typename T >
+	T mpi_allreduce(const T in, MPI_Op operation);
+	template< typename T >
+	T mpi_allreduce(const T in, MPI_Op operation, const MPI_Comm comm);
+
+	// in-place single-element all-reduce
+	template< typename T >
+	void mpi_allreduce(T* x, MPI_Op operation);
+	template< typename T >
+	void mpi_allreduce(T* x, MPI_Op operation, const MPI_Comm comm);
+
+	// in-place all-reduce of two independent values (single MPI call)
+	template< typename T >
+	void mpi_allreduce(T* x, T* y, MPI_Op operation);
+	template< typename T >
+	void mpi_allreduce(T* x, T* y, MPI_Op operation, const MPI_Comm comm);
+
+	// in-place all-reduce of three independent values (single MPI call)
+	template< typename T >
+	void mpi_allreduce(T* x, T* y, T* z, MPI_Op operation);
+	template< typename T >
+	void mpi_allreduce(T* x, T* y, T* z, MPI_Op operation, const MPI_Comm comm);
+
+	// element-wise all-reduce of arrays of size [n];
+	// [memIN]/[memOUT] select the residency of the in/out arrays (host or device)
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void mpi_allreduce_vec(const T* in, T* out, const int n,
+		MPI_Op operation);
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void mpi_allreduce_vec(const T* in, T* out, const int n,
+		MPI_Op operation, const MPI_Comm comm);
+
+	// element-wise inclusive prefix reduction (MPI_Scan) of arrays of size [n]
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void mpi_scan_vec(const T* in, T* out, const int n,
+		MPI_Op operation);
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void mpi_scan_vec(const T* in, T* out, const int n,
+		MPI_Op operation, const MPI_Comm comm);
+
+	// in-place element-wise all-reduce of an array of size [n]
+	template< memType mem = memCPU, typename T >
+	void mpi_allreduce_vec(T* inout, const int n,
+		MPI_Op operation);
+	template< memType mem = memCPU, typename T >
+	void mpi_allreduce_vec(T* inout, const int n,
+		MPI_Op operation, const MPI_Comm comm);
+
+	// broadcast [n] elements from rank [host]
+	template< memType mem = memCPU, typename T >
+	void mpi_broadcast(T* x, const int n, const int host);
+	template< memType mem = memCPU, typename T >
+	void mpi_broadcast(T* x, const int n, const int host, const MPI_Comm comm);
+
+	// bool specialization (bool has no portable MPI datatype)
+	void mpi_broadcast(bool* x, const int n, const int host);
+	void mpi_broadcast(bool* x, const int n, const int host, const MPI_Comm comm);
+
+	// gather [n] elements per rank on rank [host]
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void mpi_gather(const T* in, const int n, T* out, const int host);
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void mpi_gather(const T* in, const int n, 
+		T* out, const int host, const MPI_Comm comm);
+
+	// gather variable-length contributions ([nrecv]/[displs] per rank) on rank [host]
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void mpi_gather_vec(const T* in, const int nsend,
+		T* out, const int* nrecv, const int* displs, const int host);
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void mpi_gather_vec(const T* in, const int nsend,
+		T* out, const int* nrecv, const int* displs, const int host, const MPI_Comm comm);
+
+	// gather variable-length contributions of local [size] on rank [host]
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void mpi_gather_vec(const T* in, const int size,
+		T* out, const int host);
+	template< memType memIN = memCPU, memType memOUT = memCPU, typename T >
+	void mpi_gather_vec(const T* in, const int size,
+		T* out, const int host, const MPI_Comm comm);
+
+	// point-to-point send/recv of [n] elements
+	template< memType mem = memCPU, typename T >
+	void mpi_send(const T* x, const int n,
+		const int dest, int tag);
+	template< memType mem = memCPU, typename T >
+	void mpi_send(const T* x, const int n,
+		const int dest, int tag, const MPI_Comm comm);
+
+	template< memType mem = memCPU, typename T >
+	void mpi_recv(T* x, const int n,
+		const int source, const int tag, MPI_Status* status);
+	template< memType mem = memCPU, typename T >
+	void mpi_recv(T* x, const int n,
+		const int source, const int tag, const MPI_Comm comm, MPI_Status* status);
+
+
+	// * MPI data type wrapper  * //
+	// maps T -> MPI_Datatype (e.g. float -> MPI_FLOAT)
+	template< typename T >
+	MPI_Datatype mpi_type();
+
+	// * MPI topology factorization * //
+	// size = mx * my factorization (mx >= my)
+	void mpi_com_dims(int size, int* mx, int* my);
+	// size = mx * my * mz factorization (mx >= my >= mz)
+	void mpi_com_dims(int size, int* mx, int* my, int* mz);
+
+	// * MPI decomposition: size (excluding ghost cells) * //
+	int par_local_size(const int mpi_cx, const int rank_x, const int size_x);
+	int par_local_size_comm(const int mpi_cx, const MPI_Comm comm_x);
+	// * MPI decomposition: size (including ghost cells) * //
+	int par_local_size(const int mpi_nx, const int gcx, const int rank_x, const int size_x);
+	int par_local_size_comm(const int mpi_nx, const int gcx, const MPI_Comm comm_x);
+	// * MPI decomposition: offset (including overlapping ghost cells) * //
+	int par_local_offset(const int mpi_nx, const int gcx, const int rank_x, const int size_x);
+	int par_local_offset_comm(const int mpi_nx, const int gcx, const MPI_Comm comm_x);
+
+	// * MPI-I/O datatype * //
+	// builds file/local MPI views for collective I/O of ndim-dimensional arrays
+	template< typename T, int ndim >	// assumed size of arrays: [ndim] //
+	void mpi_io_write_datatype(MPI_Datatype* file_view, MPI_Datatype* local_view,
+		const int mpi_dim_size[], const int dim_size[], const int nghost[],
+		const MPI_Comm comm[]);	
+
+	template< typename T, int ndim >	// assumed size of arrays: [ndim] //
+	void mpi_io_read_datatype(MPI_Datatype* file_view, MPI_Datatype* local_view,
+		const int mpi_dim_size[], const int dim_size[], const int nghost[],
+		const MPI_Comm comm[]);	
+
+	// * Timer calls * //
+	double timer_init();
+	void timer_update(const double mark, double* timer);
+	void timer_update(const double mark, double* timer_a, double* timer_b);
+}
+
+
+// * MPI reduction operators * //
+// --------------------------- //
+template< typename T >
+inline T nse::mpi_allreduce( // single element reduction
+	const T in, MPI_Op operation)
+{
+	// convenience overload: reduce over the global communicator
+	return mpi_allreduce(in, operation, MPI_COMM_WORLD);
+}
+
+template< typename T >
+inline T nse::mpi_allreduce( // single element reduction
+	const T in, MPI_Op operation, const MPI_Comm comm)
+{
+	// reduce a single value over all ranks of [comm];
+	// every rank receives the reduced result
+	T send_value = in, recv_value;
+	MPI_Allreduce(&send_value, &recv_value, 1,
+		mpi_type< T >(), operation, comm);
+	return recv_value;
+}
+
+template< typename T >
+inline void nse::mpi_allreduce( // single element reduction
+	T* x, MPI_Op operation)
+{
+	// convenience overload: in-place reduction over the global communicator
+	mpi_allreduce(x, operation, MPI_COMM_WORLD);
+}
+
+template< typename T >
+inline void nse::mpi_allreduce( // single element reduction
+	T* x, MPI_Op operation, const MPI_Comm comm)
+{
+	// in-place: (*x) is replaced by the reduction of (*x) over all ranks of [comm]
+	T send_value = (*x);
+	MPI_Allreduce(&send_value, x, 1,
+		mpi_type< T >(), operation, comm);
+}
+
+template< typename T >
+inline void nse::mpi_allreduce( // double element reduction
+	T* x, T* y, MPI_Op operation)
+{
+	// convenience overload: in-place pair reduction over the global communicator
+	mpi_allreduce(x, y, operation, MPI_COMM_WORLD);
+}
+
+template< typename T >
+inline void nse::mpi_allreduce( // double element reduction
+	T* x, T* y, MPI_Op operation, const MPI_Comm comm)
+{
+	// pack both values so a single MPI call reduces the pair
+	T send[2] = { (*x), (*y) };
+	T recv[2];
+
+	MPI_Allreduce(send, recv, 2,
+		mpi_type< T >(), operation, comm);
+
+	(*x) = recv[0];
+	(*y) = recv[1];
+}
+
+template< typename T >
+inline void nse::mpi_allreduce( // triple element reduction
+	T* x, T* y, T* z, MPI_Op operation)
+{
+	// convenience overload: in-place triple reduction over the global communicator
+	mpi_allreduce(x, y, z, operation, MPI_COMM_WORLD);
+}
+
+template< typename T >
+inline void nse::mpi_allreduce( // triple element reduction
+	T* x, T* y, T* z, MPI_Op operation, const MPI_Comm comm)
+{
+	// pack all three values so a single MPI call reduces them together
+	T send[3] = { (*x), (*y), (*z) };
+	T recv[3];
+
+	MPI_Allreduce(send, recv, 3,
+		mpi_type< T >(), operation, comm);
+
+	(*x) = recv[0];
+	(*y) = recv[1];
+	(*z) = recv[2];
+}
+
+template< nse::memType memIN, nse::memType memOUT, typename T >
+inline void nse::mpi_allreduce_vec(
+	const T* in, T* out, const int n, MPI_Op operation)
+{
+	// convenience overload: element-wise reduction over the global communicator
+	mpi_allreduce_vec<memIN, memOUT>(in, out, n, operation, MPI_COMM_WORLD);
+}
+
+template< nse::memType memIN, nse::memType memOUT, typename T >
+inline void nse::mpi_allreduce_vec(
+	const T* in, T* out, const int n, MPI_Op operation, const MPI_Comm comm)
+{
+	// element-wise reduction of [in] (size = n) into [out] over all ranks of [comm];
+	// host-resident arrays are reduced directly, device-resident arrays are
+	// staged through a pooled host buffer [memStx] on the way in and/or out
+
+	if ((memIN == memCPU) && (memOUT == memCPU))
+	{
+		// const_cast instead of a C-style (void*) cast:
+		// MPI_Allreduce does not modify the send buffer
+		MPI_Allreduce(const_cast<T*>(in), out, n, mpi_type< T >(), operation, comm);
+		return;
+	}
+
+#ifndef EXCLUDE_GPU_BRANCH
+	if (memIN == memGPU) {	// memOUT = [memCPU, memGPU]
+		// stage device input on host, then recurse with host-side input
+		T* buf;
+		int buf_id = memStx::get_buf(&buf, n);
+
+		mcopy<memCPU, memGPU>(buf, in, n);
+		mpi_allreduce_vec<memCPU, memOUT>(buf, out, n, operation, comm);
+
+		memStx::free_buf(buf_id);
+		return;
+	}
+
+	if (memOUT == memGPU) {	// memIN = memCPU
+		// reduce on host, then copy the result to the device
+		T* buf;
+		int buf_id = memStx::get_buf(&buf, n);
+
+		mpi_allreduce_vec<memIN, memCPU>(in, buf, n, operation, comm);
+		mcopy<memGPU, memCPU>(out, buf, n);
+
+		memStx::free_buf(buf_id);
+		return;
+	}
+#endif
+}
+
+template< nse::memType memIN, nse::memType memOUT, typename T >
+inline void nse::mpi_scan_vec(
+	const T* in, T* out, const int n, MPI_Op operation)
+{
+	// convenience overload: prefix reduction over the global communicator
+	mpi_scan_vec<memIN, memOUT>(in, out, n, operation, MPI_COMM_WORLD);
+}
+
+template< nse::memType memIN, nse::memType memOUT, typename T >
+inline void nse::mpi_scan_vec(
+	const T* in, T* out, const int n, MPI_Op operation, const MPI_Comm comm)
+{
+	if ((memIN == memCPU) && (memOUT == memCPU))
+	{
+		MPI_Scan((void*)in, out, n, mpi_type< T >(), operation, comm);
+	}
+
+#ifndef EXCLUDE_GPU_BRANCH
+	if ((memIN == memCPU) && (memOUT == memGPU)) {
+		T* buf;
+		int buf_id = memStx::get_buf(&buf, n);
+
+		mpi_scan_vec<memCPU, memCPU>(in, buf, n, operation, comm);
+		mcopy<memGPU, memCPU>(out, buf, n);
+
+		memStx::free_buf(buf_id);
+	}
+	if (memIN == memGPU) {	// memOUT = [memCPU, memGPU]
+		T* buf;
+		int buf_id = memStx::get_buf(&buf, n);
+
+		mcopy<memCPU, memGPU>(buf, in, n);
+		mpi_scan_vec<memCPU, memOUT>(buf, out, n, operation, comm);
+
+		memStx::free_buf(buf_id);
+	}
+#endif
+}
+
+// in-place all-reduce: the reduced result overwrites 'inout' on every rank;
+// with USE_MPI_ALLREDUCE_IN_PLACE the host path uses MPI_IN_PLACE, otherwise
+// the reduction goes through a temporary host buffer
+template< nse::memType mem, typename T >
+inline void nse::mpi_allreduce_vec(
+	T* inout, const int n, MPI_Op operation)
+{
+	mpi_allreduce_vec<mem>(inout, n, operation, MPI_COMM_WORLD);
+}
+
+template< nse::memType mem, typename T >
+inline void nse::mpi_allreduce_vec(
+	T* inout, const int n, MPI_Op operation, const MPI_Comm comm)
+{
+#ifdef USE_MPI_ALLREDUCE_IN_PLACE
+	if (mem == memCPU) {
+		MPI_Allreduce(MPI_IN_PLACE, inout, n,
+			mpi_type< T >(), operation, comm);
+	}
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		T* buf;
+		int buf_id = memStx::get_buf(&buf, n);
+
+		// stage device data on the host, reduce in place there, copy back
+		mcopy<memCPU, memGPU>(buf, inout, n);
+		mpi_allreduce_vec<memCPU>(buf, n, operation, comm);
+		mcopy<memGPU, memCPU>(inout, buf, n);
+
+		memStx::free_buf(buf_id);
+	}
+#endif
+#else
+	// fallback: reduce into a separate host buffer, then copy back to 'inout'
+	T* buf;
+	int buf_id = memStx::get_buf(&buf, n);
+
+	mpi_allreduce_vec<mem, memCPU>(inout, buf, n, operation, comm);
+	mcopy<mem, memCPU>(inout, buf, n);
+
+	memStx::free_buf(buf_id);
+#endif
+}
+
+// broadcast n elements of x from rank 'host' to all ranks of the communicator
+template< nse::memType mem, typename T >
+inline void nse::mpi_broadcast(T* x, const int n, const int host)
+{
+	mpi_broadcast<mem>(x, n, host, MPI_COMM_WORLD);
+}
+
+template< nse::memType mem, typename T >
+inline void nse::mpi_broadcast(T* x, const int n, const int host, const MPI_Comm comm)
+{
+	if (mem == memCPU) MPI_Bcast(x, n, mpi_type< T >(), host, comm);
+
+
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+
+		T* buf;
+		int buf_id = memStx::get_buf(&buf, n);
+
+		// only the root has valid data to stage; other ranks receive into buf
+		int comm_rank;
+		MPI_Comm_rank(comm, &comm_rank);
+		if (comm_rank == host) mcopy<memCPU, mem>(buf, x, n);
+
+		mpi_broadcast<memCPU>(buf, n, host, comm);
+		mcopy<mem, memCPU>(x, buf, n);	// copy-back also runs on the root (values unchanged there)
+
+		memStx::free_buf(buf_id);
+	}
+#endif
+}
+
+inline void nse::mpi_broadcast(bool* x, const int n, const int host)
+{
+	// default-communicator overload
+	mpi_broadcast(x, n, host, MPI_COMM_WORLD);
+}
+
+// broadcast a bool array by packing it into an int buffer
+// (root packs before the broadcast, non-root ranks unpack after it)
+inline void nse::mpi_broadcast(bool* x, const int n, const int host, const MPI_Comm comm)
+{
+	int* pack;
+	const int pack_id = memStx::get_buf(&pack, n);
+
+	int comm_rank;
+	MPI_Comm_rank(comm, &comm_rank);
+
+	if (comm_rank == host) {
+		for (int k = 0; k < n; k++)
+			pack[k] = x[k] ? 1 : 0;
+	}
+
+	mpi_broadcast<memCPU>(pack, n, host, comm);
+
+	if (comm_rank != host) {
+		for (int k = 0; k < n; k++)
+			x[k] = (pack[k] == 1);
+	}
+
+	memStx::free_buf(pack_id);
+}
+
+template< nse::memType memIN, nse::memType memOUT, typename T >
+inline void nse::mpi_gather(const T* in, const int n,
+	T* out, const int host)
+{
+	// default-communicator overload;
+	// NOTE: <memIN, memOUT> must be forwarded explicitly -- memType template
+	// parameters are not deducible from the arguments, so the unqualified
+	// call silently collapsed to the <memCPU, memCPU> defaults
+	mpi_gather<memIN, memOUT>(in, n, out, host, MPI_COMM_WORLD);
+}
+
+// gather n elements from every rank into 'out' on rank 'host'
+// ('out' must hold n * comm_size elements on the root)
+template< nse::memType memIN, nse::memType memOUT, typename T >
+inline void nse::mpi_gather(const T* in, const int n,
+	T* out, const int host, const MPI_Comm comm)
+{
+	if ((memIN == memCPU) && (memOUT == memCPU))
+	{
+		MPI_Gather((void*)in, n, mpi_type< T >(),
+			out, n, mpi_type< T >(), host, comm);
+		return;
+	}
+
+#ifndef EXCLUDE_GPU_BRANCH
+	if (memIN == memGPU) {
+		// stage the device send buffer on the host, then recurse
+		T *buf;
+		int buf_id = memStx::get_buf(&buf, n);
+
+		mcopy<memCPU, memGPU>(buf, in, n);
+		mpi_gather<memCPU, memOUT>(buf, n,
+			out, host, comm);
+
+		memStx::free_buf(buf_id);
+		return;
+	}
+	if (memOUT == memGPU) {
+		T *buf;
+		int buf_id;
+
+		int comm_rank, comm_size;
+		MPI_Comm_rank(comm, &comm_rank);
+		MPI_Comm_size(comm, &comm_size);
+
+		// the root receives n elements from each of comm_size ranks, so the
+		// host staging buffer must hold n * comm_size elements
+		// (was allocated/copied with n only -- truncated result & overflow)
+		if (comm_rank == host) buf_id = memStx::get_buf(&buf, n * comm_size);
+
+		mpi_gather<memIN, memCPU>(in, n,
+			buf, host, comm);
+
+		if (comm_rank == host) {
+			mcopy<memGPU, memCPU>(out, buf, n * comm_size);
+			memStx::free_buf(buf_id);
+		}
+		return;
+	}
+#endif
+
+}
+
+// variable-count gather (MPI_Gatherv): rank k contributes nsend elements,
+// the root receives nrecv[k] elements at offset displs[k] of 'out'
+template< nse::memType memIN, nse::memType memOUT, typename T >
+inline void nse::mpi_gather_vec(const T* in, const int nsend,
+	T* out, const int* nrecv, const int* displs, const int host)
+{
+	mpi_gather_vec<memIN, memOUT>(in, nsend, out, nrecv, displs,
+		host, MPI_COMM_WORLD);
+}
+
+template< nse::memType memIN, nse::memType memOUT, typename T >
+inline void nse::mpi_gather_vec(const T* in, const int nsend,
+	T* out, const int* nrecv, const int* displs,
+	const int host, const MPI_Comm comm)
+{
+	if ((memIN == memCPU) && (memOUT == memCPU))
+	{
+		MPI_Gatherv((void*)in, nsend, mpi_type<T>(),
+			out, (int*)nrecv, (int*)displs, mpi_type<T>(), host, comm);
+		return;
+	}
+
+#ifndef EXCLUDE_GPU_BRANCH
+	if (memIN == memGPU) {
+		// stage the device send buffer on the host, then recurse
+		T* buf;
+		int buf_id = memStx::get_buf(&buf, nsend);
+
+		mcopy<memCPU, memGPU>(buf, in, nsend);
+		mpi_gather_vec<memCPU, memOUT>(buf, nsend,
+			out, nrecv, displs, host, comm);
+
+		memStx::free_buf(buf_id);
+		return;
+	}
+
+	if (memOUT == memGPU) {
+		int comm_rank, comm_size;
+		MPI_Comm_rank(comm, &comm_rank);
+		MPI_Comm_size(comm, &comm_size);
+
+		T* buf;
+		int buf_id, nout;
+
+		if (comm_rank == host) {
+			// total receive size = sum of per-rank counts
+			nout = 0;
+			for (int k = 0; k < comm_size; k++)
+				nout += nrecv[k];
+
+			buf_id = memStx::get_buf(&buf, nout);
+		}
+
+		mpi_gather_vec<memIN, memCPU>(in, nsend,
+			buf, nrecv, displs, host, comm);
+
+		if (comm_rank == host) {
+			mcopy<memGPU, memCPU>(out, buf, nout);
+			memStx::free_buf(buf_id);
+		}
+		return;
+	}
+#endif
+}
+
+template< nse::memType memIN, nse::memType memOUT, typename T >
+inline void nse::mpi_gather_vec(const T* in, const int size,
+	T* out, const int host)
+{
+	mpi_gather_vec<memIN, memOUT>(in, size, out, host, MPI_COMM_WORLD);
+}
+
+// gather variable-sized blocks when per-rank sizes are not known in advance:
+// first gather the counts on the root, build displacements, then MPI_Gatherv
+template< nse::memType memIN, nse::memType memOUT, typename T >
+inline void nse::mpi_gather_vec(const T* in, const int size,
+	T* out, const int host, const MPI_Comm comm)
+{
+	int *nsize, *ndisp;
+	int buf_id[2];
+	int comm_rank, comm_size;
+
+	MPI_Comm_rank(comm, &comm_rank);
+	MPI_Comm_size(comm, &comm_size);
+
+	// count/displacement arrays are needed on the root only
+	if (comm_rank == host) {
+		buf_id[0] = memStx::get_buf(&nsize, comm_size);
+		buf_id[1] = memStx::get_buf(&ndisp, comm_size);
+	}
+
+	// gather each rank's element count on the root
+	// (explicit <memCPU, memCPU>: the counts always live in host memory)
+	int np = size;
+	mpi_gather<memCPU, memCPU>(&np, 1, nsize, host, comm);
+
+	if (comm_rank == host) {
+		// exclusive prefix sum of the counts gives the displacements
+		// (removed dead 'nall' accumulation -- its value was never used)
+		ndisp[0] = 0;
+		for (int i = 1; i < comm_size; i++)
+			ndisp[i] = ndisp[i - 1] + nsize[i - 1];
+	}
+
+	mpi_gather_vec<memIN, memOUT>(in, size, out, nsize, ndisp, host, comm);
+
+	if (comm_rank == host) {
+		memStx::free_buf(buf_id[0]);
+		memStx::free_buf(buf_id[1]);
+	}
+}
+
+template< nse::memType mem, typename T >
+inline void nse::mpi_send(const T* x, const int n,
+	const int dest, int tag)
+{
+	// default-communicator overload
+	mpi_send<mem>(x, n, dest, tag, MPI_COMM_WORLD);
+}
+// blocking point-to-point send; device data is staged through a host buffer
+template< nse::memType mem, typename T >
+inline void nse::mpi_send(const T* x, const int n,
+	const int dest, int tag, const MPI_Comm comm)
+{
+	if (mem == memCPU)
+		MPI_Send((void*)x, n, mpi_type<T>(), dest, tag, comm);
+
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		T* staging;
+		const int staging_id = memStx::get_buf(&staging, n);
+
+		mcopy<memCPU, mem>(staging, x, n);
+		mpi_send<memCPU>(staging, n, dest, tag, comm);
+
+		memStx::free_buf(staging_id);
+	}
+#endif
+}
+
+template< nse::memType mem, typename T >
+inline void nse::mpi_recv(T* x, const int n,
+	const int source, const int tag, MPI_Status* status)
+{
+	// default-communicator overload
+	mpi_recv<mem>(x, n, source, tag, MPI_COMM_WORLD, status);
+}
+// blocking point-to-point receive; device data is staged through a host buffer
+template< nse::memType mem, typename T >
+inline void nse::mpi_recv(T* x, const int n,
+	const int source, const int tag, const MPI_Comm comm, MPI_Status* status)
+{
+	if (mem == memCPU)
+		MPI_Recv(x, n, mpi_type<T>(), source, tag, comm, status);
+
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		T* staging;
+		const int staging_id = memStx::get_buf(&staging, n);
+
+		mpi_recv<memCPU>(staging, n, source, tag, comm, status);
+		mcopy<mem, memCPU>(x, staging, n);
+
+		memStx::free_buf(staging_id);
+	}
+#endif
+}
+
+// * MPI data type wrapper * //
+// ------------------------- //
+namespace nse
+{
+	// explicit specializations mapping C++ element types to MPI datatypes
+	// (the primary template is declared elsewhere, presumably in mpi-com.h --
+	// unsupported element types are expected to fail at link time; verify)
+	template< > inline MPI_Datatype mpi_type< float >() { return MPI_FLOAT; }
+	template< > inline MPI_Datatype mpi_type< double >() { return MPI_DOUBLE; }
+	template< > inline MPI_Datatype mpi_type< int >() { return MPI_INT; }
+	template< > inline MPI_Datatype mpi_type< char >() { return MPI_CHAR; }
+	template< > inline MPI_Datatype mpi_type< unsigned char >() { return MPI_UNSIGNED_CHAR; }
+}
+
+// * MPI topology factorization * //
+// ------------------------------ //
+inline void nse::mpi_com_dims(const int size, int* mx, int* my)
+{
+	// let MPI pick a balanced 2d processor grid for 'size' ranks;
+	// zeros mark both dimensions as free for MPI_Dims_create
+	int dims[2] = { 0, 0 };
+
+	MPI_Dims_create(size, 2, dims);
+	(*mx) = dims[0];
+	(*my) = dims[1];
+}
+
+inline void nse::mpi_com_dims(const int size, int* mx, int* my, int* mz)
+{
+	// let MPI pick a balanced 3d processor grid for 'size' ranks
+	int dims[3] = { 0, 0, 0 };
+
+	MPI_Dims_create(size, 3, dims);
+	(*mx) = dims[0];
+	(*my) = dims[1];
+	(*mz) = dims[2];
+}
+
+// * MPI decomposition: size (including ghost cells) * //
+// --------------------------------------------------- //
+inline int nse::par_local_size(
+	const int mpi_nx, const int gcx, const int rank_x, const int size_x)
+{
+	// strip the ghost layers, split the interior, then add the ghosts back
+	const int interior = mpi_nx - 2 * gcx;
+	return par_local_size(interior, rank_x, size_x) + 2 * gcx;
+}
+inline int nse::par_local_size_comm(
+	const int mpi_nx, const int gcx, const MPI_Comm comm_x)
+{
+	// communicator-based overload: resolve rank and size, then delegate
+	int rank_x, size_x;
+	MPI_Comm_rank(comm_x, &rank_x);
+	MPI_Comm_size(comm_x, &size_x);
+
+	return par_local_size(mpi_nx, gcx, rank_x, size_x);
+}
+
+// * MPI decomposition: size (excluding ghost cells) * //
+// --------------------------------------------------- //
+inline int nse::par_local_size(
+	const int mpi_cx, const int rank_x, const int size_x)
+{
+	// even split; the first (mpi_cx % size_x) ranks take one extra cell
+	const int remainder = mpi_cx % size_x;
+	return mpi_cx / size_x + ((rank_x < remainder) ? 1 : 0);
+}
+inline int nse::par_local_size_comm(
+	const int mpi_cx, const MPI_Comm comm_x)
+{
+	// communicator-based overload: resolve rank and size, then delegate
+	int rank_x, size_x;
+	MPI_Comm_rank(comm_x, &rank_x);
+	MPI_Comm_size(comm_x, &size_x);
+
+	return par_local_size(mpi_cx, rank_x, size_x);
+}
+
+// * MPI decomposition: offset (including overlapping ghost cells) * //
+// ----------------------------------------------------------------- //
+inline int nse::par_local_offset(
+	const int mpi_nx, const int gcx, const int rank_x, const int size_x)
+{
+	// offset of this rank's first interior cell in the global interior grid;
+	// closed form of the former O(rank_x) loop: each preceding rank owns cx
+	// cells and the first (mpi_cx % size_x) ranks own one extra cell, so
+	// offset = rank_x * cx + min(rank_x, rem)
+	const int mpi_cx = mpi_nx - 2 * gcx;
+	const int cx = mpi_cx / size_x;
+	const int rem = mpi_cx % size_x;
+
+	return rank_x * cx + ((rank_x < rem) ? rank_x : rem);
+}
+inline int nse::par_local_offset_comm(
+	const int mpi_nx, const int gcx, const MPI_Comm comm_x)
+{
+	// communicator-based overload: resolve rank and size, then delegate
+	int rank_x, size_x;
+	MPI_Comm_rank(comm_x, &rank_x);
+	MPI_Comm_size(comm_x, &size_x);
+
+	return par_local_offset(mpi_nx, gcx, rank_x, size_x);
+}
+
+// * MPI-I/O datatype - write * //
+// ---------------------------- //
+// build the (file_view, local_view) subarray datatype pair used for
+// collective MPI-I/O writes of an ndim-dimensional decomposed array;
+// NOTE(review): both types are committed here -- presumably the caller
+// is responsible for MPI_Type_free; verify at the call sites
+template< typename T, int ndim >
+inline void nse::mpi_io_write_datatype(MPI_Datatype* file_view, MPI_Datatype* local_view,
+	const int mpi_dim_size[], const int dim_size[], const int nghost[],
+	const MPI_Comm comm[])
+{
+	int sub_size[ndim], mpi_pos[ndim], pos[ndim];
+
+	int comm_rank, comm_size, shm, shp;
+	for (int k = 0; k < ndim; k++) {
+		MPI_Comm_rank(comm[k], &comm_rank);
+		MPI_Comm_size(comm[k], &comm_size);
+
+		// first/last ranks of each dimension extend the written region by
+		// their outer ghost layer (shm at the low end, shp at the high end)
+		shm = (comm_rank == 0) ? nghost[k] : 0;
+		shp = (comm_rank == comm_size - 1) ? nghost[k] : 0;
+
+		// sub-size matching for (file,array) datatype pair //
+		sub_size[k] = dim_size[k] - 2 * nghost[k] + shm + shp;
+
+		pos[k] = nghost[k] - shm;
+		mpi_pos[k] = par_local_offset(mpi_dim_size[k], nghost[k],
+			comm_rank, comm_size) + nghost[k] - shm;
+	}
+
+	// File-type: global array //
+	MPI_Type_create_subarray(ndim, (int*)mpi_dim_size, sub_size, mpi_pos,
+		MPI_ORDER_C, mpi_type< T >(), file_view);
+	MPI_Type_commit(file_view);
+
+	// Local-array: excluding ghost cells //
+	MPI_Type_create_subarray(ndim, (int*)dim_size, sub_size, pos,
+		MPI_ORDER_C, mpi_type< T >(), local_view);
+	MPI_Type_commit(local_view);
+}
+
+// * MPI-I/O datatype - read * //
+// --------------------------- //
+// read counterpart of mpi_io_write_datatype: each rank reads its full local
+// array (ghost cells included, so neighbouring file regions overlap);
+// NOTE(review): both types are committed here -- presumably the caller
+// is responsible for MPI_Type_free; verify at the call sites
+template< typename T, int ndim >
+inline void nse::mpi_io_read_datatype(MPI_Datatype* file_view, MPI_Datatype* local_view,
+	const int mpi_dim_size[], const int dim_size[], const int nghost[],
+	const MPI_Comm comm[])
+{
+	int sub_size[ndim], mpi_pos[ndim], pos[ndim];
+
+	int comm_rank, comm_size;
+	for (int k = 0; k < ndim; k++) {
+		MPI_Comm_rank(comm[k], &comm_rank);
+		MPI_Comm_size(comm[k], &comm_size);
+
+		// sub-size matching for (file,array) datatype pair //
+		sub_size[k] = dim_size[k];
+
+		pos[k] = 0;
+		mpi_pos[k] = par_local_offset(mpi_dim_size[k], nghost[k],
+			comm_rank, comm_size);
+	}
+
+	// File-type: global array //
+	MPI_Type_create_subarray(ndim, (int*)mpi_dim_size, sub_size, mpi_pos,
+		MPI_ORDER_C, mpi_type< T >(), file_view);
+	MPI_Type_commit(file_view);
+
+	// Local-array: including ghost cells (overlapping) //
+	MPI_Type_create_subarray(ndim, (int*)dim_size, sub_size, pos,
+		MPI_ORDER_C, mpi_type< T >(), local_view);
+	MPI_Type_commit(local_view);
+}
+
+// * Timer subroutines * //
+// --------------------- //
+inline double nse::timer_init()
+{
+	// timestamp marking the start of a measured interval
+	return omp_get_wtime();
+}
+
+inline void nse::timer_update(const double mark, double* timer)
+{
+	// accumulate the time elapsed since 'mark'
+	(*timer) += omp_get_wtime() - mark;
+}
+inline void nse::timer_update(const double mark,
+	double* timer_a, double* timer_b)
+{
+	// accumulate the same elapsed interval into two counters at once
+	const double elapsed = omp_get_wtime() - mark;
+	(*timer_a) += elapsed;
+	(*timer_b) += elapsed;
+}
diff --git a/mpi-com3d.cpp b/mpi-com3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..70a14753c51659a165a296977a20d06d1bd02026
--- /dev/null
+++ b/mpi-com3d.cpp
@@ -0,0 +1,33 @@
+#include "mpi-com3d.h"
+
+// initialize: mpiCom3d constants
+// (out-of-class definitions of the static members declared in mpi-com3d.h)
+
+// sizes (in elements) of the shared exchange buffers below
+size_t nse::mpiCom3d::exch_size = 0;
+size_t nse::mpiCom3d::exch_size_x = 0;
+size_t nse::mpiCom3d::exch_size_y = 0;
+size_t nse::mpiCom3d::exch_size_z = 0;
+size_t nse::mpiCom3d::exch_size_xy = 0;
+size_t nse::mpiCom3d::exch_size_xz = 0;
+size_t nse::mpiCom3d::exch_size_yz = 0;
+size_t nse::mpiCom3d::exch_size_xyz = 0;
+size_t nse::mpiCom3d::exch_size_sp[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+// shared exchange buffers (lazily sized; see mpi-com3d.h)
+void* nse::mpiCom3d::exch = NULL;
+void* nse::mpiCom3d::exch_x = NULL;
+void* nse::mpiCom3d::exch_y = NULL;
+void* nse::mpiCom3d::exch_z = NULL;
+void* nse::mpiCom3d::exch_xy = NULL;
+void* nse::mpiCom3d::exch_xz = NULL;
+void* nse::mpiCom3d::exch_yz = NULL;
+void* nse::mpiCom3d::exch_xyz = NULL;
+void* nse::mpiCom3d::exch_sp[8] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
+
+// accumulated exchange timers; initializers normalized to a single
+// '0.0' spelling (was a mix of '(double)0' and '(double) 0.0')
+double nse::mpiCom3d::cpu_time_exch = 0.0;
+double nse::mpiCom3d::cpu_time_exch_x = 0.0;
+double nse::mpiCom3d::cpu_time_exch_y = 0.0;
+double nse::mpiCom3d::cpu_time_exch_z = 0.0;
+
+#ifdef _MPI_EXCH3D_USE_SUBARRAY
+MPI_Datatype nse::mpiCom3d::subarray_list[subarray_list_size] = {};
+int nse::mpiCom3d::subarray_info[subarray_list_size][10] = { {} };	// [status, size, subsize, offset]
+int nse::mpiCom3d::subarray_ptr = 0;
+#endif
diff --git a/mpi-com3d.h b/mpi-com3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c1d995e401111fb366e759adbb40a4c06333e12
--- /dev/null
+++ b/mpi-com3d.h
@@ -0,0 +1,3843 @@
+#pragma once
+
+// [mpi-com3d.h(cpp)]: 3D MPI communicator
+//
+// -------------------------------------------------------------------------------------------- //
+// TO DO:
+//	- use non-type template parameters to control omp barriers and memory storage
+//	- use persistent mpi communications (aka mpi_send[recv]_init)
+// NOTES:
+//	- direct corner exchanges are inefficient based on MSU cluster tests
+//	- colored periodic b.c's assume (mpi_nx[y,z] - 2 * gcx[y,z]) is even
+//	- MPI subarray type improves performance only slightly on MSU clusters
+//
+
+
+#include <mpi.h>
+#include <stdlib.h>
+
+#include "nse-sys.h"
+#include "nse-alloc.h"
+#include "mpi-com.h"
+#include "grid-common2d.h"
+#include "grid-common3d.h"
+
+
+//#define _MPI_EXCH3D_USE_SUBARRAY				// use MPI-subarray type for halo exchanges
+#ifdef _MPI_EXCH3D_USE_SUBARRAY
+#define _MPI_EXCH3D_SUBARRAY_MIN_SIZE	512		// min size for MPI-subarray halo exchanges
+#endif
+
+#define _MPI_EXCH3D_MEASURE_TEST_TIME					// define to measure MPI_Test calls
+
+
+
+namespace nse
+{
+	// * MPI communicator: mpiCom3d * //
+	// =======================================================================
+	class mpiCom3d
+	{
+	public:
+		mpiCom3d();
+		mpiCom3d(const mpiCom3d& mpi_com);
+		~mpiCom3d();
+
+		void set(const int ndim);
+		void set(const int _size_x, const int _size_y, const int _size_z);
+		void cleanup();
+
+		void copy(const mpiCom3d& mpi_com);
+		void split_comm(const mpiCom3d& mpi_com,
+			const int modx, const int mody, const int modz);
+
+		static void init();
+		static void clear();
+
+		int rank_id(const int rx, const int ry, const int rz) const;
+
+		int rank_id_x(const int rank) const;
+		int rank_id_y(const int rank) const;
+		int rank_id_z(const int rank) const;
+
+
+		// * gather data * //
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void gather_x(T* _RESTRICT out, const T* _RESTRICT in, const int host, const int nx, const int gcx) const;
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void gather_y(T* _RESTRICT out, const T* _RESTRICT in, const int host, const int ny, const int gcy) const;
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void gather_z(T* _RESTRICT out, const T* _RESTRICT in, const int host, const int nz, const int gcz) const;
+
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void gather_xy(T* _RESTRICT out, const T* _RESTRICT in, const int host,
+			const int nx, const int ny, const int gcx, const int gcy) const;
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void gather_xz(T* _RESTRICT out, const T* _RESTRICT in, const int host,
+			const int nx, const int nz, const int gcx, const int gcz) const;
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void gather_yz(T* _RESTRICT out, const T* _RESTRICT in, const int host,
+			const int ny, const int nz, const int gcy, const int gcz) const;
+
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void gather(T* _RESTRICT out, const T* _RESTRICT in, const int host,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz) const;
+		// ------------------------------------------------------------------ //
+
+		// * scatter data * //
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void scatter_x(T* _RESTRICT out, const T* _RESTRICT in, const int host, const int nx, const int gcx) const;
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void scatter_y(T* _RESTRICT out, const T* _RESTRICT in, const int host, const int ny, const int gcy) const;
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void scatter_z(T* _RESTRICT out, const T* _RESTRICT in, const int host, const int nz, const int gcz) const;
+
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void scatter_xy(T* _RESTRICT out, const T* _RESTRICT in, const int host,
+			const int nx, const int ny, const int gcx, const int gcy) const;
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void scatter_xz(T* _RESTRICT out, const T* _RESTRICT in, const int host,
+			const int nx, const int nz, const int gcx, const int gcz) const;
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void scatter_yz(T* _RESTRICT out, const T* _RESTRICT in, const int host,
+			const int ny, const int nz, const int gcy, const int gcz) const;
+
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void scatter(T* _RESTRICT out, const T* _RESTRICT in, const int host,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz) const;
+		// ------------------------------------------------------------------ //
+
+		// * gather-scatter data from(to) odd(now) processors * //
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void gather_subgrid(T* out, const T* in,    // out[nx, ny, nz] <-- in[sub_nx, sub_ny, sub_nz]
+			const int nx, const int ny, const int nz,
+			const int sub_nx, const int sub_ny, const int sub_nz,
+			const int gcx, const int gcy, const int gcz) const;
+
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void scatter_subgrid(T* out, const T* in,   // out[sub_nx, sub_ny, sub_nz] <-- in[nx, ny, nz]
+			const int nx, const int ny, const int nz,
+			const int sub_nx, const int sub_ny, const int sub_nz,
+			const int gcx, const int gcy, const int gcz) const;
+
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void gather_subgrid_x(T* out, const T* in,    // out[nx] <-- in[sub_nx]
+			const int nx, const int sub_nx, const int gcx) const;
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void gather_subgrid_y(T* out, const T* in,    // out[ny] <-- in[sub_ny]
+			const int ny, const int sub_ny, const int gcy) const;
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void gather_subgrid_z(T* out, const T* in,    // out[nz] <-- in[sub_nz]
+			const int nz, const int sub_nz, const int gcz) const;
+
+		template< memType memOUT = memCPU, memType memIN = memCPU, typename T >
+		void gather_subgrid_line(T* out, const T* in,    // out[nx] <-- in[sub_nx]
+			const int nx, const int sub_nx, const int gcx,
+			const int rank, const int size, const MPI_Comm comm) const;
+		// ------------------------------------------------------------------ //
+
+		// * exchange data * //
+		// --------------------------------------------------------------------------------- //
+
+		// * sync mpi: cross halo ( no corners exchanges ), single sync for -x and -y and -z
+		template< memType mem = memCPU, typename T >
+		void exchange_cross_halo(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z) const;
+
+		// * sync mpi: full halo( including corners ), triple sync for -x, -y and -z
+		template< memType mem = memCPU, typename T >
+		void exchange_halo(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z) const;
+		template< memType mem = memCPU, typename T >
+		void exchange_halo(T* x, T* y,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z) const;
+
+		// * sync mpi: full color halo (including corners), triple sync for -x, -y and -z
+		template< memType mem = memCPU, typename T >
+		void exchange_color_halo(T* x, const int color,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z) const;
+
+		// * sync mpi: cross halo ( no corners exchanges ), single sync for -x and -y and -z
+		template< memType mem = memCPU, typename T >
+		void exchange_cross_halo(T* u, T* v, T* w,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z) const;
+
+		// * sync mpi: full halo( including corners ), combined for u, v, w staggered components
+		//        triple sync for -x[ u ], -y[ v ], -z[ w ]
+		//                        -y[ u ], -z[ v ], -x[ w ] and -z[ u ], -x[ v ], -y[ w ]
+		template< memType mem = memCPU, typename T >
+		void exchange_halo(T* u, T* v, T* w,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z) const;
+
+		// * sync mpi: halo by single coordinate (X)
+		template< memType mem = memCPU, typename T >
+		void exchange_halo_x(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x) const;
+
+		// * sync mpi: halo by single coordinate (Y)
+		template< memType mem = memCPU, typename T >
+		void exchange_halo_y(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_y) const;
+
+		// * sync mpi: halo by single coordinate (Z)
+		template< memType mem = memCPU, typename T >
+		void exchange_halo_z(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_z) const;
+		// --------------------------------------------------------------------------------- //
+
+		// * push-pop exchange data * //
+		// --------------------------------------------------------------------------------- //
+		// * async mpi exchange init: cross halo ( no corners exchanges )
+		template< memType mem = memCPU, typename T >
+		void push_exchange_cross_halo(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z,
+			MPI_Request mpi_req[12]) const;
+
+		// * async mpi exchange finalize: cross halo ( no corners exchanges )
+		template< memType mem = memCPU, typename T >
+		void pop_exchange_cross_halo(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z,
+			MPI_Request mpi_req[12]) const;
+
+		// * async - X mpi exchange init: full halo ( including corners exchanges )
+		template< memType mem = memCPU, typename T >
+		void push_exchange_halo_x(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, MPI_Request mpi_req[4]) const;
+
+		// * async - X mpi exchange finalize: full halo ( including corners exchanges )
+		template< memType mem = memCPU, typename T >
+		void pop_exchange_halo_x(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, MPI_Request mpi_req[4]) const;
+
+		// * async - Y mpi exchange init: full halo ( including corners exchanges )
+		template< memType mem = memCPU, typename T >
+		void push_exchange_halo_y(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_y, MPI_Request mpi_req[4]) const;
+
+		// * async - Y mpi exchange finalize: full halo ( including corners exchanges )
+		template< memType mem = memCPU, typename T >
+		void pop_exchange_halo_y(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_y, MPI_Request mpi_req[4]) const;
+
+		// * async - Z mpi exchange init: full halo ( including corners exchanges )
+		template< memType mem = memCPU, typename T >
+		void push_exchange_halo_z(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_z, MPI_Request mpi_req[4]) const;
+
+		// * async - Z mpi exchange finalize: full halo ( including corners exchanges )
+		template< memType mem = memCPU, typename T >
+		void pop_exchange_halo_z(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_z, MPI_Request mpi_req[4]) const;
+
+		// * async - X mpi exchange push: full color halo ( including corners )
+		template< memType mem = memCPU, typename T >
+		void push_exchange_color_halo_x(T* x, const int color,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, MPI_Request mpi_req[4]) const;
+
+		// * async - X mpi exchange finalize: full color halo ( including corners )
+		template< memType mem = memCPU, typename T >
+		void pop_exchange_color_halo_x(T* x, const int color,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, MPI_Request mpi_req[4]) const;
+
+		// * async - Y mpi exchange push: full color halo ( including corners )
+		template< memType mem = memCPU, typename T >
+		void push_exchange_color_halo_y(T* x, const int color,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_y, MPI_Request mpi_req[4]) const;
+
+		// * async - Y mpi exchange finalize: full color halo ( including corners )
+		template< memType mem = memCPU, typename T >
+		void pop_exchange_color_halo_y(T* x, const int color,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_y, MPI_Request mpi_req[4]) const;
+
+		// * async - Z mpi exchange push: full color halo ( including corners )
+		template< memType mem = memCPU, typename T >
+		void push_exchange_color_halo_z(T* x, const int color,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_z, MPI_Request mpi_req[4]) const;
+
+		// * async - Z mpi exchange finalize: full color halo ( including corners )
+		template< memType mem = memCPU, typename T >
+		void pop_exchange_color_halo_z(T* x, const int color,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_z, MPI_Request mpi_req[4]) const;
+		// --------------------------------------------------------------------------------- //
+
+		// * special async mpi full halo exchanges * //
+		// ----------------------------------------- //
+
+		// * async - mpi exchange push: full halo ( including corners )
+		//		exchange status ( =-1 - error, =0 - finalized, >0 [1=x, 2=y, 3=z] )
+		template< memType mem = memCPU, typename T >
+		void push_exchange_halo(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z,
+			MPI_Request mpi_req[4], int* status) const;
+
+		// * async - mpi exchange ping - advance exchange: full halo ( including corners )
+		//		exchange status ( =-1 - error, =0 - finalized, >0 [1=x, 2=y, 3=z] )
+		template< memType mem = memCPU, typename T >
+		void ping_exchange_halo(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z,
+			MPI_Request mpi_req[4], int* status) const;
+
+		// * async - mpi exchange finalize: full halo ( including corners )
+		//		exchange status ( =-1 - error, =0 - finalized )
+		template< memType mem = memCPU, typename T >
+		void pop_exchange_halo(T* x,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z,
+			MPI_Request mpi_req[4], int* status) const;
+
+		// * async - mpi exchange push: full color halo ( including corners )
+		//		exchange status ( =-1 - error, =0 - finalized, >0 [1=x, 2=y, 3=z] )
+		template< memType mem = memCPU, typename T >
+		void push_exchange_color_halo(T* x, const int color,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z,
+			MPI_Request mpi_req[4], int* status) const;
+
+		// * async - mpi exchange ping - advance exchange: full color halo ( including corners )
+		//		exchange status ( =-1 - error, =0 - finalized, >0 [1=x, 2=y, 3=z] )
+		template< memType mem = memCPU, typename T >
+		void ping_exchange_color_halo(T* x, const int color,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z,
+			MPI_Request mpi_req[4], int* status) const;
+
+		// * async - mpi exchange finalize: full color halo ( including corners )
+		//		exchange status ( =-1 - error, =0 - finalized )
+		template< memType mem = memCPU, typename T >
+		void pop_exchange_color_halo(T* x, const int color,
+			const int nx, const int ny, const int nz,
+			const int gcx, const int gcy, const int gcz,
+			const int hx, const int hy, const int hz,
+			const int periodic_x, const int periodic_y, const int periodic_z,
+			MPI_Request mpi_req[4], int* status) const;
+
+		// * async - test mpi exchange ( any exchange operator )
+		int test_exchange(MPI_Request* mpi_req, const int n_req) const;
+		// --------------------------------------------------------------------------------- //
+
+
+		int offset_x(const int nx, const int gcx) const;
+		int offset_y(const int ny, const int gcy) const;
+		int offset_z(const int nz, const int gcz) const;
+
+
+	public:     // communicator data declared public
+		MPI_Comm comm;
+		MPI_Comm comm_xy, comm_xz, comm_yz;	// additional plane communicators //
+		MPI_Comm comm_x, comm_y, comm_z;	// additional line communicators //
+											// comm free statuses //
+		int free_comm,
+			free_line_comm, free_plane_comm;
+
+		int rank, size;
+		int rank_x, size_x;
+		int rank_y, size_y;
+		int rank_z, size_z;
+
+		// MPI exchange timing
+		static double cpu_time_exch;
+		static double cpu_time_exch_x;
+		static double cpu_time_exch_y;
+		static double cpu_time_exch_z;
+
+	private:
+		// -------------------------------------------------------------------------- //
+		// buffer for exchange template operations
+		static void *exch;
+		// directional buffers for exchange template operations
+		static void *exch_x, *exch_y, *exch_z;
+		// corner buffers for exchange template operations
+		static void *exch_xy, *exch_xz, *exch_yz, *exch_xyz;
+		// corner buffers for gathering subgrids 
+		static void *exch_sp[8];
+
+		static size_t exch_size;
+		static size_t exch_size_x, exch_size_y, exch_size_z;
+		static size_t exch_size_xy, exch_size_xz, exch_size_yz;
+		static size_t exch_size_xyz;
+		static size_t exch_size_sp[8];
+
+		static void allocate_memory(void** mem, size_t* mem_size, const size_t new_size);
+		static void deallocate_memory(void* mem, size_t* mem_size);
+
+#ifdef _MPI_EXCH3D_USE_SUBARRAY
+		// MPI-subarray exchanges //
+		static const int subarray_list_size = 256;
+		static MPI_Datatype subarray_list[subarray_list_size];
+		static int subarray_info[subarray_list_size][10];	// [status, size, subsize, offset]
+		static int subarray_ptr;
+
+		static void init_subarray_list();
+		static void free_subarray_list();
+
+		template<typename T>
+		static void get_subarray(MPI_Datatype* subarray, const int nx, const int ny, const int nz,
+			const int ib, const int ie,
+			const int jb, const int je,
+			const int kb, const int ke);
+#endif
+	};
+	// =======================================================================
+
+}
+
+
+// [implementation]: mpiCom3d //
+// =======================================================================
+inline nse::mpiCom3d::mpiCom3d(
+) :
+	rank(0), size(1),
+	rank_x(0), size_x(1),
+	rank_y(0), size_y(1),
+	rank_z(0), size_z(1)
+{
+	free_comm = 0;
+	free_line_comm = 0;
+	free_plane_comm = 0;
+}
+
+inline nse::mpiCom3d::mpiCom3d(
+	const mpiCom3d& mpi_com)
+{
+	copy(mpi_com);
+}
+
+inline nse::mpiCom3d :: ~mpiCom3d()
+{
+	cleanup();
+}
+
+inline void nse::mpiCom3d::set(
+	const int ndim)
+{
+	if ((ndim <= 0) || (ndim > 3)) return;
+
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (ndim == 1) {
+		set(size, 1, 1);
+	}
+	if (ndim == 2) {
+		mpi_com_dims(size, &size_x, &size_y);
+		set(size_x, size_y, 1);
+	}
+	if (ndim == 3) {
+		mpi_com_dims(size, &size_x, &size_y, &size_z);
+		set(size_x, size_y, size_z);
+	}
+}
+
// Build an explicit (_size_x * _size_y * _size_z) process grid over
// MPI_COMM_WORLD. If the requested grid does not match the communicator
// size, falls back to an automatic decomposition of the same effective
// dimensionality (1-, 2- or 3-D) and returns.
inline void nse::mpiCom3d::set(
	const int _size_x, const int _size_y, const int _size_z)
{
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	// requested grid inconsistent with communicator size: auto-decompose //
	if (_size_x * _size_y * _size_z != size)
	{
		if ((_size_y == 1) && (_size_z == 1)) set(1);
		else
			if (_size_z == 1) set(2);
			else
				set(3);

		return;
	}

	// release any previously owned communicators before re-initializing //
	cleanup();
	comm = MPI_COMM_WORLD;
	MPI_Comm_rank(comm, &rank);

	size_x = _size_x;
	size_y = _size_y;
	size_z = _size_z;

	// grid coordinates: -x fastest-varying (see rank_id_*) //
	rank_x = rank_id_x(rank);
	rank_y = rank_id_y(rank);
	rank_z = rank_id_z(rank);

	// plane communicators //
	// color = fixed coordinate; key orders ranks consistently with rank_id() //
	MPI_Comm_split(comm, rank_z, rank_y * size_x + rank_x, &comm_xy);
	MPI_Comm_split(comm, rank_y, rank_z * size_x + rank_x, &comm_xz);
	MPI_Comm_split(comm, rank_x, rank_z * size_y + rank_y, &comm_yz);

	// line communicators //
	MPI_Comm_split(comm, rank_z * size_y + rank_y, rank_x, &comm_x);
	MPI_Comm_split(comm, rank_z * size_x + rank_x, rank_y, &comm_y);
	MPI_Comm_split(comm, rank_y * size_x + rank_x, rank_z, &comm_z);

	// comm itself is MPI_COMM_WORLD and must never be freed //
	free_comm = 0;
	free_line_comm = 1;
	free_plane_comm = 1;
}
+
+inline void nse::mpiCom3d::cleanup()
+{
+	// delete initialized communicators //
+	if (free_comm) {
+		MPI_Comm_free(&comm);
+
+		free_comm = 0;
+	}
+	if (free_line_comm) {
+		MPI_Comm_free(&comm_x);
+		MPI_Comm_free(&comm_y);
+		MPI_Comm_free(&comm_z);
+
+		free_line_comm = 0;
+	}
+	if (free_plane_comm) {
+		MPI_Comm_free(&comm_xy);
+		MPI_Comm_free(&comm_xz);
+		MPI_Comm_free(&comm_yz);
+
+		free_plane_comm = 0;
+	}
+}
+
+inline void nse::mpiCom3d::copy(const mpiCom3d& mpi_com)
+{
+	comm = mpi_com.comm;
+	comm_xy = mpi_com.comm_xy;
+	comm_xz = mpi_com.comm_xz;
+	comm_yz = mpi_com.comm_yz;
+	comm_x = mpi_com.comm_x;
+	comm_y = mpi_com.comm_y;
+	comm_z = mpi_com.comm_z;
+
+	rank = mpi_com.rank;
+	size = mpi_com.size;
+
+	rank_x = mpi_com.rank_x; size_x = mpi_com.size_x;
+	rank_y = mpi_com.rank_y; size_y = mpi_com.size_y;
+	rank_z = mpi_com.rank_z; size_z = mpi_com.size_z;
+
+	free_comm = 0;
+	free_line_comm = 0;
+	free_plane_comm = 0;
+}
+
// Build a coarsened communicator from [mpi_com], keeping ranks whose grid
// coordinates are multiples of (modx, mody, modz) in one color group.
// NOTE(review): every rank calls the splits and computes coarse-grid
// coordinates; presumably the results are only meaningful on ranks with
// color == 1 -- confirm against callers.
inline void nse::mpiCom3d::split_comm(const mpiCom3d& mpi_com,
	const int modx, const int mody, const int modz)
{
	// color = 1 on ranks that belong to the coarse grid, 0 otherwise //
	int color = ((mpi_com.rank_x % modx == 0) &&
		(mpi_com.rank_y % mody == 0) &&
		(mpi_com.rank_z % modz == 0));

	// key = old rank keeps the relative ordering within each color group //
	MPI_Comm_split(mpi_com.comm, color, mpi_com.rank, &comm);
	MPI_Comm_size(comm, &size);
	MPI_Comm_rank(comm, &rank);

	// coarse grid dimensions (ceiling division) //
	size_x = (mpi_com.size_x + (modx - 1)) / modx;
	size_y = (mpi_com.size_y + (mody - 1)) / mody;
	size_z = (mpi_com.size_z + (modz - 1)) / modz;

	// coarse grid coordinates: -x fastest-varying, as in rank_id() //
	rank_x = rank % size_x;
	rank_y = (rank / size_x) % size_y;
	rank_z = (rank / (size_x * size_y)) % size_z;

	// plane communicators //
	MPI_Comm_split(comm, rank_z, rank_y * size_x + rank_x, &comm_xy);
	MPI_Comm_split(comm, rank_y, rank_z * size_x + rank_x, &comm_xz);
	MPI_Comm_split(comm, rank_x, rank_z * size_y + rank_y, &comm_yz);

	// line communicators //
	MPI_Comm_split(comm, rank_z * size_y + rank_y, rank_x, &comm_x);
	MPI_Comm_split(comm, rank_z * size_x + rank_x, rank_y, &comm_y);
	MPI_Comm_split(comm, rank_y * size_x + rank_x, rank_z, &comm_z);

	// all communicators were created here and are owned by this object //
	free_comm = 1;
	free_line_comm = 1;
	free_plane_comm = 1;
}
+
// One-time static initialization: sets up the MPI subarray datatype cache
// when compiled with _MPI_EXCH3D_USE_SUBARRAY; otherwise a no-op.
inline void nse::mpiCom3d::init()
{
#ifdef _MPI_EXCH3D_USE_SUBARRAY
	init_subarray_list();
#endif
}
+
// Release all static exchange scratch buffers (and the subarray datatype
// cache, if enabled). Affects every mpiCom3d instance: the buffers are
// class-static.
// NOTE(review): deallocate_memory() receives the buffer pointer by value,
// so the static pointer itself is not nulled here -- presumably the size
// being reset to 0 makes allocate_memory() re-allocate on next use; confirm.
inline void nse::mpiCom3d::clear()
{
	deallocate_memory(exch, &exch_size);
	deallocate_memory(exch_x, &exch_size_x);
	deallocate_memory(exch_y, &exch_size_y);
	deallocate_memory(exch_z, &exch_size_z);
	deallocate_memory(exch_xy, &exch_size_xy);
	deallocate_memory(exch_xz, &exch_size_xz);
	deallocate_memory(exch_yz, &exch_size_yz);
	deallocate_memory(exch_xyz, &exch_size_xyz);
	// subgrid-gathering corner buffers //
	for (int k = 0; k < 8; k++)
		deallocate_memory(exch_sp[k], &exch_size_sp[k]);

#ifdef _MPI_EXCH3D_USE_SUBARRAY
	free_subarray_list();
#endif
}
+
+inline int nse::mpiCom3d::rank_id(
+	const int rx, const int ry, const int rz) const
+{
+	return rz * size_x * size_y + ry * size_x + rx;
+}
+
// -x grid coordinate of a linear rank (inverse of rank_id(), -x fastest) //
inline int nse::mpiCom3d::rank_id_x(const int rank) const {
	return rank % size_x;
}
// -y grid coordinate of a linear rank (inverse of rank_id()) //
inline int nse::mpiCom3d::rank_id_y(const int rank) const {
	return (rank / size_x) % size_y;
}
// -z grid coordinate of a linear rank (inverse of rank_id(), -z slowest) //
inline int nse::mpiCom3d::rank_id_z(const int rank) const {
	return (rank / (size_x * size_y)) % size_z;
}
+
// Gather a 1-D array decomposed along -x onto rank [host]:
//   in  - local slice of size nx (gcx ghost cells on each side)
//   out - full array of size mpi_nx (written on host only)
// Only ranks sharing host's (y,z) coordinates participate in the gather;
// every rank synchronizes on the final barrier.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::gather_x(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int nx, const int gcx) const
{
	// global -x size: sum of interiors plus the two outer ghost layers //
	int mpi_nx = mpi_allreduce(nx - 2 * gcx, MPI_SUM, comm_x) + 2 * gcx;
	int host_x = rank_id_x(host);
	int host_y = rank_id_y(host);
	int host_z = rank_id_z(host);

	// missing check: [size-decomposition-input], do before gathering! //
	// not thread safe: global array size reduction //

	if ((rank_y == host_y) && (rank_z == host_z)) {	// gathering on -x communicator //

		int i, spos, spnx, shxb, shxe;
		int *pos, *pnx;

		// [pos|pnx]: per-rank destination offset and element count //
		allocate_memory(&exch_x, &exch_size_x, 2 * sizeof(int)*size_x);
		pos = (int*)exch_x;
		pnx = &((int*)exch_x)[size_x];

		for (i = 0; i < size_x; i++) {
			// interior boundaries skip the duplicated ghost layers //
			shxb = (i == 0) ? 0 : gcx;
			shxe = (i == size_x - 1) ? 0 : gcx;

			pos[i] = par_local_offset(mpi_nx, gcx, i, size_x) + shxb;
			pnx[i] = par_local_size(mpi_nx, gcx, i, size_x) - shxb - shxe;
		}

		// local send window: strip ghost layers shared with neighbors //
		shxb = (rank_x == 0) ? 0 : gcx;
		shxe = (rank_x == size_x - 1) ? 0 : gcx;
		spos = shxb;
		spnx = nx - shxb - shxe;

		mpi_gather_vec<memIN, memOUT>(&in[spos], spnx, out, pnx, pos,
			host_x, comm_x);
	}

	MPI_Barrier(comm);
}
+
// Gather a 1-D array decomposed along -y onto rank [host]:
//   in  - local slice of size ny (gcy ghost cells on each side)
//   out - full array of size mpi_ny (written on host only)
// Only ranks sharing host's (x,z) coordinates participate in the gather;
// every rank synchronizes on the final barrier.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::gather_y(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int ny, const int gcy) const
{
	// global -y size: sum of interiors plus the two outer ghost layers //
	int mpi_ny = mpi_allreduce(ny - 2 * gcy, MPI_SUM, comm_y) + 2 * gcy;
	int host_x = rank_id_x(host);
	int host_y = rank_id_y(host);
	int host_z = rank_id_z(host);

	// missing check: [size-decomposition-input], do before gathering! //
	// not thread safe: global array size reduction //

	if ((rank_x == host_x) && (rank_z == host_z)) {	// gathering on -y communicator //

		int j, spos, spny, shyb, shye;
		int *pos, *pny;

		// [pos|pny]: per-rank destination offset and element count //
		allocate_memory(&exch_y, &exch_size_y, 2 * sizeof(int)*size_y);
		pos = (int*)exch_y;
		pny = &((int*)exch_y)[size_y];

		for (j = 0; j < size_y; j++) {
			// interior boundaries skip the duplicated ghost layers //
			shyb = (j == 0) ? 0 : gcy;
			shye = (j == size_y - 1) ? 0 : gcy;

			pos[j] = par_local_offset(mpi_ny, gcy, j, size_y) + shyb;
			pny[j] = par_local_size(mpi_ny, gcy, j, size_y) - shyb - shye;
		}

		// local send window: strip ghost layers shared with neighbors //
		shyb = (rank_y == 0) ? 0 : gcy;
		shye = (rank_y == size_y - 1) ? 0 : gcy;
		spos = shyb;
		spny = ny - shyb - shye;

		mpi_gather_vec<memIN, memOUT>(&in[spos], spny, out, pny, pos,
			host_y, comm_y);
	}

	MPI_Barrier(comm);
}
+
// Gather a 1-D array decomposed along -z onto rank [host]:
//   in  - local slice of size nz (gcz ghost cells on each side)
//   out - full array of size mpi_nz (written on host only)
// Only ranks sharing host's (x,y) coordinates participate in the gather;
// every rank synchronizes on the final barrier.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::gather_z(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int nz, const int gcz) const
{
	// global -z size: sum of interiors plus the two outer ghost layers //
	int mpi_nz = mpi_allreduce(nz - 2 * gcz, MPI_SUM, comm_z) + 2 * gcz;
	int host_x = rank_id_x(host);
	int host_y = rank_id_y(host);
	int host_z = rank_id_z(host);

	// missing check: [size-decomposition-input], do before gathering! //
	// not thread safe: global array size reduction //

	if ((rank_x == host_x) && (rank_y == host_y)) {	// gathering on -z communicator //

		int k, spos, spnz, shzb, shze;
		int *pos, *pnz;

		// [pos|pnz]: per-rank destination offset and element count //
		allocate_memory(&exch_z, &exch_size_z, 2 * sizeof(int)*size_z);
		pos = (int*)exch_z;
		pnz = &((int*)exch_z)[size_z];

		for (k = 0; k < size_z; k++) {
			// interior boundaries skip the duplicated ghost layers //
			shzb = (k == 0) ? 0 : gcz;
			shze = (k == size_z - 1) ? 0 : gcz;

			pos[k] = par_local_offset(mpi_nz, gcz, k, size_z) + shzb;
			pnz[k] = par_local_size(mpi_nz, gcz, k, size_z) - shzb - shze;
		}

		// local send window: strip ghost layers shared with neighbors //
		shzb = (rank_z == 0) ? 0 : gcz;
		shze = (rank_z == size_z - 1) ? 0 : gcz;
		spos = shzb;
		spnz = nz - shzb - shze;

		mpi_gather_vec<memIN, memOUT>(&in[spos], spnz, out, pnz, pos,
			host_z, comm_z);
	}

	MPI_Barrier(comm);
}
+
// Gather a 2-D (x,y) array decomposed over the xy process plane onto rank
// [host]: 'in' is the local nx*ny slice, 'out' the full mpi_nx*mpi_ny array
// (written on host only). Ranks in host's xy-plane (rank_z == host_z) send
// their full local slices point-to-point; all ranks meet at the barrier.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::gather_xy(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int nx, const int ny, const int gcx, const int gcy) const
{
	// global sizes: sum of interiors plus the two outer ghost layers //
	int mpi_nx = mpi_allreduce(nx - 2 * gcx, MPI_SUM, comm_x) + 2 * gcx;
	int mpi_ny = mpi_allreduce(ny - 2 * gcy, MPI_SUM, comm_y) + 2 * gcy;
	int host_z = rank_id_z(host);

	// missing check: [size-decomposition-input], do before gathering! //
	// not thread safe: global array size reduction //

	if (rank == host) // write array on processor with rank = host
	{
		int i, j;
		int posx, posy, pnx, pny, prank;
		int shxb, shxe, shyb, shye;
		T *mem;

		// receive sub array from each processor //
		for (i = 0; i < size_x; i++) {

			// define -x [offset,size] for [i] processor //
			shxb = (i == 0) ? 0 : gcx;
			shxe = (i == size_x - 1) ? 0 : gcx;
			posx = par_local_offset(mpi_nx, gcx, i, size_x) + shxb;
			pnx = par_local_size(mpi_nx, gcx, i, size_x);

			for (j = 0; j < size_y; j++) {

				// define -y [offset,size] for [j] processor //
				shyb = (j == 0) ? 0 : gcy;
				shye = (j == size_y - 1) ? 0 : gcy;
				posy = par_local_offset(mpi_ny, gcy, j, size_y) + shyb;
				pny = par_local_size(mpi_ny, gcy, j, size_y);

				if ((i == rank_x) && (j == rank_y))
				{
					// host's own slice: copy directly, skipping shared ghost layers //
					copy_sub_array<memOUT, memIN>(out, mpi_nx, mpi_ny, posx, posy,
						in, pnx, pny,
						shxb, pnx - shxe - 1,
						shyb, pny - shye - 1);
					continue;
				}

				// receive the [i,j] slice into scratch, then place its interior //
				allocate_memory(&exch_xy, &exch_size_xy, sizeof(T)*pnx*pny);
				mem = (T*)exch_xy;

				prank = rank_id(i, j, host_z);
				MPI_Recv(mem, pnx * pny, mpi_type< T >(), prank, 0,
					comm, MPI_STATUS_IGNORE);

				copy_sub_array<memOUT, memCPU>(out, mpi_nx, mpi_ny, posx, posy,
					mem, pnx, pny,
					shxb, pnx - shxe - 1,
					shyb, pny - shye - 1);
			}
		}
	}
	else
	{
		// non-host ranks in the host xy-plane send their full local slice //
		if (rank_z == host_z)
			mpi_send<memIN>(in, nx * ny, host, 0, comm);
	}

	MPI_Barrier(comm);
}
+
// Gather a 2-D (x,z) array decomposed over the xz process plane onto rank
// [host]: 'in' is the local nx*nz slice, 'out' the full mpi_nx*mpi_nz array
// (written on host only). Ranks in host's xz-plane (rank_y == host_y) send
// their full local slices point-to-point; all ranks meet at the barrier.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::gather_xz(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int nx, const int nz, const int gcx, const int gcz) const
{
	// global sizes: sum of interiors plus the two outer ghost layers //
	int mpi_nx = mpi_allreduce(nx - 2 * gcx, MPI_SUM, comm_x) + 2 * gcx;
	int mpi_nz = mpi_allreduce(nz - 2 * gcz, MPI_SUM, comm_z) + 2 * gcz;
	int host_y = rank_id_y(host);

	// missing check: [size-decomposition-input], do before gathering! //
	// not thread safe: global array size reduction //

	if (rank == host) // write array on processor with rank = host
	{
		int i, k;
		int posx, posz, pnx, pnz, prank;
		int shxb, shxe, shzb, shze;
		T *mem;

		// receive sub array from each processor //
		for (i = 0; i < size_x; i++) {

			// define -x [offset,size] for [i] processor //
			shxb = (i == 0) ? 0 : gcx;
			shxe = (i == size_x - 1) ? 0 : gcx;
			posx = par_local_offset(mpi_nx, gcx, i, size_x) + shxb;
			pnx = par_local_size(mpi_nx, gcx, i, size_x);

			for (k = 0; k < size_z; k++) {

				// define -z [offset,size] for [k] processor //
				shzb = (k == 0) ? 0 : gcz;
				shze = (k == size_z - 1) ? 0 : gcz;
				posz = par_local_offset(mpi_nz, gcz, k, size_z) + shzb;
				pnz = par_local_size(mpi_nz, gcz, k, size_z);

				if ((i == rank_x) && (k == rank_z))
				{
					// host's own slice: copy directly, skipping shared ghost layers //
					copy_sub_array<memOUT, memIN>(out, mpi_nx, mpi_nz, posx, posz,
						in, pnx, pnz,
						shxb, pnx - shxe - 1,
						shzb, pnz - shze - 1);
					continue;
				}

				// receive the [i,k] slice into scratch, then place its interior //
				allocate_memory(&exch_xz, &exch_size_xz, sizeof(T)*pnx*pnz);
				mem = (T*)exch_xz;

				prank = rank_id(i, host_y, k);
				MPI_Recv(mem, pnx * pnz, mpi_type< T >(), prank, 0,
					comm, MPI_STATUS_IGNORE);

				copy_sub_array<memOUT, memCPU>(out, mpi_nx, mpi_nz, posx, posz,
					mem, pnx, pnz,
					shxb, pnx - shxe - 1,
					shzb, pnz - shze - 1);
			}
		}
	}
	else
	{
		// non-host ranks in the host xz-plane send their full local slice //
		if (rank_y == host_y)
			mpi_send<memIN>(in, nx * nz, host, 0, comm);
	}

	MPI_Barrier(comm);
}
+
// Gather a 2-D (y,z) array decomposed over the yz process plane onto rank
// [host]: 'in' is the local ny*nz slice, 'out' the full mpi_ny*mpi_nz array
// (written on host only). Ranks in host's yz-plane (rank_x == host_x) send
// their full local slices point-to-point; all ranks meet at the barrier.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::gather_yz(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int ny, const int nz, const int gcy, const int gcz) const
{
	// global sizes: sum of interiors plus the two outer ghost layers //
	int mpi_ny = mpi_allreduce(ny - 2 * gcy, MPI_SUM, comm_y) + 2 * gcy;
	int mpi_nz = mpi_allreduce(nz - 2 * gcz, MPI_SUM, comm_z) + 2 * gcz;
	int host_x = rank_id_x(host);

	// missing check: [size-decomposition-input], do before gathering! //
	// not thread safe: global array size reduction //

	if (rank == host) // write array on processor with rank = host
	{
		int j, k;
		int posy, posz, pny, pnz, prank;
		int shyb, shye, shzb, shze;
		T *mem;

		// receive sub array from each processor //
		for (j = 0; j < size_y; j++) {

			// define -y [offset,size] for [j] processor //
			shyb = (j == 0) ? 0 : gcy;
			shye = (j == size_y - 1) ? 0 : gcy;
			posy = par_local_offset(mpi_ny, gcy, j, size_y) + shyb;
			pny = par_local_size(mpi_ny, gcy, j, size_y);

			for (k = 0; k < size_z; k++) {

				// define -z [offset,size] for [k] processor //
				shzb = (k == 0) ? 0 : gcz;
				shze = (k == size_z - 1) ? 0 : gcz;
				posz = par_local_offset(mpi_nz, gcz, k, size_z) + shzb;
				pnz = par_local_size(mpi_nz, gcz, k, size_z);

				if ((j == rank_y) && (k == rank_z))
				{
					// host's own slice: copy directly, skipping shared ghost layers //
					copy_sub_array<memOUT, memIN>(out, mpi_ny, mpi_nz, posy, posz,
						in, pny, pnz,
						shyb, pny - shye - 1,
						shzb, pnz - shze - 1);
					continue;
				}

				// receive the [j,k] slice into scratch, then place its interior //
				allocate_memory(&exch_yz, &exch_size_yz, sizeof(T)*pny*pnz);
				mem = (T*)exch_yz;

				prank = rank_id(host_x, j, k);
				MPI_Recv(mem, pny * pnz, mpi_type< T >(), prank, 0,
					comm, MPI_STATUS_IGNORE);

				copy_sub_array<memOUT, memCPU>(out, mpi_ny, mpi_nz, posy, posz,
					mem, pny, pnz,
					shyb, pny - shye - 1,
					shzb, pnz - shze - 1);
			}
		}
	}
	else
	{
		// non-host ranks in the host yz-plane send their full local slice //
		if (rank_x == host_x)
			mpi_send<memIN>(in, ny * nz, host, 0, comm);
	}

	MPI_Barrier(comm);
}
+
// Gather a full 3-D array decomposed over the whole process grid onto rank
// [host]: 'in' is the local nx*ny*nz slice, 'out' the full
// mpi_nx*mpi_ny*mpi_nz array (written on host only). All non-host ranks
// send their slices point-to-point; all ranks meet at the barrier.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::gather(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz) const
{
	// global sizes: sum of interiors plus the two outer ghost layers //
	int mpi_nx = mpi_allreduce(nx - 2 * gcx, MPI_SUM, comm_x) + 2 * gcx;
	int mpi_ny = mpi_allreduce(ny - 2 * gcy, MPI_SUM, comm_y) + 2 * gcy;
	int mpi_nz = mpi_allreduce(nz - 2 * gcz, MPI_SUM, comm_z) + 2 * gcz;

	// missing check: [size-decomposition-input], do before gathering! //
	// not thread safe: global array size reduction //

	if (rank == host) // write array on processor with rank = host
	{
		int i, j, k;
		int posx, posy, posz, pnx, pny, pnz, prank;
		int shxb, shxe, shyb, shye, shzb, shze;
		T *mem;

		// receive sub array from each processor //
		for (i = 0; i < size_x; i++) {

			// define -x [offset,size] for [i] processor //
			shxb = (i == 0) ? 0 : gcx;
			shxe = (i == size_x - 1) ? 0 : gcx;
			posx = par_local_offset(mpi_nx, gcx, i, size_x) + shxb;
			pnx = par_local_size(mpi_nx, gcx, i, size_x);

			for (j = 0; j < size_y; j++) {

				// define -y [offset,size] for [j] processor //
				shyb = (j == 0) ? 0 : gcy;
				shye = (j == size_y - 1) ? 0 : gcy;
				posy = par_local_offset(mpi_ny, gcy, j, size_y) + shyb;
				pny = par_local_size(mpi_ny, gcy, j, size_y);

				for (k = 0; k < size_z; k++) {

					// define -z [offset,size] for [k] processor //
					shzb = (k == 0) ? 0 : gcz;
					shze = (k == size_z - 1) ? 0 : gcz;
					posz = par_local_offset(mpi_nz, gcz, k, size_z) + shzb;
					pnz = par_local_size(mpi_nz, gcz, k, size_z);

					if ((i == rank_x) && (j == rank_y) && (k == rank_z))
					{
						// host's own slice: copy directly, skipping shared ghost layers //
						copy_sub_array<memOUT, memIN>(out, mpi_nx, mpi_ny, mpi_nz, posx, posy, posz,
							in, pnx, pny, pnz,
							shxb, pnx - shxe - 1,
							shyb, pny - shye - 1,
							shzb, pnz - shze - 1);
						continue;
					}

					// receive the [i,j,k] slice into scratch, then place its interior //
					allocate_memory(&exch, &exch_size, sizeof(T)*pnx*pny*pnz);
					mem = (T*)exch;

					prank = rank_id(i, j, k);
					MPI_Recv(mem, pnx * pny * pnz, mpi_type< T >(), prank, 0,
						comm, MPI_STATUS_IGNORE);

					copy_sub_array<memOUT, memCPU>(out, mpi_nx, mpi_ny, mpi_nz, posx, posy, posz,
						mem, pnx, pny, pnz,
						shxb, pnx - shxe - 1,
						shyb, pny - shye - 1,
						shzb, pnz - shze - 1);
				}
			}
		}
	}
	else
	{
		// every non-host rank sends its full local slice //
		mpi_send<memIN>(in, nx * ny * nz, host, 0, comm);
	}

	MPI_Barrier(comm);
}
+
// Scatter a full 1-D array decomposed along -x from rank [host]:
//   in  - full array of size mpi_nx (read on host only)
//   out - local slice of size nx on every rank (slices overlap by gcx ghosts)
// Host sends each -x slice point-to-point to one rank in its yz-plane; the
// slice is then broadcast over comm_yz so every rank gets the slice
// matching its rank_x.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::scatter_x(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int nx, const int gcx) const
{
	// global -x size: sum of interiors plus the two outer ghost layers //
	int mpi_nx = mpi_allreduce(nx - 2 * gcx, MPI_SUM, comm_x) + 2 * gcx;
	int host_y = rank_id_y(host);
	int host_z = rank_id_z(host);

	// missing check: [size-decomposition-input], do before scattering! //
	// not thread safe: global array size reduction //
	// not using MPI_Scatterv() due to overlapping data in sends: check standard //

	if (rank == host) // send sub-arrays on processor with rank = host
	{
		int i, posx, pnx, prank;
		T *mem;

		for (i = 0; i < size_x; i++) {

			// define -x [offset,size] for [i] processor //
			posx = par_local_offset(mpi_nx, gcx, i, size_x);
			pnx = par_local_size(mpi_nx, gcx, i, size_x);

			if (i == rank_x)
			{
				// host's own slice: local copy, no communication //
				mcopy<memOUT, memIN>(out, &in[posx], pnx);
				continue;
			}

			// copy to temporary memory //
			allocate_memory(&exch_x, &exch_size_x, pnx * sizeof(T));
			mem = (T*)exch_x;

			mcopy<memCPU, memIN>(mem, &in[posx], pnx);

			prank = rank_id(i, host_y, host_z);
			MPI_Send(mem, pnx, mpi_type< T >(), prank, 0, comm);
		}
	}
	else
	{
		// one rank per -x coordinate (in host's yz-plane) receives a slice //
		if ((rank_y == host_y) && (rank_z == host_z))
			mpi_recv<memOUT>(out, nx, host, 0, comm, MPI_STATUS_IGNORE);
	}

	// replicate the slice across the yz-plane; root index matches the
	// comm_yz split key (rank_z * size_y + rank_y) //
	mpi_broadcast<memOUT>(out, nx, host_z * size_y + host_y, comm_yz);
	MPI_Barrier(comm);
}
+
// Scatter a full 1-D array decomposed along -y from rank [host]:
//   in  - full array of size mpi_ny (read on host only)
//   out - local slice of size ny on every rank (slices overlap by gcy ghosts)
// Host sends each -y slice point-to-point to one rank in its xz-plane; the
// slice is then broadcast over comm_xz so every rank gets the slice
// matching its rank_y.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::scatter_y(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int ny, const int gcy) const
{
	// global -y size: sum of interiors plus the two outer ghost layers //
	int mpi_ny = mpi_allreduce(ny - 2 * gcy, MPI_SUM, comm_y) + 2 * gcy;
	int host_x = rank_id_x(host);
	int host_z = rank_id_z(host);

	// missing check: [size-decomposition-input], do before scattering! //
	// not thread safe: global array size reduction //
	// not using MPI_Scatterv() due to overlapping data in sends: check standard //

	if (rank == host) // send sub-arrays on processor with rank = host
	{
		int j, posy, pny, prank;
		T *mem;

		for (j = 0; j < size_y; j++) {

			// define -y [offset,size] for [j] processor //
			posy = par_local_offset(mpi_ny, gcy, j, size_y);
			pny = par_local_size(mpi_ny, gcy, j, size_y);

			if (j == rank_y)
			{
				// host's own slice: local copy, no communication //
				mcopy<memOUT, memIN>(out, &in[posy], pny);
				continue;
			}

			// copy to temporary memory //
			allocate_memory(&exch_y, &exch_size_y, pny * sizeof(T));
			mem = (T*)exch_y;

			mcopy<memCPU, memIN>(mem, &in[posy], pny);

			prank = rank_id(host_x, j, host_z);
			MPI_Send(mem, pny, mpi_type< T >(), prank, 0, comm);
		}
	}
	else
	{
		// one rank per -y coordinate (in host's xz-plane) receives a slice //
		if ((rank_x == host_x) && (rank_z == host_z))
			mpi_recv<memOUT>(out, ny, host, 0, comm, MPI_STATUS_IGNORE);
	}

	// replicate the slice across the xz-plane; root index matches the
	// comm_xz split key (rank_z * size_x + rank_x) //
	mpi_broadcast<memOUT>(out, ny, host_z * size_x + host_x, comm_xz);
	MPI_Barrier(comm);
}
+
// Scatter a full 1-D array decomposed along -z from rank [host]:
//   in  - full array of size mpi_nz (read on host only)
//   out - local slice of size nz on every rank (slices overlap by gcz ghosts)
// Host sends each -z slice point-to-point to one rank in its xy-plane; the
// slice is then broadcast over comm_xy so every rank gets the slice
// matching its rank_z.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::scatter_z(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int nz, const int gcz) const
{
	// global -z size: sum of interiors plus the two outer ghost layers //
	int mpi_nz = mpi_allreduce(nz - 2 * gcz, MPI_SUM, comm_z) + 2 * gcz;
	int host_x = rank_id_x(host);
	int host_y = rank_id_y(host);

	// missing check: [size-decomposition-input], do before scattering! //
	// not thread safe: global array size reduction //
	// not using MPI_Scatterv() due to overlapping data in sends: check standard //

	if (rank == host) // send sub-arrays on processor with rank = host
	{
		int k, posz, pnz, prank;
		T *mem;

		for (k = 0; k < size_z; k++) {

			// define -z [offset,size] for [k] processor //
			posz = par_local_offset(mpi_nz, gcz, k, size_z);
			pnz = par_local_size(mpi_nz, gcz, k, size_z);

			if (k == rank_z)
			{
				// host's own slice: local copy, no communication //
				mcopy<memOUT, memIN>(out, &in[posz], pnz);
				continue;
			}

			// copy to temporary memory //
			allocate_memory(&exch_z, &exch_size_z, pnz * sizeof(T));
			mem = (T*)exch_z;

			mcopy<memCPU, memIN>(mem, &in[posz], pnz);

			prank = rank_id(host_x, host_y, k);
			MPI_Send(mem, pnz, mpi_type< T >(), prank, 0, comm);
		}
	}
	else
	{
		// one rank per -z coordinate (in host's xy-plane) receives a slice //
		if ((rank_x == host_x) && (rank_y == host_y))
			mpi_recv<memOUT>(out, nz, host, 0, comm, MPI_STATUS_IGNORE);
	}

	// replicate the slice across the xy-plane; root index matches the
	// comm_xy split key (rank_y * size_x + rank_x) //
	mpi_broadcast<memOUT>(out, nz, host_y * size_x + host_x, comm_xy);
	MPI_Barrier(comm);
}
+
// Scatter a full 2-D (x,y) array decomposed over the xy process plane from
// rank [host]: 'in' is the full mpi_nx*mpi_ny array (read on host only),
// 'out' the local nx*ny slice on every rank. Host sends slices to ranks in
// its own xy-plane; results are then broadcast along -z via comm_z.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::scatter_xy(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int nx, const int ny, const int gcx, const int gcy) const
{
	// global sizes: sum of interiors plus the two outer ghost layers //
	int mpi_nx = mpi_allreduce(nx - 2 * gcx, MPI_SUM, comm_x) + 2 * gcx;
	int mpi_ny = mpi_allreduce(ny - 2 * gcy, MPI_SUM, comm_y) + 2 * gcy;
	int host_z = rank_id_z(host);

	// missing check: [size-decomposition-input], do before scattering! //
	// not thread safe: global array size reduction //

	if (rank == host) // send sub-arrays on processor with rank = host
	{
		int i, j;
		int posx, posy, pnx, pny, prank;
		T *mem;

		for (i = 0; i < size_x; i++) {

			// define -x [offset,size] for [i] processor
			posx = par_local_offset(mpi_nx, gcx, i, size_x);
			pnx = par_local_size(mpi_nx, gcx, i, size_x);

			for (j = 0; j < size_y; j++) {

				// define -y [offset,size] for [j] processor
				posy = par_local_offset(mpi_ny, gcy, j, size_y);
				pny = par_local_size(mpi_ny, gcy, j, size_y);

				if ((i == rank_x) && (j == rank_y))
				{
					// host's own slice: local extraction, no communication //
					get_sub_array<memIN, memOUT>(in, mpi_nx, mpi_ny,
						posx, posx + pnx - 1, posy, posy + pny - 1, out);
					continue;
				}

				// copy to temporary memory
				allocate_memory(&exch_xy, &exch_size_xy, pnx * pny * sizeof(T));
				mem = (T*)exch_xy;

				get_sub_array<memIN, memCPU>(in, mpi_nx, mpi_ny,
					posx, posx + pnx - 1, posy, posy + pny - 1, mem);

				prank = rank_id(i, j, host_z);
				MPI_Send(mem, pnx * pny, mpi_type< T >(), prank, 0, comm);
			}
		}
	}
	else
	{
		// ranks in the host xy-plane receive their slices point-to-point //
		if (rank_z == host_z)
			mpi_recv<memOUT>(out, nx * ny, host, 0, comm, MPI_STATUS_IGNORE);
	}

	// replicate each slice along -z (comm_z rank order follows rank_z) //
	mpi_broadcast<memOUT>(out, nx * ny, host_z, comm_z);
	MPI_Barrier(comm);
}
+
// Scatter a full 2-D (x,z) array decomposed over the xz process plane from
// rank [host]: 'in' is the full mpi_nx*mpi_nz array (read on host only),
// 'out' the local nx*nz slice on every rank. Host sends slices to ranks in
// its own xz-plane; results are then broadcast along -y via comm_y.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::scatter_xz(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int nx, const int nz, const int gcx, const int gcz) const
{
	// global sizes: sum of interiors plus the two outer ghost layers //
	int mpi_nx = mpi_allreduce(nx - 2 * gcx, MPI_SUM, comm_x) + 2 * gcx;
	int mpi_nz = mpi_allreduce(nz - 2 * gcz, MPI_SUM, comm_z) + 2 * gcz;
	int host_y = rank_id_y(host);

	// missing check: [size-decomposition-input], do before scattering! //
	// not thread safe: global array size reduction //

	if (rank == host) // send sub-arrays on processor with rank = host
	{
		int i, k;
		int posx, posz, pnx, pnz, prank;
		T *mem;

		for (i = 0; i < size_x; i++) {

			// define -x [offset,size] for [i] processor
			posx = par_local_offset(mpi_nx, gcx, i, size_x);
			pnx = par_local_size(mpi_nx, gcx, i, size_x);

			for (k = 0; k < size_z; k++) {

				// define -z [offset,size] for [k] processor
				posz = par_local_offset(mpi_nz, gcz, k, size_z);
				pnz = par_local_size(mpi_nz, gcz, k, size_z);

				if ((i == rank_x) && (k == rank_z))
				{
					// host's own slice: local extraction, no communication //
					get_sub_array<memIN, memOUT>(in, mpi_nx, mpi_nz,
						posx, posx + pnx - 1, posz, posz + pnz - 1, out);
					continue;
				}

				// copy to temporary memory
				allocate_memory(&exch_xz, &exch_size_xz, pnx * pnz * sizeof(T));
				mem = (T*)exch_xz;

				get_sub_array<memIN, memCPU>(in, mpi_nx, mpi_nz,
					posx, posx + pnx - 1, posz, posz + pnz - 1, mem);

				prank = rank_id(i, host_y, k);
				MPI_Send(mem, pnx * pnz, mpi_type< T >(), prank, 0, comm);
			}
		}
	}
	else
	{
		// ranks in the host xz-plane receive their slices point-to-point //
		if (rank_y == host_y)
			mpi_recv<memOUT>(out, nx * nz, host, 0, comm, MPI_STATUS_IGNORE);
	}

	// replicate each slice along -y (comm_y rank order follows rank_y) //
	mpi_broadcast<memOUT>(out, nx * nz, host_y, comm_y);
	MPI_Barrier(comm);
}
+
// Scatter a full 2-D (y,z) array decomposed over the yz process plane from
// rank [host]: 'in' is the full mpi_ny*mpi_nz array (read on host only),
// 'out' the local ny*nz slice on every rank. Host sends slices to ranks in
// its own yz-plane; results are then broadcast along -x via comm_x.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::scatter_yz(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int ny, const int nz, const int gcy, const int gcz) const
{
	// global sizes: sum of interiors plus the two outer ghost layers //
	int mpi_ny = mpi_allreduce(ny - 2 * gcy, MPI_SUM, comm_y) + 2 * gcy;
	int mpi_nz = mpi_allreduce(nz - 2 * gcz, MPI_SUM, comm_z) + 2 * gcz;
	int host_x = rank_id_x(host);

	// missing check: [size-decomposition-input], do before scattering! //
	// not thread safe: global array size reduction //

	if (rank == host) // send sub-arrays on processor with rank = host
	{
		int j, k;
		int posy, posz, pny, pnz, prank;
		T *mem;

		for (j = 0; j < size_y; j++) {

			// define -y [offset,size] for [j] processor
			posy = par_local_offset(mpi_ny, gcy, j, size_y);
			pny = par_local_size(mpi_ny, gcy, j, size_y);

			for (k = 0; k < size_z; k++) {

				// define -z [offset,size] for [k] processor
				posz = par_local_offset(mpi_nz, gcz, k, size_z);
				pnz = par_local_size(mpi_nz, gcz, k, size_z);

				if ((j == rank_y) && (k == rank_z)) {
					// host's own slice: local extraction, no communication //
					get_sub_array<memIN, memOUT>(in, mpi_ny, mpi_nz,
						posy, posy + pny - 1, posz, posz + pnz - 1, out);
					continue;
				}

				// copy to temporary memory
				allocate_memory(&exch_yz, &exch_size_yz, pny * pnz * sizeof(T));
				mem = (T*)exch_yz;

				get_sub_array<memIN, memCPU>(in, mpi_ny, mpi_nz,
					posy, posy + pny - 1, posz, posz + pnz - 1, mem);

				prank = rank_id(host_x, j, k);
				MPI_Send(mem, pny * pnz, mpi_type< T >(), prank, 0, comm);
			}
		}
	}
	else
	{
		// ranks in the host yz-plane receive their slices point-to-point //
		if (rank_x == host_x)
			mpi_recv<memOUT>(out, ny * nz, host, 0, comm, MPI_STATUS_IGNORE);
	}

	// replicate each slice along -x (comm_x rank order follows rank_x) //
	mpi_broadcast<memOUT>(out, ny * nz, host_x, comm_x);
	MPI_Barrier(comm);
}
+
// Scatter a full 3-D array decomposed over the whole process grid from rank
// [host]: 'in' is the full mpi_nx*mpi_ny*mpi_nz array (read on host only),
// 'out' the local nx*ny*nz slice on every rank (slices overlap by the ghost
// layers). All non-host ranks receive point-to-point; no broadcast needed.
template< nse::memType memOUT, nse::memType memIN, typename T >
void nse::mpiCom3d::scatter(
	T* _RESTRICT out, const T* _RESTRICT in, const int host,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz) const
{
	// global sizes: sum of interiors plus the two outer ghost layers //
	int mpi_nx = mpi_allreduce(nx - 2 * gcx, MPI_SUM, comm_x) + 2 * gcx;
	int mpi_ny = mpi_allreduce(ny - 2 * gcy, MPI_SUM, comm_y) + 2 * gcy;
	int mpi_nz = mpi_allreduce(nz - 2 * gcz, MPI_SUM, comm_z) + 2 * gcz;

	// missing check: [size-decomposition-input], do before scattering! //
	// not thread safe: global array size reduction //

	if (rank == host) // send sub-arrays on processor with rank = host
	{
		int i, j, k;
		int posx, posy, posz, pnx, pny, pnz, prank;
		T *mem;

		for (i = 0; i < size_x; i++) {

			// define -x [offset,size] for [i] processor
			posx = par_local_offset(mpi_nx, gcx, i, size_x);
			pnx = par_local_size(mpi_nx, gcx, i, size_x);

			for (j = 0; j < size_y; j++) {

				// define -y [offset,size] for [j] processor
				posy = par_local_offset(mpi_ny, gcy, j, size_y);
				pny = par_local_size(mpi_ny, gcy, j, size_y);

				for (k = 0; k < size_z; k++) {

					// define -z [offset,size] for [k] processor
					posz = par_local_offset(mpi_nz, gcz, k, size_z);
					pnz = par_local_size(mpi_nz, gcz, k, size_z);

					if ((i == rank_x) && (j == rank_y) && (k == rank_z))
					{
						// host's own slice: local extraction, no communication //
						get_sub_array<memIN, memOUT>(in, mpi_nx, mpi_ny, mpi_nz,
							posx, posx + pnx - 1, posy, posy + pny - 1, posz, posz + pnz - 1,
							out);
						continue;
					}

					// copy to temporary memory
					allocate_memory(&exch, &exch_size, pnx * pny * pnz * sizeof(T));
					mem = (T*)exch;

					get_sub_array<memIN, memCPU>(in, mpi_nx, mpi_ny, mpi_nz,
						posx, posx + pnx - 1, posy, posy + pny - 1, posz, posz + pnz - 1,
						mem);

					prank = rank_id(i, j, k);
					MPI_Send(mem, pnx * pny * pnz, mpi_type< T >(), prank, 0, comm);
				}
			}
		}
	}
	else
	{
		// every non-host rank receives its full local slice //
		mpi_recv<memOUT>(out, nx * ny * nz, host, 0, comm, MPI_STATUS_IGNORE);
	}

	MPI_Barrier(comm);
}
+
+
+// gather_subgrid(): merge distributed sub-arrays one pairing level up:
+// ranks with all-even (x,y,z) grid coordinates receive the blocks of their
+// +1 neighbors (a 2x2x2 octant, indexed by midx = 4*px + 2*py + pz) and
+// assemble them into out[]; ranks with any odd coordinate pack their block
+// and send it to the all-even base rank of the octant
+// NOTE(review): MPI calls are issued by the OpenMP master thread only;
+// barriers order reuse of the shared exch/exch_sp staging buffers
+template< nse::memType memOUT, nse::memType memIN, typename T >
+void nse::mpiCom3d::gather_subgrid(T* out, const T* in,
+	const int nx, const int ny, const int nz,
+	const int sub_nx, const int sub_ny, const int sub_nz,
+	const int gcx, const int gcy, const int gcz) const
+{
+	// gather array: in[sub_nx, sub_ny, sub_nz] -> out[nx, ny, nz] 
+	// * non-overlapping messages
+
+	double start_time;
+#pragma omp master
+	start_time = timer_init();
+
+	// parity of this rank's grid coordinates: all even = receiver //
+	const int modx = rank_x % 2, mody = rank_y % 2, modz = rank_z % 2;
+
+	// drop gc cells at the high end when a +1 neighbor supplies them //
+	const int shx = (rank_x < size_x - 1) ? gcx : 0;
+	const int shy = (rank_y < size_y - 1) ? gcy : 0;
+	const int shz = (rank_z < size_z - 1) ? gcz : 0;
+
+	if ((!modx) && (!mody) && (!modz))
+	{
+		// placement ranges in out[]: index [0] = own half, [1] = neighbor half //
+		const int ib[2] = { 0, sub_nx - gcx };
+		const int jb[2] = { 0, sub_ny - gcy };
+		const int kb[2] = { 0, sub_nz - gcz };
+		const int ie[2] = { sub_nx - shx - 1, nx - 1 };
+		const int je[2] = { sub_ny - shy - 1, ny - 1 };
+		const int ke[2] = { sub_nz - shz - 1, nz - 1 };
+
+		const int isz[2] = { ie[0] - ib[0] + 1, ie[1] - ib[1] + 1 };
+		const int jsz[2] = { je[0] - jb[0] + 1, je[1] - jb[1] + 1 };
+		const int ksz[2] = { ke[0] - kb[0] + 1, ke[1] - kb[1] + 1 };
+
+		// a +1 partner exists in a direction only if not the last rank //
+		const int plimx = (rank_x < size_x - 1) ? 1 : 0;
+		const int plimy = (rank_y < size_y - 1) ? 1 : 0;
+		const int plimz = (rank_z < size_z - 1) ? 1 : 0;
+
+		int px, py, pz, midx, msize, prank;
+		T* mrecv[8];
+		MPI_Request mpi_req[8];
+
+		// unused octant slots keep MPI_REQUEST_NULL for MPI_Waitall //
+#pragma omp master
+		for (int k = 0; k < 8; k++)
+			mpi_req[k] = MPI_REQUEST_NULL;
+
+
+		for (px = 0; px <= plimx; px++)
+			for (py = 0; py <= plimy; py++)
+				for (pz = 0; pz <= plimz; pz++)
+				{
+					midx = 4 * px + 2 * py + pz;	// octant index of neighbor (+px,+py,+pz) //
+					msize = isz[px] * jsz[py] * ksz[pz];
+
+					// memory allocation on demand - thread safe //
+					allocate_memory(&exch_sp[midx], &exch_size_sp[midx], sizeof(T)* msize);
+					mrecv[midx] = (T*)exch_sp[midx];
+
+					if (midx == 0) { // out <-- in may fail as pointers may overlap
+						get_sub_array<memIN, memCPU>(in, sub_nx, sub_ny, sub_nz,
+							ib[px], ie[px], jb[py], je[py], kb[pz], ke[pz],
+							mrecv[midx]);
+						continue;
+					}
+
+					prank = rank_id(rank_x + px, rank_y + py, rank_z + pz);
+#pragma omp master
+					MPI_Irecv(mrecv[midx], msize, mpi_type< T >(), prank,
+						0, comm, &mpi_req[midx]);
+				}
+
+		// wait until all neighbor blocks are in the staging buffers, then
+		// release all threads to unpack //
+#pragma omp master
+		MPI_Waitall(8, mpi_req, MPI_STATUSES_IGNORE);
+#pragma omp barrier
+
+		for (px = 0; px <= plimx; px++)
+			for (py = 0; py <= plimy; py++)
+				for (pz = 0; pz <= plimz; pz++)
+				{
+					midx = 4 * px + 2 * py + pz;
+					put_sub_array<memOUT, memCPU>(out, nx, ny, nz,
+						ib[px], ie[px], jb[py], je[py], kb[pz], ke[pz],
+						mrecv[midx]);
+				}
+	}
+	else
+	{
+		// odd-coordinate rank: pack its block (trimming the low-side ghost
+		// layer in each odd direction) and send to the octant base rank //
+		const int ib = modx ? gcx : 0;
+		const int ie = modx ? sub_nx - 1 : sub_nx - shx - 1;
+		const int jb = mody ? gcy : 0;
+		const int je = mody ? sub_ny - 1 : sub_ny - shy - 1;
+		const int kb = modz ? gcz : 0;
+		const int ke = modz ? sub_nz - 1 : sub_nz - shz - 1;
+
+		int msize, prank;
+		T *msend;
+
+		msize = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1);
+
+		// memory allocation on demand - thread safe //
+		allocate_memory(&exch, &exch_size, sizeof(T)* msize);
+		msend = (T*)exch;
+
+		get_sub_array<memIN, memCPU>(in, sub_nx, sub_ny, sub_nz,
+			ib, ie, jb, je, kb, ke, msend);
+
+		// barrier: packing must be complete before master posts the send //
+		prank = rank_id(rank_x - modx, rank_y - mody, rank_z - modz);
+#pragma omp barrier
+#pragma omp master
+		MPI_Send(msend, msize, mpi_type< T >(), prank, 0, comm);
+	}
+
+#pragma omp master
+	timer_update(start_time, &cpu_time_exch);
+}
+
+// scatter_subgrid(): inverse of gather_subgrid():
+// all-even ranks cut in[] into a 2x2x2 octant of blocks (ghost layers
+// included) and send them to their +1 neighbors; odd-coordinate ranks
+// receive their block from the octant's base rank
+// NOTE(review): the receive in the odd branch is master-only with no team
+// barrier before out[] is used here - confirm callers synchronize
+template< nse::memType memOUT, nse::memType memIN, typename T >
+void nse::mpiCom3d::scatter_subgrid(T* out, const T* in,
+	const int nx, const int ny, const int nz,
+	const int sub_nx, const int sub_ny, const int sub_nz,
+	const int gcx, const int gcy, const int gcz) const
+{
+	// scatter array: in[nx, ny, nz] -> out[sub_nx, sub_ny, sub_nz] 
+	// * including ghost cells
+
+	double start_time;
+#pragma omp master
+	start_time = timer_init();
+
+	// parity of this rank's grid coordinates: all even = sender //
+	const int modx = rank_x % 2, mody = rank_y % 2, modz = rank_z % 2;
+
+	if ((!modx) && (!mody) && (!modz))
+	{
+		// source ranges in in[]: index [0] = own half, [1] = neighbor half //
+		const int ib[2] = { 0, sub_nx - 2 * gcx };
+		const int jb[2] = { 0, sub_ny - 2 * gcy };
+		const int kb[2] = { 0, sub_nz - 2 * gcz };
+		const int ie[2] = { sub_nx - 1, nx - 1 };
+		const int je[2] = { sub_ny - 1, ny - 1 };
+		const int ke[2] = { sub_nz - 1, nz - 1 };
+
+		const int isz[2] = { ie[0] - ib[0] + 1, ie[1] - ib[1] + 1 };
+		const int jsz[2] = { je[0] - jb[0] + 1, je[1] - jb[1] + 1 };
+		const int ksz[2] = { ke[0] - kb[0] + 1, ke[1] - kb[1] + 1 };
+
+		// a +1 partner exists in a direction only if not the last rank //
+		const int plimx = (rank_x < size_x - 1) ? 1 : 0;
+		const int plimy = (rank_y < size_y - 1) ? 1 : 0;
+		const int plimz = (rank_z < size_z - 1) ? 1 : 0;
+
+		int px, py, pz, midx, msize[8], prank;
+		T *msend[8];
+		MPI_Request mpi_req[8];
+
+		// unused octant slots keep MPI_REQUEST_NULL for MPI_Waitall //
+#pragma omp master
+		for (int k = 0; k < 8; k++)
+			mpi_req[k] = MPI_REQUEST_NULL;
+
+		for (px = 0; px <= plimx; px++)
+			for (py = 0; py <= plimy; py++)
+				for (pz = 0; pz <= plimz; pz++)
+				{
+					midx = 4 * px + 2 * py + pz;	// octant index of neighbor (+px,+py,+pz) //
+					msize[midx] = isz[px] * jsz[py] * ksz[pz];
+
+					// memory allocation on demand - thread safe //
+					allocate_memory(&exch_sp[midx], &exch_size_sp[midx], sizeof(T)* msize[midx]);
+					msend[midx] = (T*)exch_sp[midx];
+
+					get_sub_array<memIN, memCPU>(in, nx, ny, nz,
+						ib[px], ie[px], jb[py], je[py], kb[pz], ke[pz],
+						msend[midx]);
+
+#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
+					// paired mode: send each block as soon as it is packed //
+					if (midx == 0) continue;
+
+					prank = rank_id(rank_x + px, rank_y + py, rank_z + pz);
+#pragma omp barrier
+#pragma omp master
+					MPI_Isend(msend[midx], msize[midx], mpi_type< T >(), prank,
+						0, comm, &mpi_req[midx]);
+#endif
+				}
+
+#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
+		// degenerate octant (no partners): the in-loop barrier never ran //
+		if ((plimx == 0) && (plimy == 0) && (plimz == 0)) {
+#pragma omp barrier
+		}
+		mcopy<memOUT, memCPU>(out, msend[0], msize[0]);	// out <-- in may fail as pointers may overlap
+#else
+		// default mode: pack everything first, then post all sends //
+#pragma omp barrier
+
+		for (px = 0; px <= plimx; px++)
+			for (py = 0; py <= plimy; py++)
+				for (pz = 0; pz <= plimz; pz++)
+				{
+					midx = 4 * px + 2 * py + pz;
+
+					if (midx == 0) {	// out <-- in may fail as pointers may overlap
+						mcopy<memOUT, memCPU>(out, msend[midx], msize[midx]);
+						continue;
+					}
+
+					prank = rank_id(rank_x + px, rank_y + py, rank_z + pz);
+#pragma omp master
+					MPI_Isend(msend[midx], msize[midx], mpi_type< T >(), prank,
+						0, comm, &mpi_req[midx]);
+
+				}
+#endif
+
+		// sends must complete before the staging buffers are reused //
+#pragma omp master
+		MPI_Waitall(8, mpi_req, MPI_STATUSES_IGNORE);
+	}
+	else
+	{
+		// odd-coordinate rank: receive the full local block from the base rank //
+		int msize, prank;
+
+		msize = sub_nx * sub_ny * sub_nz;
+		prank = rank_id(rank_x - modx, rank_y - mody, rank_z - modz);
+#pragma omp master
+		mpi_recv<memOUT>(out, msize, prank, 0, comm, MPI_STATUS_IGNORE);
+	}
+
+#pragma omp master
+	timer_update(start_time, &cpu_time_exch);
+}
+
+template< nse::memType memOUT, nse::memType memIN, typename T >
+void nse::mpiCom3d::gather_subgrid_x(T* out, const T* in,
+	const int nx, const int sub_nx, const int gcx) const
+{
+	// directional gather along -x: only ranks whose -y and -z grid
+	// coordinates are both even take part in the line gather
+	double t_begin;
+#pragma omp master
+	t_begin = timer_init();
+
+	const bool on_gather_line = !(rank_y & 1) && !(rank_z & 1);
+	if (on_gather_line)
+		gather_subgrid_line<memOUT, memIN>(out, in, nx, sub_nx, gcx,
+			rank_x, size_x, comm_x);
+
+#pragma omp master
+	timer_update(t_begin, &cpu_time_exch, &cpu_time_exch_x);
+}
+
+template< nse::memType memOUT, nse::memType memIN, typename T >
+void nse::mpiCom3d::gather_subgrid_y(T* out, const T* in,
+	const int ny, const int sub_ny, const int gcy) const
+{
+	// directional gather along -y: only ranks whose -x and -z grid
+	// coordinates are both even take part in the line gather
+	double t_begin;
+#pragma omp master
+	t_begin = timer_init();
+
+	const bool on_gather_line = !(rank_x & 1) && !(rank_z & 1);
+	if (on_gather_line)
+		gather_subgrid_line<memOUT, memIN>(out, in, ny, sub_ny, gcy,
+			rank_y, size_y, comm_y);
+
+#pragma omp master
+	timer_update(t_begin, &cpu_time_exch, &cpu_time_exch_y);
+}
+
+template< nse::memType memOUT, nse::memType memIN, typename T >
+void nse::mpiCom3d::gather_subgrid_z(T* out, const T* in,
+	const int nz, const int sub_nz, const int gcz) const
+{
+	// directional gather along -z: only ranks whose -x and -y grid
+	// coordinates are both even take part in the line gather
+	double t_begin;
+#pragma omp master
+	t_begin = timer_init();
+
+	const bool on_gather_line = !(rank_x & 1) && !(rank_y & 1);
+	if (on_gather_line)
+		gather_subgrid_line<memOUT, memIN>(out, in, nz, sub_nz, gcz,
+			rank_z, size_z, comm_z);
+
+#pragma omp master
+	timer_update(t_begin, &cpu_time_exch, &cpu_time_exch_z);
+}
+
+
+// gather_subgrid_line(): 1d pairwise gather used by gather_subgrid_x/y/z:
+// even [prank] keeps its own part of in[] and appends the part received
+// from [prank + 1]; odd [prank] packs in[gcx .. sub_nx-1] and sends it to
+// [prank - 1]; out[nx] is assembled on even ranks
+template< nse::memType memOUT, nse::memType memIN, typename T >
+void nse::mpiCom3d::gather_subgrid_line(T* out, const T* in,
+	const int nx, const int sub_nx, const int gcx,
+	const int prank, const int psize, const MPI_Comm pcomm) const
+{
+	const int modx = prank % 2;	// parity: 0 = receiver, 1 = sender //
+	// drop gcx cells at the high end when a [prank+1] partner exists //
+	const int shx = (prank < psize - 1) ? gcx : 0;
+
+	if (!modx)
+	{
+		// destination ranges in out[]: [0] own part, [1] received part //
+		const int ib[2] = { 0, sub_nx - gcx };
+		const int ie[2] = { sub_nx - shx - 1, nx - 1 };
+		const int isz[2] = { ie[0] - ib[0] + 1, ie[1] - ib[1] + 1 };
+
+		// is there a [prank+1] partner to receive from? //
+		const int plimx = (prank < psize - 1) ? 1 : 0;
+
+		int px;
+		MPI_Request mpi_req[2] = { MPI_REQUEST_NULL, MPI_REQUEST_NULL };
+		T* mrecv[2];
+
+		for (px = 0; px <= plimx; px++)
+		{
+			// memory allocation on demand - thread safe //
+			allocate_memory(&exch_sp[px], &exch_size_sp[px], sizeof(T)* isz[px]);
+			mrecv[px] = (T*)exch_sp[px];
+
+			if (px == 0) {	// out <-- in may fail as pointers may overlap
+				mcopy<memCPU, memIN>(mrecv[px], &in[ib[px]], isz[px]);
+				continue;
+			}
+
+#pragma omp master
+			MPI_Irecv(mrecv[px], isz[px], mpi_type< T >(), prank + px,
+				0, pcomm, &mpi_req[px]);
+		}
+
+		// wait for the partner's data, then let all threads unpack //
+#pragma omp master
+		MPI_Wait(&mpi_req[1], MPI_STATUS_IGNORE);
+#pragma omp barrier
+
+		for (px = 0; px <= plimx; px++) {
+			mcopy<memOUT, memCPU>(&out[ib[px]], mrecv[px], isz[px]);
+		}
+	}
+	else
+	{
+		// odd rank: send everything past the low-side ghost layer //
+		int isz = sub_nx - gcx;
+
+		// memory allocation on demand - thread safe //
+		allocate_memory(&exch, &exch_size, sizeof(T)* isz);
+		T *msend = (T*)exch;
+
+		// barrier: packing must be complete before master posts the send //
+		mcopy<memCPU, memIN>(msend, &in[gcx], isz);
+#pragma omp barrier
+#pragma omp master
+		MPI_Send(msend, isz, mpi_type< T >(), prank - 1, 0, pcomm);
+	}
+}
+
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::exchange_cross_halo(
+	T* x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_x, const int periodic_y, const int periodic_z) const
+{
+	// nothing to exchange: single process and no periodic directions //
+	if (size == 1 &&
+		!periodic_x && !periodic_y && !periodic_z)
+	{
+		return;
+	}
+
+	MPI_Request req[12];	// 4 request slots per direction //
+
+	// blocking cross exchange = non-blocking start + completion //
+	push_exchange_cross_halo<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_x, periodic_y, periodic_z, req);
+	pop_exchange_cross_halo<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_x, periodic_y, periodic_z, req);
+}
+
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::exchange_halo(
+	T* x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_x, const int periodic_y, const int periodic_z) const
+{
+	// nothing to exchange: single process and no periodic directions //
+	if (size == 1 &&
+		!periodic_x && !periodic_y && !periodic_z)
+	{
+		return;
+	}
+
+	MPI_Request req[4];	// request slots reused for each direction in turn //
+
+	// -x exchange: start, then complete //
+	push_exchange_halo_x<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_x, req);
+	pop_exchange_halo_x<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_x, req);
+
+	// -y exchange //
+	push_exchange_halo_y<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_y, req);
+	pop_exchange_halo_y<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_y, req);
+
+	// -z exchange //
+	push_exchange_halo_z<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_z, req);
+	pop_exchange_halo_z<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_z, req);
+}
+
+// exchange_halo(x,y): full halo exchange of two arrays, one after another.
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::exchange_halo(
+	T* x, T* y,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_x, const int periodic_y, const int periodic_z) const
+{
+	// fix: forward <mem> explicitly - a non-type template parameter is not
+	// deducible from the call arguments, so the unqualified calls would bind
+	// to the declaration's default memory type and silently ignore the
+	// caller's choice (cf. the three-array exchange_cross_halo() overload,
+	// which forwards <mem>)
+	exchange_halo<mem>(x, nx, ny, nz, gcx, gcy, gcz, 
+		hx, hy, hz, periodic_x, periodic_y, periodic_z);
+	exchange_halo<mem>(y, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_x, periodic_y, periodic_z);
+}
+
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::exchange_cross_halo(
+	T* u, T* v, T* w,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_x, const int periodic_y, const int periodic_z) const
+{
+	// run the single-array cross exchange for u, v, w in sequence //
+	T* const field[3] = { u, v, w };
+	for (int q = 0; q < 3; q++) {
+		exchange_cross_halo<mem>(field[q], nx, ny, nz, gcx, gcy, gcz,
+			hx, hy, hz, periodic_x, periodic_y, periodic_z);
+	}
+}
+
+// exchange_halo(u,v,w): full halo exchange of three arrays with the
+// per-direction stages interleaved, so that a pending exchange of one
+// array overlaps the pack/unpack work of another
+// request layout: mpi_req[0..3], [4..7], [8..11] - one group per exchange
+// in flight; a group is reused only after its pop_...() has completed
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::exchange_halo(
+	T* u, T* v, T* w,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_x, const int periodic_y, const int periodic_z) const
+{
+	// degenerate case //
+	if ((size == 1) &&
+		(!periodic_x) && (!periodic_y) && (!periodic_z)) return;
+
+	MPI_Request mpi_req[12];
+
+	// initialize -x[u], -y[v], -z[w] //
+	push_exchange_halo_x<mem>(u, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_x, mpi_req);
+	push_exchange_halo_y<mem>(v, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_y, &mpi_req[4]);
+	push_exchange_halo_z<mem>(w, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_z, &mpi_req[8]);
+
+	// finalize -x[u], -y[v] //
+	pop_exchange_halo_x<mem>(u, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_x, mpi_req);
+	pop_exchange_halo_y<mem>(v, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_y, &mpi_req[4]);
+
+	// initialize -y[u] //
+	push_exchange_halo_y<mem>(u, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);
+
+	// finalize -z[w] //
+	pop_exchange_halo_z<mem>(w, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_z, &mpi_req[8]);
+
+	// initialize -z[v], -x[w] //
+	push_exchange_halo_z<mem>(v, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_z, &mpi_req[4]);
+	push_exchange_halo_x<mem>(w, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_x, &mpi_req[8]);
+
+	// finalize -y[u], -z[v] //
+	pop_exchange_halo_y<mem>(u, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);
+	pop_exchange_halo_z<mem>(v, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_z, &mpi_req[4]);
+
+	// initialize -z[u] //
+	push_exchange_halo_z<mem>(u, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);
+
+	// finalize -x[w] //
+	pop_exchange_halo_x<mem>(w, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_x, &mpi_req[8]);
+
+	// initialize -x[v], -y[w] //
+	push_exchange_halo_x<mem>(v, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_x, &mpi_req[4]);
+	push_exchange_halo_y<mem>(w, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_y, &mpi_req[8]);
+
+	// finalize -z[u], -x[v], -y[w] //
+	pop_exchange_halo_z<mem>(u, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);
+	pop_exchange_halo_x<mem>(v, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_x, &mpi_req[4]);
+	pop_exchange_halo_y<mem>(w, nx, ny, nz,
+		gcx, gcy, gcz, hx, hy, hz, periodic_y, &mpi_req[8]);
+}
+
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::exchange_color_halo(
+	T* x,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_x, const int periodic_y, const int periodic_z) const
+{
+	// nothing to exchange: single process and no periodic directions //
+	if (size == 1 &&
+		!periodic_x && !periodic_y && !periodic_z)
+	{
+		return;
+	}
+
+	MPI_Request req[4];	// request slots reused for each direction in turn //
+
+	// -x colored exchange: start, then complete //
+	push_exchange_color_halo_x<mem>(x, color, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_x, req);
+	pop_exchange_color_halo_x<mem>(x, color, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_x, req);
+
+	// -y colored exchange //
+	push_exchange_color_halo_y<mem>(x, color, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_y, req);
+	pop_exchange_color_halo_y<mem>(x, color, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_y, req);
+
+	// -z colored exchange //
+	push_exchange_color_halo_z<mem>(x, color, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_z, req);
+	pop_exchange_color_halo_z<mem>(x, color, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_z, req);
+}
+
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::exchange_halo_x(
+	T* x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_x) const
+{
+	// single process in -x and no periodicity: nothing to exchange //
+	if (size_x == 1 && !periodic_x)
+		return;
+
+	MPI_Request req[4];
+
+	// blocking -x exchange = non-blocking start + completion //
+	push_exchange_halo_x<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_x, req);
+	pop_exchange_halo_x<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_x, req);
+}
+
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::exchange_halo_y(
+	T* x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_y) const
+{
+	// single process in -y and no periodicity: nothing to exchange //
+	if (size_y == 1 && !periodic_y)
+		return;
+
+	MPI_Request req[4];
+
+	// blocking -y exchange = non-blocking start + completion //
+	push_exchange_halo_y<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_y, req);
+	pop_exchange_halo_y<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_y, req);
+}
+
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::exchange_halo_z(
+	T* x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_z) const
+{
+	// single process in -z and no periodicity: nothing to exchange //
+	if (size_z == 1 && !periodic_z)
+		return;
+
+	MPI_Request req[4];
+
+	// blocking -z exchange = non-blocking start + completion //
+	push_exchange_halo_z<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_z, req);
+	pop_exchange_halo_z<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, hy, hz, periodic_z, req);
+}
+
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::push_exchange_cross_halo(
+	T* x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_x, const int periodic_y, const int periodic_z,
+	MPI_Request mpi_req[12]) const
+{
+	// degenerate case: mark all request slots completed and exit //
+	if (size == 1 &&
+		!periodic_x && !periodic_y && !periodic_z)
+	{
+#pragma omp master
+		{
+			for (int q = 0; q < 12; q++)
+				mpi_req[q] = MPI_REQUEST_NULL;
+		}
+		return;
+	}
+
+	// start the -x, -y, -z cross exchanges (corner halos excluded by
+	// zeroing the transverse halo widths); 4 request slots per direction //
+	push_exchange_halo_x<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, 0, 0, periodic_x, &mpi_req[0]);
+	push_exchange_halo_y<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		0, hy, 0, periodic_y, &mpi_req[4]);
+	push_exchange_halo_z<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		0, 0, hz, periodic_z, &mpi_req[8]);
+}
+
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::pop_exchange_cross_halo(
+	T* x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_x, const int periodic_y, const int periodic_z,
+	MPI_Request mpi_req[12]) const
+{
+	// degenerate case //
+	if (size == 1) return;
+
+	// complete the -x, -y, -z cross exchanges started by the push call,
+	// using the same request groups and zeroed transverse halo widths //
+	pop_exchange_halo_x<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		hx, 0, 0, periodic_x, &mpi_req[0]);
+	pop_exchange_halo_y<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		0, hy, 0, periodic_y, &mpi_req[4]);
+	pop_exchange_halo_z<mem>(x, nx, ny, nz, gcx, gcy, gcz,
+		0, 0, hz, periodic_z, &mpi_req[8]);
+}
+
+// push_exchange_halo_x(): start the non-blocking -x halo exchange of x[]
+// with the west and east neighbors (or apply periodicity locally when
+// there is a single process in -x); completed by pop_exchange_halo_x()
+// called with the same arguments and the same request array
+// request slots: [0] west recv, [1] east recv, [2] west send, [3] east send
+// message tags: 1 = data moving east (+x), 0 = data moving west (-x)
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::push_exchange_halo_x(
+	T* x,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_x,
+
+	MPI_Request mpi_req[4]) const
+{
+	// pre-set all slots so pop() may wait on unused ones safely //
+#pragma omp master
+	for (int k = 0; k < 4; k++)
+		mpi_req[k] = MPI_REQUEST_NULL;
+
+	if (hx == 0) return;	// exit on zero size message //
+	if (size_x == 1) {		// degenerate case //
+
+		if (periodic_x) {	// apply periodicity within local array
+			apply_periodic_x<mem>(x, nx, ny, nz,
+				gcx, gcy, gcz, hx, hy, hz);
+		}
+		return;
+	}
+
+	double start_time;
+#pragma omp master
+	start_time = timer_init();
+
+
+	// a boundary rank exchanges only when -x is periodic //
+	const int is_west_exch = ((rank_x > 0) ||
+		((rank_x == 0) && (periodic_x)));
+	const int is_east_exch = ((rank_x < size_x - 1) ||
+		((rank_x == size_x - 1) && (periodic_x)));
+
+	const int msx = hx *
+		(ny - ((gcy - hy) << 1)) * (nz - ((gcz - hz) << 1));    // message size in -x communicator
+
+																// initialize -x MPI exchanges //
+																// --------------------------- //
+#ifdef _MPI_EXCH3D_USE_SUBARRAY
+	// large host-memory messages: exchange directly from/into x[] via MPI
+	// subarray datatypes, skipping the staging copy //
+	if ((mem == memCPU) && (msx > _MPI_EXCH3D_SUBARRAY_MIN_SIZE)) {
+
+#pragma omp master
+		{
+			if (is_west_exch)	// west halo exchange //
+			{
+				MPI_Datatype xsub[2];
+				int pidx = (rank_x > 0) ? rank_x - 1 : size_x - 1;
+
+				get_subarray< T >(&xsub[0], nx, ny, nz,	// west receive //
+					gcx - hx, gcx - 1,
+					gcy - hy, ny - gcy + hy - 1,
+					gcz - hz, nz - gcz + hz - 1);
+				MPI_Irecv(x, 1, xsub[0], pidx, 1, comm_x, &mpi_req[0]);
+
+				get_subarray< T >(&xsub[1], nx, ny, nz,	// west send //
+					gcx, gcx + hx - 1,
+					gcy - hy, ny - gcy + hy - 1,
+					gcz - hz, nz - gcz + hz - 1);
+				MPI_Isend(x, 1, xsub[1], pidx, 0, comm_x, &mpi_req[2]);
+			}
+			if (is_east_exch)	// east halo exchange //
+			{
+				MPI_Datatype xsub[2];
+				int pidx = (rank_x < size_x - 1) ? rank_x + 1 : 0;
+
+				get_subarray< T >(&xsub[0], nx, ny, nz,	// east receive //
+					nx - gcx, nx - gcx + hx - 1,
+					gcy - hy, ny - gcy + hy - 1,
+					gcz - hz, nz - gcz + hz - 1);
+				MPI_Irecv(x, 1, xsub[0], pidx, 0, comm_x, &mpi_req[1]);
+
+				get_subarray< T >(&xsub[1], nx, ny, nz,	// east send //
+					nx - gcx - hx, nx - gcx - 1,
+					gcy - hy, ny - gcy + hy - 1,
+					gcz - hz, nz - gcz + hz - 1);
+				MPI_Isend(x, 1, xsub[1], pidx, 1, comm_x, &mpi_req[3]);
+			}
+		}
+	}
+	else
+#endif
+	{
+		T *(msend_x[2]), *(mrecv_x[2]);          // -x message pointers //
+		for (int k = 0; k < 2; k++) {
+			msend_x[k] = NULL; mrecv_x[k] = NULL;
+		}
+
+		// memory allocation on demand - thread safe //
+		// staging buffer layout (4 * msx elements):
+		// [0] west recv | [msx] east recv | [2*msx] west send | [3*msx] east send //
+		allocate_memory(&exch_x, &exch_size_x, 4 * sizeof(T)* msx);
+
+		if (is_west_exch)	// west halo exchange-recv //
+		{
+			int pidx = (rank_x > 0) ? rank_x - 1 : size_x - 1;
+
+			mrecv_x[0] = (T*)exch_x;
+			msend_x[0] = &((T*)exch_x)[(msx << 1)];
+
+#pragma omp master
+			MPI_Irecv(mrecv_x[0], msx, mpi_type< T >(), pidx, 1, comm_x, &mpi_req[0]);
+
+			get_sub_array<mem, memCPU>(x, nx, ny, nz,
+				gcx, gcx + hx - 1,
+				gcy - hy, ny - gcy + hy - 1,
+				gcz - hz, nz - gcz + hz - 1,
+				msend_x[0]);
+
+#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
+			// paired mode: barrier ensures the packed buffer is complete
+			// before master posts the matching send //
+#pragma omp barrier
+#pragma omp master
+			MPI_Isend(msend_x[0], msx, mpi_type< T >(), pidx, 0, comm_x, &mpi_req[2]);
+#endif
+		}
+		if (is_east_exch)	// east halo exchange-recv //
+		{
+			int pidx = (rank_x < size_x - 1) ? rank_x + 1 : 0;
+
+			mrecv_x[1] = &((T*)exch_x)[msx];
+			msend_x[1] = &((T*)exch_x)[(msx << 1) + msx];
+
+#pragma omp master
+			MPI_Irecv(mrecv_x[1], msx, mpi_type< T >(), pidx, 0, comm_x, &mpi_req[1]);
+
+			get_sub_array<mem, memCPU>(x, nx, ny, nz,
+				nx - gcx - hx, nx - gcx - 1,
+				gcy - hy, ny - gcy + hy - 1,
+				gcz - hz, nz - gcz + hz - 1,
+				msend_x[1]);
+
+#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
+#pragma omp barrier
+#pragma omp master
+			MPI_Isend(msend_x[1], msx, mpi_type< T >(), pidx, 1, comm_x, &mpi_req[3]);
+#endif
+		}
+
+#ifndef MPI_EXCH_KEEP_RECVSEND_PAIR
+		// default mode: pack both halos first; post sends only after the
+		// barrier so the packed buffers are complete //
+#pragma omp barrier
+
+		if (is_west_exch)	// west halo exchange-send //
+		{
+			int pidx = (rank_x > 0) ? rank_x - 1 : size_x - 1;
+
+#pragma omp master
+			MPI_Isend(msend_x[0], msx, mpi_type< T >(), pidx, 0, comm_x, &mpi_req[2]);
+		}
+		if (is_east_exch)	// east halo exchange-send //
+		{
+			int pidx = (rank_x < size_x - 1) ? rank_x + 1 : 0;
+
+#pragma omp master
+			MPI_Isend(msend_x[1], msx, mpi_type< T >(), pidx, 1, comm_x, &mpi_req[3]);
+		}
+#endif
+
+	}
+
+#pragma omp master
+	timer_update(start_time, &cpu_time_exch, &cpu_time_exch_x);
+}
+
+// pop_exchange_halo_x(): complete the -x halo exchange started by
+// push_exchange_halo_x(): wait on all 4 requests, then (in staging-buffer
+// mode) unpack the received west/east halo layers from exch_x into x[]
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::pop_exchange_halo_x(
+	T* x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_x,
+
+	MPI_Request mpi_req[4]) const
+{
+	if (hx == 0) return;	// exit on zero size message //
+	if (size_x == 1) {		// degenerate case //
+
+		if (periodic_x) {	// apply periodicity within local array
+			// push() already copied the periodic layers; keep the thread
+			// barrier so all threads observe the updated halo //
+#pragma omp barrier
+		}
+		return;
+	}
+
+	double start_time;
+#pragma omp master
+	start_time = timer_init();
+
+	// finalize -x MPI exchanges //
+	// ------------------------- //
+#pragma omp master
+	MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
+#pragma omp barrier
+
+	const int msx = hx *
+		(ny - ((gcy - hy) << 1)) * (nz - ((gcz - hz) << 1));    // message size in -x communicator
+
+	// unpack staging buffers unless subarray datatypes received in-place //
+#ifdef _MPI_EXCH3D_USE_SUBARRAY
+	if ((mem == memGPU) || (msx <= _MPI_EXCH3D_SUBARRAY_MIN_SIZE))
+#endif
+	{
+		T *mrecv_x[2];	// -x message pointers //
+						// west halo exchange //
+		if ((rank_x > 0) || ((rank_x == 0) && (periodic_x)))
+		{
+			mrecv_x[0] = (T*)exch_x;
+			put_sub_array<mem, memCPU>(x, nx, ny, nz,
+				gcx - hx, gcx - 1,
+				gcy - hy, ny - gcy + hy - 1,
+				gcz - hz, nz - gcz + hz - 1,
+				mrecv_x[0]);
+		}
+		// east halo exchange //
+		if ((rank_x < size_x - 1) || ((rank_x == size_x - 1) && (periodic_x)))
+		{
+			mrecv_x[1] = &((T*)exch_x)[msx];
+			put_sub_array<mem, memCPU>(x, nx, ny, nz,
+				nx - gcx, nx - gcx + hx - 1,
+				gcy - hy, ny - gcy + hy - 1,
+				gcz - hz, nz - gcz + hz - 1,
+				mrecv_x[1]);
+		}
+
+		// unpacking complete: exch_x may be reused by the next exchange //
+#pragma omp barrier
+	}
+
+#pragma omp master
+	timer_update(start_time, &cpu_time_exch, &cpu_time_exch_x);
+}
+
+// push_exchange_halo_y(): start the non-blocking -y halo exchange of x[]
+// with the south and north neighbors (or apply periodicity locally when
+// there is a single process in -y); completed by pop_exchange_halo_y()
+// called with the same arguments and the same request array
+// request slots: [0] south recv, [1] north recv, [2] south send, [3] north send
+// message tags: 1 = data moving north (+y), 0 = data moving south (-y)
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::push_exchange_halo_y(
+	T* x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_y,
+
+	MPI_Request mpi_req[4]) const
+{
+	// pre-set all slots so pop() may wait on unused ones safely //
+#pragma omp master
+	for (int k = 0; k < 4; k++)
+		mpi_req[k] = MPI_REQUEST_NULL;
+
+	if (hy == 0) return;	// exit on zero size message //
+	if (size_y == 1) {		// degenerate case //
+
+		if (periodic_y) {	// apply periodicity within local array
+			apply_periodic_y<mem>(x, nx, ny, nz,
+				gcx, gcy, gcz, hx, hy, hz);
+		}
+		return;
+	}
+
+	double start_time;
+#pragma omp master
+	start_time = timer_init();
+
+	// a boundary rank exchanges only when -y is periodic //
+	const int is_south_exch = ((rank_y > 0) ||
+		((rank_y == 0) && (periodic_y)));
+	const int is_north_exch = ((rank_y < size_y - 1) ||
+		((rank_y == size_y - 1) && (periodic_y)));
+
+	const int msy = hy *
+		(nx - ((gcx - hx) << 1)) * (nz - ((gcz - hz) << 1));    // message size in -y communicator
+
+																// initialize -y MPI exchanges //
+																// --------------------------- //
+#ifdef _MPI_EXCH3D_USE_SUBARRAY
+	// large host-memory messages: exchange directly from/into x[] via MPI
+	// subarray datatypes, skipping the staging copy //
+	if ((mem == memCPU) && (msy > _MPI_EXCH3D_SUBARRAY_MIN_SIZE)) {
+
+#pragma omp master
+		{
+			if (is_south_exch)	// south halo exchange //
+			{
+				MPI_Datatype ysub[2];
+				int pidx = (rank_y > 0) ? rank_y - 1 : size_y - 1;
+
+				get_subarray< T >(&ysub[0], nx, ny, nz,	// south receive //
+					gcx - hx, nx - gcx + hx - 1,
+					gcy - hy, gcy - 1,
+					gcz - hz, nz - gcz + hz - 1);
+				MPI_Irecv(x, 1, ysub[0], pidx, 1, comm_y, &mpi_req[0]);
+
+				get_subarray< T >(&ysub[1], nx, ny, nz,	// south send //
+					gcx - hx, nx - gcx + hx - 1,
+					gcy, gcy + hy - 1,
+					gcz - hz, nz - gcz + hz - 1);
+				MPI_Isend(x, 1, ysub[1], pidx, 0, comm_y, &mpi_req[2]);
+			}
+			if (is_north_exch)	// north halo exchange //
+			{
+				MPI_Datatype ysub[2];
+				int pidx = (rank_y < size_y - 1) ? rank_y + 1 : 0;
+
+				get_subarray< T >(&ysub[0], nx, ny, nz,	// north receive //
+					gcx - hx, nx - gcx + hx - 1,
+					ny - gcy, ny - gcy + hy - 1,
+					gcz - hz, nz - gcz + hz - 1);
+				MPI_Irecv(x, 1, ysub[0], pidx, 0, comm_y, &mpi_req[1]);
+
+				get_subarray< T >(&ysub[1], nx, ny, nz,	// north send //
+					gcx - hx, nx - gcx + hx - 1,
+					ny - gcy - hy, ny - gcy - 1,
+					gcz - hz, nz - gcz + hz - 1);
+				MPI_Isend(x, 1, ysub[1], pidx, 1, comm_y, &mpi_req[3]);
+			}
+		}
+	}
+	else
+#endif
+	{
+		T *(msend_y[2]), *(mrecv_y[2]);          // -y message pointers //
+		for (int k = 0; k < 2; k++) {
+			msend_y[k] = NULL; mrecv_y[k] = NULL;
+		}
+
+		// memory allocation on demand - thread safe //
+		// staging buffer layout (4 * msy elements):
+		// [0] south recv | [msy] north recv | [2*msy] south send | [3*msy] north send //
+		allocate_memory(&exch_y, &exch_size_y, 4 * sizeof(T)* msy);
+
+		if (is_south_exch)	// south halo exchange-recv //
+		{
+			int pidx = (rank_y > 0) ? rank_y - 1 : size_y - 1;
+
+			mrecv_y[0] = (T*)exch_y;
+			msend_y[0] = &((T*)exch_y)[(msy << 1)];
+
+#pragma omp master
+			MPI_Irecv(mrecv_y[0], msy, mpi_type< T >(), pidx, 1, comm_y, &mpi_req[0]);
+
+			get_sub_array<mem, memCPU>(x, nx, ny, nz,
+				gcx - hx, nx - gcx + hx - 1,
+				gcy, gcy + hy - 1,
+				gcz - hz, nz - gcz + hz - 1,
+				msend_y[0]);
+
+#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
+			// paired mode: barrier ensures the packed buffer is complete
+			// before master posts the matching send //
+#pragma omp barrier
+#pragma omp master
+			MPI_Isend(msend_y[0], msy, mpi_type< T >(), pidx, 0, comm_y, &mpi_req[2]);
+#endif
+		}
+		if (is_north_exch)	// north halo exchange-recv //
+		{
+			int pidx = (rank_y < size_y - 1) ? rank_y + 1 : 0;
+
+			mrecv_y[1] = &((T*)exch_y)[msy];
+			msend_y[1] = &((T*)exch_y)[(msy << 1) + msy];
+
+#pragma omp master
+			MPI_Irecv(mrecv_y[1], msy, mpi_type< T >(), pidx, 0, comm_y, &mpi_req[1]);
+
+			get_sub_array<mem, memCPU>(x, nx, ny, nz,
+				gcx - hx, nx - gcx + hx - 1,
+				ny - gcy - hy, ny - gcy - 1,
+				gcz - hz, nz - gcz + hz - 1,
+				msend_y[1]);
+
+#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
+#pragma omp barrier
+#pragma omp master
+			MPI_Isend(msend_y[1], msy, mpi_type< T >(), pidx, 1, comm_y, &mpi_req[3]);
+#endif
+		}
+
+#ifndef MPI_EXCH_KEEP_RECVSEND_PAIR
+		// default mode: pack both halos first; post sends only after the
+		// barrier so the packed buffers are complete //
+#pragma omp barrier
+
+		if (is_south_exch)	// south halo exchange-send //
+		{
+			int pidx = (rank_y > 0) ? rank_y - 1 : size_y - 1;
+
+#pragma omp master
+			MPI_Isend(msend_y[0], msy, mpi_type< T >(), pidx, 0, comm_y, &mpi_req[2]);
+		}
+		if (is_north_exch)	// north halo exchange-send //
+		{
+			int pidx = (rank_y < size_y - 1) ? rank_y + 1 : 0;
+
+#pragma omp master
+			MPI_Isend(msend_y[1], msy, mpi_type< T >(), pidx, 1, comm_y, &mpi_req[3]);
+		}
+#endif
+	}
+
+#pragma omp master
+	timer_update(start_time, &cpu_time_exch, &cpu_time_exch_y);
+}
+
// Finalize the non-blocking -y halo exchange started by push_exchange_halo_y():
// waits on all four requests, then unpacks the received south/north halo strips
// from the contiguous exchange buffer (exch_y) back into x.
// mpi_req[0..3] = {south recv, north recv, south send, north send}.
// MPI calls run on the OpenMP master thread only; barriers keep worker threads
// from reading halo cells of x before the data has arrived.
template< nse::memType mem, typename T >
void nse::mpiCom3d::pop_exchange_halo_y(
	T* x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz,
	const int periodic_y,

	MPI_Request mpi_req[4]) const
{
	if (hy == 0) return;		// exit on zero size message //
	if (size_y == 1) {			// degenerate case //

		if (periodic_y) {	// periodicity was applied in push; only sync the thread team here
#pragma omp barrier
		}
		return;
	}

	double start_time;
#pragma omp master
	start_time = timer_init();

	// finalize -y MPI exchanges //
	// ------------------------- //
#pragma omp master
	MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
#pragma omp barrier
	// barrier: all threads must see completed receives before unpacking //

	const int msy = hy *
		(nx - ((gcx - hx) << 1)) * (nz - ((gcz - hz) << 1));    // message size in -y communicator //

	// when the push used MPI subarray datatypes (CPU memory, large message),
	// data was received directly into x and no unpack is needed here //
#ifdef _MPI_EXCH3D_USE_SUBARRAY
	if ((mem == memGPU) || (msy <= _MPI_EXCH3D_SUBARRAY_MIN_SIZE))
#endif
	{
		T *(mrecv_y[2]);	// -y message pointers //
							// south halo exchange //
		if ((rank_y > 0) || ((rank_y == 0) && (periodic_y)))
		{
			mrecv_y[0] = (T*)exch_y;	// south recv slot: start of exch_y //
			put_sub_array<mem, memCPU>(x, nx, ny, nz,
				gcx - hx, nx - gcx + hx - 1,
				gcy - hy, gcy - 1,
				gcz - hz, nz - gcz + hz - 1,
				mrecv_y[0]);
		}
		// north halo exchange //
		if ((rank_y < size_y - 1) || ((rank_y == size_y - 1) && (periodic_y)))
		{
			mrecv_y[1] = &((T*)exch_y)[msy];	// north recv slot: second msy-sized chunk //
			put_sub_array<mem, memCPU>(x, nx, ny, nz,
				gcx - hx, nx - gcx + hx - 1,
				ny - gcy, ny - gcy + hy - 1,
				gcz - hz, nz - gcz + hz - 1,
				mrecv_y[1]);
		}

#pragma omp barrier
	}

#pragma omp master
	timer_update(start_time, &cpu_time_exch, &cpu_time_exch_y);
}
+
+
// Start the non-blocking -z halo exchange: posts MPI_Irecv/MPI_Isend for the
// bottom/top halo strips of x; the matching pop_exchange_halo_z() completes it.
// mpi_req[0..3] = {bottom recv, top recv, bottom send, top send}; requests for
// skipped sides stay MPI_REQUEST_NULL so Waitall in pop is safe.
// MPI calls are issued by the OpenMP master thread only; packing
// (get_sub_array) runs on the whole thread team.
template< nse::memType mem, typename T >
void nse::mpiCom3d::push_exchange_halo_z(
	T* x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz,
	const int periodic_z,

	MPI_Request mpi_req[4]) const
{
	// reset all requests up front (sides with no neighbor keep MPI_REQUEST_NULL) //
#pragma omp master
	for (int k = 0; k < 4; k++)
		mpi_req[k] = MPI_REQUEST_NULL;

	if (hz == 0) return;	// exit on zero size message //
	if (size_z == 1) {		// degenerate case //

		if (periodic_z) {	// apply periodicity within local array
			apply_periodic_z<mem>(x, nx, ny, nz,
				gcx, gcy, gcz, hx, hy, hz);
		}
		return;
	}

	double start_time;
#pragma omp master
	start_time = timer_init();

	// exchange on a side if an interior neighbor exists, or wrap-around when periodic //
	const int is_bottom_exch = ((rank_z > 0) ||
		((rank_z == 0) && (periodic_z)));
	const int is_top_exch = ((rank_z < size_z - 1) ||
		((rank_z == size_z - 1) && (periodic_z)));

	const int msz = hz *
		(nx - ((gcx - hx) << 1)) * (ny - ((gcy - hy) << 1));    // message size in -z communicator //

	// initialize -z MPI exchanges //
	// --------------------------- //
	// fast path: CPU memory + large message -> MPI subarray datatypes address x
	// directly, no intermediate pack buffer; master thread does everything //
#ifdef _MPI_EXCH3D_USE_SUBARRAY
	if ((mem == memCPU) && (msz > _MPI_EXCH3D_SUBARRAY_MIN_SIZE)) {

#pragma omp master
		{
			if (is_bottom_exch)	// bottom halo exchange //
			{
				MPI_Datatype zsub[2];
				int pidx = (rank_z > 0) ? rank_z - 1 : size_z - 1;	// wrap for periodic boundary //

				get_subarray< T >(&zsub[0], nx, ny, nz,	// bottom receive //
					gcx - hx, nx - gcx + hx - 1,
					gcy - hy, ny - gcy + hy - 1,
					gcz - hz, gcz - 1);
				MPI_Irecv(x, 1, zsub[0], pidx, 1, comm_z, &mpi_req[0]);

				get_subarray< T >(&zsub[1], nx, ny, nz,	// bottom send //
					gcx - hx, nx - gcx + hx - 1,
					gcy - hy, ny - gcy + hy - 1,
					gcz, gcz + hz - 1);
				MPI_Isend(x, 1, zsub[1], pidx, 0, comm_z, &mpi_req[2]);
				// NOTE(review): zsub handles are not freed here — presumably
				// get_subarray caches committed datatypes; verify to rule out a leak //
			}
			if (is_top_exch)	// top halo exchange //
			{
				MPI_Datatype zsub[2];
				int pidx = (rank_z < size_z - 1) ? rank_z + 1 : 0;	// wrap for periodic boundary //

				get_subarray< T >(&zsub[0], nx, ny, nz,	// top receive //
					gcx - hx, nx - gcx + hx - 1,
					gcy - hy, ny - gcy + hy - 1,
					nz - gcz, nz - gcz + hz - 1);
				MPI_Irecv(x, 1, zsub[0], pidx, 0, comm_z, &mpi_req[1]);

				get_subarray< T >(&zsub[1], nx, ny, nz,	// top send //
					gcx - hx, nx - gcx + hx - 1,
					gcy - hy, ny - gcy + hy - 1,
					nz - gcz - hz, nz - gcz - 1);
				MPI_Isend(x, 1, zsub[1], pidx, 1, comm_z, &mpi_req[3]);
			}
		}
	}
	else
#endif
	{
		// buffered path: pack halo strips into exch_z, layout (msz elements each):
		// [bottom recv | top recv | bottom send | top send] //
		T *(msend_z[2]), *(mrecv_z[2]);          // -z message pointers //
		for (int k = 0; k < 2; k++) {
			msend_z[k] = NULL; mrecv_z[k] = NULL;
		}

		// memory allocation on demand - thread safe //
		allocate_memory(&exch_z, &exch_size_z, 4 * sizeof(T)* msz);

		if (is_bottom_exch)	// bottom halo exchange-recv //
		{
			int pidx = (rank_z > 0) ? rank_z - 1 : size_z - 1;

			mrecv_z[0] = (T*)exch_z;
			msend_z[0] = &((T*)exch_z)[(msz << 1)];	// send slots start at 2*msz //

#pragma omp master
			MPI_Irecv(mrecv_z[0], msz, mpi_type< T >(), pidx, 1, comm_z, &mpi_req[0]);

			get_sub_array<mem, memCPU>(x, nx, ny, nz,
				gcx - hx, nx - gcx + hx - 1,
				gcy - hy, ny - gcy + hy - 1,
				gcz, gcz + hz - 1,
				msend_z[0]);

			// paired mode: send immediately after pack (barrier ensures pack is done) //
#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
#pragma omp barrier
#pragma omp master
			MPI_Isend(msend_z[0], msz, mpi_type< T >(), pidx, 0, comm_z, &mpi_req[2]);
#endif
		}
		if (is_top_exch)	// top halo exchange-recv //
		{
			int pidx = (rank_z < size_z - 1) ? rank_z + 1 : 0;

			mrecv_z[1] = &((T*)exch_z)[msz];
			msend_z[1] = &((T*)exch_z)[(msz << 1) + msz];	// slot at 3*msz //

#pragma omp master
			MPI_Irecv(mrecv_z[1], msz, mpi_type< T >(), pidx, 0, comm_z, &mpi_req[1]);

			get_sub_array<mem, memCPU>(x, nx, ny, nz,
				gcx - hx, nx - gcx + hx - 1,
				gcy - hy, ny - gcy + hy - 1,
				nz - gcz - hz, nz - gcz - 1,
				msend_z[1]);

#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
#pragma omp barrier
#pragma omp master
			MPI_Isend(msend_z[1], msz, mpi_type< T >(), pidx, 1, comm_z, &mpi_req[3]);
#endif
		}

		// unpaired mode: single barrier after both packs, then both sends //
#ifndef MPI_EXCH_KEEP_RECVSEND_PAIR
#pragma omp barrier

		if (is_bottom_exch)	// bottom halo exchange-send //
		{
			int pidx = (rank_z > 0) ? rank_z - 1 : size_z - 1;

#pragma omp master
			MPI_Isend(msend_z[0], msz, mpi_type< T >(), pidx, 0, comm_z, &mpi_req[2]);
		}
		if (is_top_exch)	// top halo exchange-send //
		{
			int pidx = (rank_z < size_z - 1) ? rank_z + 1 : 0;

#pragma omp master
			MPI_Isend(msend_z[1], msz, mpi_type< T >(), pidx, 1, comm_z, &mpi_req[3]);
		}
#endif

	}

#pragma omp master
	timer_update(start_time, &cpu_time_exch, &cpu_time_exch_z);
}
+
// Finalize the non-blocking -z halo exchange started by push_exchange_halo_z():
// waits on all four requests, then unpacks the received bottom/top halo strips
// from the contiguous exchange buffer (exch_z) back into x.
// mpi_req[0..3] = {bottom recv, top recv, bottom send, top send}.
// MPI calls run on the OpenMP master thread only; barriers keep worker threads
// from reading halo cells of x before the data has arrived.
template< nse::memType mem, typename T >
void nse::mpiCom3d::pop_exchange_halo_z(
	T* x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz,
	const int periodic_z,

	MPI_Request mpi_req[4]) const
{
	if (hz == 0) return;		// exit on zero size message //
	if (size_z == 1) {			// degenerate case //

		if (periodic_z) {	// periodicity was applied in push; only sync the thread team here
#pragma omp barrier
		}
		return;
	}

	double start_time;
#pragma omp master
	start_time = timer_init();

	// finalize -z MPI exchanges //
	// ------------------------- //
#pragma omp master
	MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
#pragma omp barrier
	// barrier: all threads must see completed receives before unpacking //

	const int msz = hz *
		(nx - ((gcx - hx) << 1)) * (ny - ((gcy - hy) << 1));    // message size in -z communicator //

	// when the push used MPI subarray datatypes (CPU memory, large message),
	// data was received directly into x and no unpack is needed here //
#ifdef _MPI_EXCH3D_USE_SUBARRAY
	if ((mem == memGPU) || (msz <= _MPI_EXCH3D_SUBARRAY_MIN_SIZE))
#endif
	{
		T *(mrecv_z[2]);	// -z message pointers //
							// bottom halo exchange //
		if ((rank_z > 0) || ((rank_z == 0) && (periodic_z)))
		{
			mrecv_z[0] = (T*)exch_z;	// bottom recv slot: start of exch_z //
			put_sub_array<mem, memCPU>(x, nx, ny, nz,
				gcx - hx, nx - gcx + hx - 1,
				gcy - hy, ny - gcy + hy - 1,
				gcz - hz, gcz - 1,
				mrecv_z[0]);
		}
		// top halo exchange //
		if ((rank_z < size_z - 1) || ((rank_z == size_z - 1) && (periodic_z)))
		{
			mrecv_z[1] = &((T*)exch_z)[msz];	// top recv slot: second msz-sized chunk //
			put_sub_array<mem, memCPU>(x, nx, ny, nz,
				gcx - hx, nx - gcx + hx - 1,
				gcy - hy, ny - gcy + hy - 1,
				nz - gcz, nz - gcz + hz - 1,
				mrecv_z[1]);
		}

#pragma omp barrier
	}

#pragma omp master
	timer_update(start_time, &cpu_time_exch, &cpu_time_exch_z);
}
+
// Start the non-blocking -x halo exchange for colored (red-black style) cells
// only: packs just the cells matching [color] on the west/east halo strips and
// posts MPI_Irecv/MPI_Isend; pop_exchange_color_halo_x() completes the exchange.
// mpi_req[0..3] = {west recv, east recv, west send, east send}; requests for
// skipped sides stay MPI_REQUEST_NULL so Waitall in pop is safe.
// Send/recv sizes may differ per side because the colored-cell count depends on
// the strip's index parity (get_num_colored).
template< nse::memType mem, typename T >
void nse::mpiCom3d::push_exchange_color_halo_x(
	T* x,
	const int color,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz,
	const int periodic_x,

	MPI_Request mpi_req[4]) const
{
	// reset all requests up front (sides with no neighbor keep MPI_REQUEST_NULL) //
#pragma omp master
	for (int k = 0; k < 4; k++)
		mpi_req[k] = MPI_REQUEST_NULL;

	if (hx == 0) return;	// exit on zero size message //
	if (size_x == 1) {		// degenerate case //

		if (periodic_x) {	// apply -x periodicity within local array
			apply_periodic_x<mem>(x, color, nx, ny, nz,
				gcx, gcy, gcz, hx, hy, hz);
		}
		return;
	}

	double start_time;
#pragma omp master
	start_time = timer_init();

	// initialize -x MPI exchanges //
	// --------------------------- //

	T *(msend_x[2]), *(mrecv_x[2]);          // -x message pointers //
	for (int k = 0; k < 2; k++) {
		msend_x[k] = NULL; mrecv_x[k] = NULL;
	}

	// exchange on a side if an interior neighbor exists, or wrap-around when periodic //
	const int is_west_exch = ((rank_x > 0) ||
		((rank_x == 0) && (periodic_x)));
	const int is_east_exch = ((rank_x < size_x - 1) ||
		((rank_x == size_x - 1) && (periodic_x)));

	// message size in -x communicator: number of colored cells in each strip //
	const int msx_send_west = get_num_colored(color,
		gcx, gcx + hx - 1, gcy - hy, ny - gcy + hy - 1, gcz - hz, nz - gcz + hz - 1);
	const int msx_send_east = get_num_colored(color,
		nx - gcx - hx, nx - gcx - 1, gcy - hy, ny - gcy + hy - 1, gcz - hz, nz - gcz + hz - 1);
	const int msx_recv_west = get_num_colored(color,
		gcx - hx, gcx - 1, gcy - hy, ny - gcy + hy - 1, gcz - hz, nz - gcz + hz - 1);
	const int msx_recv_east = get_num_colored(color,
		nx - gcx, nx - gcx + hx - 1, gcy - hy, ny - gcy + hy - 1, gcz - hz, nz - gcz + hz - 1);

	// send-receive message sizes in -x communicator //
	const int msx_recv = msx_recv_west + msx_recv_east;
	const int msx_send = msx_send_west + msx_send_east;

	// memory allocation on demand - thread safe //
	// exch_x layout: [west recv | east recv | west send | east send] //
	allocate_memory(&exch_x, &exch_size_x, sizeof(T)* (msx_recv + msx_send));

	if (is_west_exch)	// west halo exchange-recv //
	{
		int pidx = (rank_x > 0) ? rank_x - 1 : size_x - 1;	// wrap for periodic boundary //

		mrecv_x[0] = (T*)exch_x;
		msend_x[0] = &((T*)exch_x)[msx_recv];	// send slots follow both recv slots //

#pragma omp master
		MPI_Irecv(mrecv_x[0], msx_recv_west, mpi_type< T >(), pidx, 1, comm_x, &mpi_req[0]);

		get_sub_array<mem, memCPU>(x, color, nx, ny, nz,
			gcx, gcx + hx - 1,
			gcy - hy, ny - gcy + hy - 1,
			gcz - hz, nz - gcz + hz - 1,
			msend_x[0]);

		// paired mode: send immediately after pack (barrier ensures pack is done) //
#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
#pragma omp barrier
#pragma omp master
		MPI_Isend(msend_x[0], msx_send_west, mpi_type< T >(), pidx, 0, comm_x, &mpi_req[2]);
#endif
	}
	if (is_east_exch)	// east halo exchange-recv //
	{
		int pidx = (rank_x < size_x - 1) ? rank_x + 1 : 0;	// wrap for periodic boundary //

		mrecv_x[1] = &((T*)exch_x)[msx_recv_west];
		msend_x[1] = &((T*)exch_x)[msx_recv + msx_send_west];

#pragma omp master
		MPI_Irecv(mrecv_x[1], msx_recv_east, mpi_type< T >(), pidx, 0, comm_x, &mpi_req[1]);

		get_sub_array<mem, memCPU>(x, color, nx, ny, nz,
			nx - gcx - hx, nx - gcx - 1,
			gcy - hy, ny - gcy + hy - 1,
			gcz - hz, nz - gcz + hz - 1,
			msend_x[1]);

#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
#pragma omp barrier
#pragma omp master
		MPI_Isend(msend_x[1], msx_send_east, mpi_type< T >(), pidx, 1, comm_x, &mpi_req[3]);
#endif
	}

	// unpaired mode: single barrier after both packs, then both sends //
#ifndef MPI_EXCH_KEEP_RECVSEND_PAIR
#pragma omp barrier

	if (is_west_exch)	// west halo exchange-send //
	{
		int pidx = (rank_x > 0) ? rank_x - 1 : size_x - 1;

#pragma omp master
		MPI_Isend(msend_x[0], msx_send_west, mpi_type< T >(), pidx, 0, comm_x, &mpi_req[2]);
	}
	if (is_east_exch)	// east halo exchange-send //
	{
		int pidx = (rank_x < size_x - 1) ? rank_x + 1 : 0;

#pragma omp master
		MPI_Isend(msend_x[1], msx_send_east, mpi_type< T >(), pidx, 1, comm_x, &mpi_req[3]);
	}
#endif

#pragma omp master
	timer_update(start_time, &cpu_time_exch, &cpu_time_exch_x);
}
+
// Finalize the non-blocking colored -x halo exchange started by
// push_exchange_color_halo_x(): waits on all four requests, then scatters the
// received colored cells from exch_x back into the west/east halo strips of x.
// Recomputes the colored recv counts with the same formulas as the push so the
// buffer offsets match.
template< nse::memType mem, typename T >
void nse::mpiCom3d::pop_exchange_color_halo_x(
	T* x,
	const int color,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz,
	const int periodic_x,

	MPI_Request mpi_req[4]) const
{
	if (hx == 0) return;		// exit on zero size message //
	if (size_x == 1) {			// degenerate case //

		if (periodic_x) {	// periodicity was applied in push; only sync the thread team here
#pragma omp barrier
		}
		return;
	}

	double start_time;
#pragma omp master
	start_time = timer_init();

	// message size in -x communicator (must match the push-side layout) //
	const int msx_recv_west = get_num_colored(color,
		gcx - hx, gcx - 1, gcy - hy, ny - gcy + hy - 1, gcz - hz, nz - gcz + hz - 1);
	const int msx_recv_east = get_num_colored(color,
		nx - gcx, nx - gcx + hx - 1, gcy - hy, ny - gcy + hy - 1, gcz - hz, nz - gcz + hz - 1);

	// finalize -x MPI exchanges //
	// ------------------------- //
#pragma omp master
	MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
#pragma omp barrier
	// barrier: all threads must see completed receives before unpacking //

	T *(mrecv_x[2]);	// -x message pointers //
						// west halo exchange //
	if ((rank_x > 0) || ((rank_x == 0) && (periodic_x)))
	{
		mrecv_x[0] = (T*)exch_x;
		put_sub_array<mem, memCPU>(x, color, nx, ny, nz,
			gcx - hx, gcx - 1,
			gcy - hy, ny - gcy + hy - 1,
			gcz - hz, nz - gcz + hz - 1,
			mrecv_x[0]);
	}
	// east halo exchange //
	if ((rank_x < size_x - 1) || ((rank_x == size_x - 1) && (periodic_x)))
	{
		mrecv_x[1] = &((T*)exch_x)[msx_recv_west];	// east recv slot follows west recv //
		put_sub_array<mem, memCPU>(x, color, nx, ny, nz,
			nx - gcx, nx - gcx + hx - 1,
			gcy - hy, ny - gcy + hy - 1,
			gcz - hz, nz - gcz + hz - 1,
			mrecv_x[1]);
	}

#pragma omp barrier

#pragma omp master
	timer_update(start_time, &cpu_time_exch, &cpu_time_exch_x);
}
+
// Start the non-blocking -y halo exchange for colored (red-black style) cells
// only: packs just the cells matching [color] on the south/north halo strips
// and posts MPI_Irecv/MPI_Isend; pop_exchange_color_halo_y() completes it.
// mpi_req[0..3] = {south recv, north recv, south send, north send}; requests
// for skipped sides stay MPI_REQUEST_NULL so Waitall in pop is safe.
template< nse::memType mem, typename T >
void nse::mpiCom3d::push_exchange_color_halo_y(
	T* x,
	const int color,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz,
	const int periodic_y,

	MPI_Request mpi_req[4]) const
{
	// reset all requests up front (sides with no neighbor keep MPI_REQUEST_NULL) //
#pragma omp master
	for (int k = 0; k < 4; k++)
		mpi_req[k] = MPI_REQUEST_NULL;

	if (hy == 0) return;	// exit on zero size message //
	if (size_y == 1) {		// degenerate case //

		if (periodic_y) {	// apply -y periodicity within local array
			apply_periodic_y<mem>(x, color, nx, ny, nz,
				gcx, gcy, gcz, hx, hy, hz);
		}
		return;
	}

	double start_time;
#pragma omp master
	start_time = timer_init();

	// initialize -y MPI exchanges //
	// --------------------------- //

	T *(msend_y[2]), *(mrecv_y[2]);          // -y message pointers //
	for (int k = 0; k < 2; k++) {
		msend_y[k] = NULL; mrecv_y[k] = NULL;
	}

	// exchange on a side if an interior neighbor exists, or wrap-around when periodic //
	const int is_south_exch = ((rank_y > 0) ||
		((rank_y == 0) && (periodic_y)));
	const int is_north_exch = ((rank_y < size_y - 1) ||
		((rank_y == size_y - 1) && (periodic_y)));

	// message size in -y communicator: number of colored cells in each strip //
	const int msy_send_south = get_num_colored(color,
		gcx - hx, nx - gcx + hx - 1, gcy, gcy + hy - 1, gcz - hz, nz - gcz + hz - 1);
	const int msy_send_north = get_num_colored(color,
		gcx - hx, nx - gcx + hx - 1, ny - gcy - hy, ny - gcy - 1, gcz - hz, nz - gcz + hz - 1);
	const int msy_recv_south = get_num_colored(color,
		gcx - hx, nx - gcx + hx - 1, gcy - hy, gcy - 1, gcz - hz, nz - gcz + hz - 1);
	const int msy_recv_north = get_num_colored(color,
		gcx - hx, nx - gcx + hx - 1, ny - gcy, ny - gcy + hy - 1, gcz - hz, nz - gcz + hz - 1);

	// send-receive message sizes in -y communicator //
	const int msy_recv = msy_recv_south + msy_recv_north;
	const int msy_send = msy_send_south + msy_send_north;

	// memory allocation on demand - thread safe //
	// exch_y layout: [south recv | north recv | south send | north send] //
	allocate_memory(&exch_y, &exch_size_y, sizeof(T)* (msy_recv + msy_send));

	if (is_south_exch)	// south halo exchange-recv //
	{
		int pidx = (rank_y > 0) ? rank_y - 1 : size_y - 1;	// wrap for periodic boundary //

		mrecv_y[0] = (T*)exch_y;
		msend_y[0] = &((T*)exch_y)[msy_recv];	// send slots follow both recv slots //

#pragma omp master
		MPI_Irecv(mrecv_y[0], msy_recv_south, mpi_type< T >(), pidx, 1, comm_y, &mpi_req[0]);

		get_sub_array<mem, memCPU>(x, color, nx, ny, nz,
			gcx - hx, nx - gcx + hx - 1,
			gcy, gcy + hy - 1,
			gcz - hz, nz - gcz + hz - 1,
			msend_y[0]);

		// paired mode: send immediately after pack (barrier ensures pack is done) //
#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
#pragma omp barrier
#pragma omp master
		MPI_Isend(msend_y[0], msy_send_south, mpi_type< T >(), pidx, 0, comm_y, &mpi_req[2]);
#endif
	}
	if (is_north_exch)	// north halo exchange-recv //
	{
		int pidx = (rank_y < size_y - 1) ? rank_y + 1 : 0;	// wrap for periodic boundary //

		mrecv_y[1] = &((T*)exch_y)[msy_recv_south];
		msend_y[1] = &((T*)exch_y)[msy_recv + msy_send_south];

#pragma omp master
		MPI_Irecv(mrecv_y[1], msy_recv_north, mpi_type< T >(), pidx, 0, comm_y, &mpi_req[1]);

		get_sub_array<mem, memCPU>(x, color, nx, ny, nz,
			gcx - hx, nx - gcx + hx - 1,
			ny - gcy - hy, ny - gcy - 1,
			gcz - hz, nz - gcz + hz - 1,
			msend_y[1]);

#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
#pragma omp barrier
#pragma omp master
		MPI_Isend(msend_y[1], msy_send_north, mpi_type< T >(), pidx, 1, comm_y, &mpi_req[3]);
#endif
	}

	// unpaired mode: single barrier after both packs, then both sends //
#ifndef MPI_EXCH_KEEP_RECVSEND_PAIR
#pragma omp barrier

	if (is_south_exch)	// south halo exchange-send //
	{
		int pidx = (rank_y > 0) ? rank_y - 1 : size_y - 1;

#pragma omp master
		MPI_Isend(msend_y[0], msy_send_south, mpi_type< T >(), pidx, 0, comm_y, &mpi_req[2]);
	}
	if (is_north_exch)	// north halo exchange-send //
	{
		int pidx = (rank_y < size_y - 1) ? rank_y + 1 : 0;

#pragma omp master
		MPI_Isend(msend_y[1], msy_send_north, mpi_type< T >(), pidx, 1, comm_y, &mpi_req[3]);
	}
#endif

#pragma omp master
	timer_update(start_time, &cpu_time_exch, &cpu_time_exch_y);
}
+
// Finalize the non-blocking colored -y halo exchange started by
// push_exchange_color_halo_y(): waits on all four requests, then scatters the
// received colored cells from exch_y back into the south/north halo strips of x.
// Recomputes the colored recv counts with the same formulas as the push so the
// buffer offsets match.
template< nse::memType mem, typename T >
void nse::mpiCom3d::pop_exchange_color_halo_y(
	T* x,
	const int color,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz,
	const int periodic_y,

	MPI_Request mpi_req[4]) const
{
	if (hy == 0) return;		// exit on zero size message //
	if (size_y == 1) {			// degenerate case //

		if (periodic_y) {	// periodicity was applied in push; only sync the thread team here
#pragma omp barrier
		}
		return;
	}

	double start_time;
#pragma omp master
	start_time = timer_init();

	// message size in -y communicator (must match the push-side layout) //
	const int msy_recv_south = get_num_colored(color,
		gcx - hx, nx - gcx + hx - 1, gcy - hy, gcy - 1, gcz - hz, nz - gcz + hz - 1);
	const int msy_recv_north = get_num_colored(color,
		gcx - hx, nx - gcx + hx - 1, ny - gcy, ny - gcy + hy - 1, gcz - hz, nz - gcz + hz - 1);

	// finalize -y MPI exchanges //
	// ------------------------- //
#pragma omp master
	MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
#pragma omp barrier
	// barrier: all threads must see completed receives before unpacking //

	T *(mrecv_y[2]);	// -y message pointers //
						// south halo exchange //
	if ((rank_y > 0) || ((rank_y == 0) && (periodic_y)))
	{
		mrecv_y[0] = (T*)exch_y;
		put_sub_array<mem, memCPU>(x, color, nx, ny, nz,
			gcx - hx, nx - gcx + hx - 1,
			gcy - hy, gcy - 1,
			gcz - hz, nz - gcz + hz - 1,
			mrecv_y[0]);
	}
	// north halo exchange //
	if ((rank_y < size_y - 1) || ((rank_y == size_y - 1) && (periodic_y)))
	{
		mrecv_y[1] = &((T*)exch_y)[msy_recv_south];	// north recv slot follows south recv //
		put_sub_array<mem, memCPU>(x, color, nx, ny, nz,
			gcx - hx, nx - gcx + hx - 1,
			ny - gcy, ny - gcy + hy - 1,
			gcz - hz, nz - gcz + hz - 1,
			mrecv_y[1]);
	}

#pragma omp barrier

#pragma omp master
	timer_update(start_time, &cpu_time_exch, &cpu_time_exch_y);
}
+
// Start the non-blocking -z halo exchange for colored (red-black style) cells
// only: packs just the cells matching [color] on the bottom/top halo strips
// and posts MPI_Irecv/MPI_Isend; pop_exchange_color_halo_z() completes it.
// mpi_req[0..3] = {bottom recv, top recv, bottom send, top send}; requests
// for skipped sides stay MPI_REQUEST_NULL so Waitall in pop is safe.
template< nse::memType mem, typename T >
void nse::mpiCom3d::push_exchange_color_halo_z(
	T* x,
	const int color,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz,
	const int periodic_z,

	MPI_Request mpi_req[4]) const
{
	// reset all requests up front (sides with no neighbor keep MPI_REQUEST_NULL) //
#pragma omp master
	for (int k = 0; k < 4; k++)
		mpi_req[k] = MPI_REQUEST_NULL;

	if (hz == 0) return;	// exit on zero size message //
	if (size_z == 1) {		// degenerate case //

		if (periodic_z) {	// apply periodicity within local array
			apply_periodic_z<mem>(x, color, nx, ny, nz,
				gcx, gcy, gcz, hx, hy, hz);
		}
		return;
	}

	double start_time;
#pragma omp master
	start_time = timer_init();

	// initialize -z MPI exchanges //
	// --------------------------- //

	T *(msend_z[2]), *(mrecv_z[2]);          // -z message pointers //
	for (int k = 0; k < 2; k++) {
		msend_z[k] = NULL; mrecv_z[k] = NULL;
	}

	// exchange on a side if an interior neighbor exists, or wrap-around when periodic //
	const int is_bottom_exch = ((rank_z > 0) ||
		((rank_z == 0) && (periodic_z)));
	const int is_top_exch = ((rank_z < size_z - 1) ||
		((rank_z == size_z - 1) && (periodic_z)));

	// message size in -z communicator: number of colored cells in each strip //
	const int msz_send_bottom = get_num_colored(color,
		gcx - hx, nx - gcx + hx - 1, gcy - hy, ny - gcy + hy - 1, gcz, gcz + hz - 1);
	const int msz_send_top = get_num_colored(color,
		gcx - hx, nx - gcx + hx - 1, gcy - hy, ny - gcy + hy - 1, nz - gcz - hz, nz - gcz - 1);
	const int msz_recv_bottom = get_num_colored(color,
		gcx - hx, nx - gcx + hx - 1, gcy - hy, ny - gcy + hy - 1, gcz - hz, gcz - 1);
	const int msz_recv_top = get_num_colored(color,
		gcx - hx, nx - gcx + hx - 1, gcy - hy, ny - gcy + hy - 1, nz - gcz, nz - gcz + hz - 1);

	// send-receive message sizes in -z communicator //
	const int msz_recv = msz_recv_bottom + msz_recv_top;
	const int msz_send = msz_send_bottom + msz_send_top;

	// memory allocation on demand - thread safe //
	// exch_z layout: [bottom recv | top recv | bottom send | top send] //
	allocate_memory(&exch_z, &exch_size_z, sizeof(T)* (msz_recv + msz_send));

	if (is_bottom_exch)	// bottom halo exchange-recv //
	{
		int pidx = (rank_z > 0) ? rank_z - 1 : size_z - 1;	// wrap for periodic boundary //

		mrecv_z[0] = (T*)exch_z;
		msend_z[0] = &((T*)exch_z)[msz_recv];	// send slots follow both recv slots //

#pragma omp master
		MPI_Irecv(mrecv_z[0], msz_recv_bottom, mpi_type< T >(), pidx, 1, comm_z, &mpi_req[0]);

		get_sub_array<mem, memCPU>(x, color, nx, ny, nz,
			gcx - hx, nx - gcx + hx - 1,
			gcy - hy, ny - gcy + hy - 1,
			gcz, gcz + hz - 1,
			msend_z[0]);

		// paired mode: send immediately after pack (barrier ensures pack is done) //
#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
#pragma omp barrier
#pragma omp master
		MPI_Isend(msend_z[0], msz_send_bottom, mpi_type< T >(), pidx, 0, comm_z, &mpi_req[2]);
#endif
	}
	if (is_top_exch)	// top halo exchange-recv //
	{
		int pidx = (rank_z < size_z - 1) ? rank_z + 1 : 0;	// wrap for periodic boundary //

		mrecv_z[1] = &((T*)exch_z)[msz_recv_bottom];
		msend_z[1] = &((T*)exch_z)[msz_recv + msz_send_bottom];

#pragma omp master
		MPI_Irecv(mrecv_z[1], msz_recv_top, mpi_type< T >(), pidx, 0, comm_z, &mpi_req[1]);

		get_sub_array<mem, memCPU>(x, color, nx, ny, nz,
			gcx - hx, nx - gcx + hx - 1,
			gcy - hy, ny - gcy + hy - 1,
			nz - gcz - hz, nz - gcz - 1,
			msend_z[1]);

#ifdef MPI_EXCH_KEEP_RECVSEND_PAIR
#pragma omp barrier
#pragma omp master
		MPI_Isend(msend_z[1], msz_send_top, mpi_type< T >(), pidx, 1, comm_z, &mpi_req[3]);
#endif
	}

	// unpaired mode: single barrier after both packs, then both sends //
#ifndef MPI_EXCH_KEEP_RECVSEND_PAIR
#pragma omp barrier

	if (is_bottom_exch)	// bottom halo exchange-send //
	{
		int pidx = (rank_z > 0) ? rank_z - 1 : size_z - 1;

#pragma omp master
		MPI_Isend(msend_z[0], msz_send_bottom, mpi_type< T >(), pidx, 0, comm_z, &mpi_req[2]);
	}
	if (is_top_exch)	// top halo exchange-send //
	{
		int pidx = (rank_z < size_z - 1) ? rank_z + 1 : 0;

#pragma omp master
		MPI_Isend(msend_z[1], msz_send_top, mpi_type< T >(), pidx, 1, comm_z, &mpi_req[3]);
	}
#endif

#pragma omp master
	timer_update(start_time, &cpu_time_exch, &cpu_time_exch_z);
}
+
// Finalize the non-blocking colored -z halo exchange started by
// push_exchange_color_halo_z(): waits on all four requests, then scatters the
// received colored cells from exch_z back into the bottom/top halo strips of x.
// Recomputes the colored recv counts with the same formulas as the push so the
// buffer offsets match.
template< nse::memType mem, typename T >
void nse::mpiCom3d::pop_exchange_color_halo_z(
	T* x,
	const int color,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz,
	const int periodic_z,

	MPI_Request mpi_req[4]) const
{
	if (hz == 0) return;		// exit on zero size message //
	if (size_z == 1) {			// degenerate case //

		if (periodic_z) {	// periodicity was applied in push; only sync the thread team here
#pragma omp barrier
		}
		return;
	}

	double start_time;
#pragma omp master
	start_time = timer_init();

	// message size in -z communicator (must match the push-side layout) //
	const int msz_recv_bottom = get_num_colored(color,
		gcx - hx, nx - gcx + hx - 1, gcy - hy, ny - gcy + hy - 1, gcz - hz, gcz - 1);
	const int msz_recv_top = get_num_colored(color,
		gcx - hx, nx - gcx + hx - 1, gcy - hy, ny - gcy + hy - 1, nz - gcz, nz - gcz + hz - 1);

	// finalize -z MPI exchanges //
	// ------------------------- //
#pragma omp master
	MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
#pragma omp barrier
	// barrier: all threads must see completed receives before unpacking //

	T *(mrecv_z[2]);	// -z message pointers //
						// bottom halo exchange //
	if ((rank_z > 0) || ((rank_z == 0) && (periodic_z)))
	{
		mrecv_z[0] = (T*)exch_z;
		put_sub_array<mem, memCPU>(x, color, nx, ny, nz,
			gcx - hx, nx - gcx + hx - 1,
			gcy - hy, ny - gcy + hy - 1,
			gcz - hz, gcz - 1,
			mrecv_z[0]);
	}
	// top halo exchange //
	if ((rank_z < size_z - 1) || ((rank_z == size_z - 1) && (periodic_z)))
	{
		mrecv_z[1] = &((T*)exch_z)[msz_recv_bottom];	// top recv slot follows bottom recv //
		put_sub_array<mem, memCPU>(x, color, nx, ny, nz,
			gcx - hx, nx - gcx + hx - 1,
			gcy - hy, ny - gcy + hy - 1,
			nz - gcz, nz - gcz + hz - 1,
			mrecv_z[1]);
	}

#pragma omp barrier

#pragma omp master
	timer_update(start_time, &cpu_time_exch, &cpu_time_exch_z);
}
+
// Driver for the full 3D halo exchange: runs the x, y, z stages in order.
// After each push, a non-blocking test (test_exchange) checks whether the
// messages have already completed; if not, the function returns with *status
// recording the pending stage (1 = x, 2 = y, 3 = z) so the caller can overlap
// computation and later resume via ping_exchange_halo()/pop_exchange_halo().
// *status == 0 on return means all exchanges completed here.
template< nse::memType mem, typename T >
void nse::mpiCom3d::push_exchange_halo(
	T* x,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz,
	const int periodic_x, const int periodic_y, const int periodic_z,

	MPI_Request mpi_req[4], int* status) const
{
	static int check;	// using static to make variable shared //
						// NOTE(review): shared across concurrent calls of the same template
						// instantiation — assumes one exchange in flight per thread team; verify //

						// -x exchange //
	(*status) = 1;
	push_exchange_halo_x<mem>(x, nx, ny, nz,
		gcx, gcy, gcz, hx, hy, hz, periodic_x, mpi_req);

	// master tests completion; barrier publishes [check] to all threads //
#pragma omp master
	check = test_exchange(mpi_req, 4);
#pragma omp barrier
	if (!check) return;	// -x exchange signal //
	pop_exchange_halo_x<mem>(x, nx, ny, nz,
		gcx, gcy, gcz, hx, hy, hz, periodic_x, mpi_req);

	// -y exchange //
	(*status) = 2;
	push_exchange_halo_y<mem>(x, nx, ny, nz,
		gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);

#pragma omp master
	check = test_exchange(mpi_req, 4);
#pragma omp barrier
	if (!check) return;	// -y exchange signal //
	pop_exchange_halo_y<mem>(x, nx, ny, nz,
		gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);

	// -z exchange //
	(*status) = 3;
	push_exchange_halo_z<mem>(x, nx, ny, nz,
		gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);

#pragma omp master
	check = test_exchange(mpi_req, 4);
#pragma omp barrier
	if (!check) return;	// -z exchange signal //
	pop_exchange_halo_z<mem>(x, nx, ny, nz,
		gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);

	(*status) = 0;		// exchanges done //
#pragma omp master
	for (int k = 0; k < 4; k++)
		mpi_req[k] = MPI_REQUEST_NULL;
}
+
// Resume a halo exchange started by push_exchange_halo(): *status selects the
// pending stage (1 = x, 2 = y, 3 = z). Each stage is finished only if its
// messages have completed (test_exchange); otherwise the function returns with
// *status unchanged so the caller can ping again later. Stages fall through:
// completing -x immediately starts -y, and so on. *status == 0 means done.
template< nse::memType mem, typename T >
void nse::mpiCom3d::ping_exchange_halo(
	T* x,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz,
	const int periodic_x, const int periodic_y, const int periodic_z,

	MPI_Request mpi_req[4], int* status) const
{
	static int check;	// using static to make variable shared //

#pragma omp barrier
						// using barrier to prevent races for [check]

	if ((*status) == 1) {	// -x exchange processing //

#pragma omp master
		check = test_exchange(mpi_req, 4);
#pragma omp barrier
		if (!check) return;	// -x exchange signal //
		pop_exchange_halo_x<mem>(x, nx, ny, nz,
			gcx, gcy, gcz, hx, hy, hz, periodic_x, mpi_req);

		// - y exchange //
		(*status) = 2;
		push_exchange_halo_y<mem>(x, nx, ny, nz,
			gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);

	}
	if ((*status) == 2) {	// -y exchange processing //

#pragma omp master
		check = test_exchange(mpi_req, 4);
#pragma omp barrier
		if (!check) return;	// -y exchange signal //
		pop_exchange_halo_y<mem>(x, nx, ny, nz,
			gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);

		// -z exchange //
		(*status) = 3;
		push_exchange_halo_z<mem>(x, nx, ny, nz,
			gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);
	}
	if ((*status) == 3) {	// -z exchange processing //

#pragma omp master
		check = test_exchange(mpi_req, 4);
#pragma omp barrier
		if (!check) return;	// -z exchange signal //
		pop_exchange_halo_z<mem>(x, nx, ny, nz,
			gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);
	}

	(*status) = 0;		// exchanges done //
#pragma omp master
	for (int k = 0; k < 4; k++)
		mpi_req[k] = MPI_REQUEST_NULL;
}
+
// Complete a halo exchange started by push_exchange_halo(), blocking until
// done: unlike ping_exchange_halo() there is no completion test — each pending
// stage (selected by *status: 1 = x, 2 = y, 3 = z) is finished with a blocking
// pop, then the next stage is pushed and popped in turn. Always leaves
// *status == 0 and all requests reset to MPI_REQUEST_NULL.
template< nse::memType mem, typename T >
void nse::mpiCom3d::pop_exchange_halo(
	T* x,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz,
	const int periodic_x, const int periodic_y, const int periodic_z,

	MPI_Request mpi_req[4], int* status) const
{
	if ((*status) == 1) {	// -x exchange processing //

		pop_exchange_halo_x<mem>(x, nx, ny, nz,
			gcx, gcy, gcz, hx, hy, hz, periodic_x, mpi_req);

		// - y exchange //
		(*status) = 2;
		push_exchange_halo_y<mem>(x, nx, ny, nz,
			gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);
	}
	if ((*status) == 2) {	// -y exchange processing //

		pop_exchange_halo_y<mem>(x, nx, ny, nz,
			gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);

		// -z exchange //
		(*status) = 3;
		push_exchange_halo_z<mem>(x, nx, ny, nz,
			gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);
	}
	if ((*status) == 3) {	// -z exchange processing //

		pop_exchange_halo_z<mem>(x, nx, ny, nz,
			gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);
	}

	(*status) = 0;		// exchanges done //

#pragma omp master
	for (int k = 0; k < 4; k++)
		mpi_req[k] = MPI_REQUEST_NULL;
}
+
// Driver for the full 3D colored (red-black style) halo exchange: runs the
// x, y, z colored stages in order. After each push, test_exchange checks for
// early completion; if the messages are still in flight the function returns
// with *status recording the pending stage (1 = x, 2 = y, 3 = z) so the caller
// can overlap computation and resume via ping/pop_exchange_color_halo().
// *status == 0 on return means all exchanges completed here.
template< nse::memType mem, typename T >
void nse::mpiCom3d::push_exchange_color_halo(
	T* x,
	const int color,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int hx, const int hy, const int hz,
	const int periodic_x, const int periodic_y, const int periodic_z,

	MPI_Request mpi_req[4], int* status) const
{
	static int check;	// using static to make variable shared //
						// NOTE(review): shared across concurrent calls of the same template
						// instantiation — assumes one exchange in flight per thread team; verify //

						// -x exchange //
	(*status) = 1;
	push_exchange_color_halo_x<mem>(x, color, nx, ny, nz,
		gcx, gcy, gcz, hx, hy, hz, periodic_x, mpi_req);

	// master tests completion; barrier publishes [check] to all threads //
#pragma omp master
	check = test_exchange(mpi_req, 4);
#pragma omp barrier
	if (!check) return;	// -x exchange signal //
	pop_exchange_color_halo_x<mem>(x, color, nx, ny, nz,
		gcx, gcy, gcz, hx, hy, hz, periodic_x, mpi_req);

	// -y exchange //
	(*status) = 2;
	push_exchange_color_halo_y<mem>(x, color, nx, ny, nz,
		gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);

#pragma omp master
	check = test_exchange(mpi_req, 4);
#pragma omp barrier
	if (!check) return;	// -y exchange signal //
	pop_exchange_color_halo_y<mem>(x, color, nx, ny, nz,
		gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);

	// -z exchange //
	(*status) = 3;
	push_exchange_color_halo_z<mem>(x, color, nx, ny, nz,
		gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);

#pragma omp master
	check = test_exchange(mpi_req, 4);
#pragma omp barrier
	if (!check) return;	// -z exchange signal //
	pop_exchange_color_halo_z<mem>(x, color, nx, ny, nz,
		gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);

	(*status) = 0;		// exchanges done //

#pragma omp master
	for (int k = 0; k < 4; k++)
		mpi_req[k] = MPI_REQUEST_NULL;
}
+
+// Continues a color-masked halo exchange previously started with
+// push_exchange_color_halo(): resumes at the stage recorded in (*status),
+// finalizes any completed stage and posts the next one, without blocking.
+// (*status) is updated exactly as in push_exchange_color_halo(); keep calling
+// this (or finish with pop_exchange_color_halo()) until it reaches 0.
+// Must be entered by the full OpenMP team -- the entry barrier protects the
+// shared [check] from being overwritten by a previous call's master test.
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::ping_exchange_color_halo(
+	T* x,
+	const int color,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_x, const int periodic_y, const int periodic_z,
+
+	MPI_Request mpi_req[4], int* status) const
+{
+	static int check;	// using static to make variable shared //
+
+#pragma omp barrier
+						// using barrier to prevent races for [check]
+
+	if ((*status) == 1) {	// -x exchange processing //
+
+#pragma omp master
+		check = test_exchange(mpi_req, 4);
+#pragma omp barrier
+		if (!check) return;	// -x exchange signal //
+		pop_exchange_color_halo_x<mem>(x, color, nx, ny, nz,
+			gcx, gcy, gcz, hx, hy, hz, periodic_x, mpi_req);
+
+		// - y exchange //
+		(*status) = 2;
+		push_exchange_color_halo_y<mem>(x, color, nx, ny, nz,
+			gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);
+	}
+	if ((*status) == 2) {	// -y exchange processing //
+
+#pragma omp master
+		check = test_exchange(mpi_req, 4);
+#pragma omp barrier
+		if (!check) return;	// -y exchange signal //
+		pop_exchange_color_halo_y<mem>(x, color, nx, ny, nz,
+			gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);
+
+		// -z exchange //
+		(*status) = 3;
+		push_exchange_color_halo_z<mem>(x, color, nx, ny, nz,
+			gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);
+	}
+	if ((*status) == 3) {	// -z exchange processing //
+
+#pragma omp master
+		check = test_exchange(mpi_req, 4);
+#pragma omp barrier
+		if (!check) return;	// -z exchange signal //
+		pop_exchange_color_halo_z<mem>(x, color, nx, ny, nz,
+			gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);
+	}
+
+	(*status) = 0;		// exchanges done //
+#pragma omp master
+	for (int k = 0; k < 4; k++)
+		mpi_req[k] = MPI_REQUEST_NULL;
+}
+
+// Finalizes a color-masked halo exchange: completes the stage recorded in
+// (*status) and runs the remaining stages through to the end. Unlike the
+// ping_ variant, the MPI requests are not tested here -- presumably the
+// pop_*_x/y/z helpers wait for completion themselves; confirm against their
+// definitions. On return (*status) is 0 and the master thread has reset all
+// four requests to MPI_REQUEST_NULL.
+template< nse::memType mem, typename T >
+void nse::mpiCom3d::pop_exchange_color_halo(
+	T* x,
+	const int color,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const int hx, const int hy, const int hz,
+	const int periodic_x, const int periodic_y, const int periodic_z,
+
+	MPI_Request mpi_req[4], int* status) const
+{
+	if ((*status) == 1) {	// -x exchange processing //
+
+		pop_exchange_color_halo_x<mem>(x, color, nx, ny, nz,
+			gcx, gcy, gcz, hx, hy, hz, periodic_x, mpi_req);
+
+		// - y exchange //
+		(*status) = 2;
+		push_exchange_color_halo_y<mem>(x, color, nx, ny, nz,
+			gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);
+	}
+	if ((*status) == 2) {	// -y exchange processing //
+
+		pop_exchange_color_halo_y<mem>(x, color, nx, ny, nz,
+			gcx, gcy, gcz, hx, hy, hz, periodic_y, mpi_req);
+
+		// -z exchange //
+		(*status) = 3;
+		push_exchange_color_halo_z<mem>(x, color, nx, ny, nz,
+			gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);
+	}
+	if ((*status) == 3) {	// -z exchange processing //
+
+		pop_exchange_color_halo_z<mem>(x, color, nx, ny, nz,
+			gcx, gcy, gcz, hx, hy, hz, periodic_z, mpi_req);
+	}
+
+	(*status) = 0;		// exchanges done //
+#pragma omp master
+	for (int k = 0; k < 4; k++)
+		mpi_req[k] = MPI_REQUEST_NULL;
+}
+
+// Non-blocking completion probe: returns nonzero iff all [n_req] requests
+// have completed (MPI_Testall); per the MPI standard, completed requests are
+// deallocated and set to MPI_REQUEST_NULL by the call itself. When
+// _MPI_EXCH3D_MEASURE_TEST_TIME is defined, the time spent probing is
+// accumulated into cpu_time_exch.
+inline int nse::mpiCom3d::test_exchange(
+	MPI_Request* mpi_req, const int n_req) const
+{
+#ifdef _MPI_EXCH3D_MEASURE_TEST_TIME
+	double probe_begin = timer_init();
+#endif
+
+	int all_completed;
+	MPI_Testall(n_req, mpi_req, &all_completed, MPI_STATUSES_IGNORE);
+
+#ifdef _MPI_EXCH3D_MEASURE_TEST_TIME
+	timer_update(probe_begin, &cpu_time_exch);
+#endif
+
+	return all_completed;
+}
+// ================================================================================== //
+
+// Global x-offset (in interior cells) of this rank's subdomain.
+// cx is the local interior extent (nx minus gcx ghost layers on each side).
+// An inclusive MPI_Scan accumulates cx in rank order; the mpi_cx * rank_y and
+// mpi_cx * rank_z * size_y terms remove contributions of preceding ranks that
+// lie in the same x-slab, and subtracting cx turns the inclusive scan into an
+// exclusive offset.
+// NOTE(review): mpi_cx is the interior extent averaged over the y-z ranks, so
+// this presumably assumes a decomposition that is uniform across those
+// ranks -- confirm.
+inline int nse::mpiCom3d::offset_x(const int nx, const int gcx) const
+{
+	int cx = nx - 2 * gcx, in_cx = cx;
+	int mpi_cx = mpi_allreduce(cx, MPI_SUM, comm) / (size_y * size_z);
+
+	int offset;
+	MPI_Scan(&in_cx, &offset, 1, MPI_INT, MPI_SUM, comm);
+
+	return offset - mpi_cx * rank_y - mpi_cx * rank_z * size_y - cx;
+}
+
+// Global y-offset (in interior cells) of this rank's subdomain.
+// Only ranks with rank_x == 0 feed their interior extent into the inclusive
+// MPI_Scan, so the prefix sum counts cells along y (and z) only; the
+// rank_z * mpi_cy term removes contributions of preceding z-slabs, and
+// subtracting cy makes the inclusive scan exclusive.
+// NOTE(review): mpi_cy averages the extent over the (x,z) ranks -- presumably
+// a uniform decomposition in y is assumed; confirm.
+inline int nse::mpiCom3d::offset_y(const int ny, const int gcy) const
+{
+	int cy = ny - 2 * gcy, in_cy = (rank_x == 0) ? cy : 0;
+	int mpi_cy = mpi_allreduce(cy, MPI_SUM, comm) / (size_x * size_z);
+
+	int offset;
+	MPI_Scan(&in_cy, &offset, 1, MPI_INT, MPI_SUM, comm);
+
+	return offset - rank_z * mpi_cy - cy;
+}
+
+// Global z-offset (in interior cells) of this rank's subdomain.
+// Only ranks on the (rank_x == 0, rank_y == 0) edge contribute their interior
+// extent to the inclusive MPI_Scan, so the prefix sum counts cells along z
+// only; subtracting the local extent converts the inclusive scan into an
+// exclusive offset.
+inline int nse::mpiCom3d::offset_z(const int nz, const int gcz) const
+{
+	const int interior_z = nz - 2 * gcz;
+	int scan_in = 0;
+	if ((rank_x == 0) && (rank_y == 0)) scan_in = interior_z;
+
+	int scan_out;
+	MPI_Scan(&scan_in, &scan_out, 1, MPI_INT, MPI_SUM, comm);
+
+	return scan_out - interior_z;
+}
+
+
+// * Private * //
+// ---------------------------------------------------------------------------------- //
+// Team-safe growth of a shared scratch buffer: when the recorded capacity is
+// smaller than [size], exactly one thread frees the old buffer and allocates
+// a new one while the rest of the team waits. The explicit barrier keeps any
+// thread from entering the single section while another thread may still be
+// using the old buffer; the implicit barrier at the end of omp single makes
+// the new buffer visible to all threads before anyone proceeds.
+// NOTE(review): every thread of the team must take the same branch, i.e. all
+// callers are expected to pass the same [size] -- confirm.
+inline void nse::mpiCom3d::allocate_memory(
+	void** memory, size_t* memory_size, const size_t size)
+{
+	if ((*memory_size) < size) {
+
+#pragma omp barrier
+#pragma omp single
+		{
+			deallocate_memory((*memory), memory_size);
+
+			allocate_void(memory, size);
+			(*memory_size) = size;
+		}
+		// implicit OpenMP barrier //
+	}
+}
+
+// Releases a buffer previously obtained through allocate_memory() and resets
+// the recorded capacity; a zero-sized record means nothing was allocated.
+inline void nse::mpiCom3d::deallocate_memory(
+	void* memory, size_t* memory_size)
+{
+	if ((*memory_size) == 0) return;	// nothing to free //
+
+	deallocate_void(memory);
+	(*memory_size) = (size_t)0;
+}
+
+#ifdef _MPI_EXCH3D_USE_SUBARRAY
+// Marks every cached subarray slot as unused (descriptor zeroed) and rewinds
+// the cache cursor to the first slot.
+inline void nse::mpiCom3d::init_subarray_list()
+{
+	for (int idx = 0; idx < subarray_list_size; idx++) {
+		for (int field = 0; field < 10; field++)
+			subarray_info[idx][field] = 0;
+	}
+	subarray_ptr = 0;
+}
+
+// Releases every committed MPI datatype in the cache, clears the matching
+// descriptor, and rewinds the cache cursor; slots that were never committed
+// (info[0] != 1) are left untouched.
+inline void nse::mpiCom3d::free_subarray_list()
+{
+	for (int idx = 0; idx < subarray_list_size; idx++) {
+		if (subarray_info[idx][0] != 1) continue;	// slot not in use //
+
+		MPI_Type_free(&subarray_list[idx]);
+		for (int field = 0; field < 10; field++)
+			subarray_info[idx][field] = 0;
+	}
+	subarray_ptr = 0;
+}
+
+// Returns (through [subarray]) a committed MPI subarray datatype describing
+// the [ib..ie] x [jb..je] x [kb..ke] box of an (nx,ny,nz) C-ordered array of
+// element type T. Created types are cached in subarray_list[] and reused when
+// all ten shape parameters match.
+// NOTE(review): there is no bounds check of subarray_ptr against
+// subarray_list_size before writing a new slot -- a full cache would overflow
+// the list; confirm callers cannot exceed the configured capacity.
+// NOTE(review): the cache key does not include mpi_type<T>(), so two element
+// types with identical box parameters would alias one cached datatype --
+// presumably each mpiCom3d object is used with a single element type; verify.
+template<typename T>
+inline void nse::mpiCom3d::get_subarray(MPI_Datatype* subarray,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke)
+{
+	// linear search of the cache for an exact parameter match //
+	int mark = 0;
+	for (int k = 0; k < subarray_ptr; k++) {
+
+		// compare parameters //
+		if (subarray_info[k][0] == 1) {
+			mark = (
+				(subarray_info[k][1] == nx) &&
+				(subarray_info[k][2] == ny) &&
+				(subarray_info[k][3] == nz) &&
+				(subarray_info[k][4] == ib) &&
+				(subarray_info[k][5] == ie) &&
+				(subarray_info[k][6] == jb) &&
+				(subarray_info[k][7] == je) &&
+				(subarray_info[k][8] == kb) &&
+				(subarray_info[k][9] == ke));
+
+			if (mark) {
+				(*subarray) = subarray_list[k];
+				return;
+			}
+		}
+	}
+
+	// cache miss: build, commit and register a new subarray type //
+	int size[3], subsize[3], pos[3];
+	size[0] = nx;
+	size[1] = ny;
+	size[2] = nz;
+
+	subsize[0] = ie - ib + 1;
+	subsize[1] = je - jb + 1;
+	subsize[2] = ke - kb + 1;
+
+	pos[0] = ib;
+	pos[1] = jb;
+	pos[2] = kb;
+
+	MPI_Type_create_subarray(3, size, subsize, pos,
+		MPI_ORDER_C, mpi_type< T >(), &subarray_list[subarray_ptr]);
+	MPI_Type_commit(&subarray_list[subarray_ptr]);
+
+	subarray_info[subarray_ptr][0] = 1;
+	subarray_info[subarray_ptr][1] = nx;
+	subarray_info[subarray_ptr][2] = ny;
+	subarray_info[subarray_ptr][3] = nz;
+	subarray_info[subarray_ptr][4] = ib;
+	subarray_info[subarray_ptr][5] = ie;
+	subarray_info[subarray_ptr][6] = jb;
+	subarray_info[subarray_ptr][7] = je;
+	subarray_info[subarray_ptr][8] = kb;
+	subarray_info[subarray_ptr][9] = ke;
+
+	(*subarray) = subarray_list[subarray_ptr];
+	subarray_ptr++;
+}
+#endif
diff --git a/mpi-vecmath.h b/mpi-vecmath.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa49ef9d443b69c44cc5719347c9344d72a9d44d
--- /dev/null
+++ b/mpi-vecmath.h
@@ -0,0 +1,646 @@
+#pragma once
+
+// [mpi-vecmath.h]: MPI vector math simple template functions
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include <mpi.h>
+#include "nse-sys.h"
+#include "mpi-com.h"
+#include "vecmath.h"
+
+
+namespace nse
+{
+	// * array reduction * //
+	// Each function computes its per-rank partial with the matching routine
+	// from vecmath.h and combines the results with MPI_Allreduce; overloads
+	// without an MPI_Comm argument operate on MPI_COMM_WORLD.
+	// The *_ifeq variants include only elements with mask[i] == check.
+	// The *_and_* variants return two results per call, fusing reductions
+	// into fewer MPI round-trips where the reduction operations allow it.
+	template< memType mem = memCPU, typename T >
+	T mpi_min(const T* _RESTRICT const x, const int n);
+	template< memType mem = memCPU, typename T >
+	T mpi_min(const T* _RESTRICT const x, const int n,
+		const MPI_Comm comm);
+
+	template< memType mem = memCPU, typename T >
+	T mpi_max(const T* _RESTRICT const x, const int n);
+	template< memType mem = memCPU, typename T >
+	T mpi_max(const T* _RESTRICT const x, const int n,
+		const MPI_Comm comm);
+
+	template< memType mem = memCPU, typename T >
+	T mpi_sum(const T* _RESTRICT const x, const int n);
+	template< memType mem = memCPU, typename T >
+	T mpi_sum(const T* _RESTRICT const x, const int n,
+		const MPI_Comm comm);
+
+	template< memType mem = memCPU, typename T >
+	T mpi_dot_product(const T* _RESTRICT const x, const T* _RESTRICT const y, const int n);
+	template< memType mem = memCPU, typename T >
+	T mpi_dot_product(const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+		const MPI_Comm comm);
+
+	template< typename T, typename CType >
+	T mpi_dot_product_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y,
+		const CType* _RESTRICT const mask, const CType check, const int n);
+	template< typename T, typename CType >
+	T mpi_dot_product_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y,
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		const MPI_Comm comm);
+
+	template< memType mem = memCPU, typename T >
+	T mpi_sqr_sum(const T* _RESTRICT const x, const int n);
+	template< memType mem = memCPU, typename T >
+	T mpi_sqr_sum(const T* _RESTRICT const x, const int n,
+		const MPI_Comm comm);
+
+	template< typename T, typename CType >
+	T mpi_sqr_sum_ifeq(const T* _RESTRICT const x,
+		const CType* _RESTRICT const mask, const CType check, const int n);
+	template< typename T, typename CType >
+	T mpi_sqr_sum_ifeq(const T* _RESTRICT const x,
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		const MPI_Comm comm);
+
+	template< memType mem = memCPU, typename T >
+	void mpi_sqr_sum_and_dp(const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+		T* _RESTRICT sum, T* _RESTRICT dp);
+	template< memType mem = memCPU, typename T >
+	void mpi_sqr_sum_and_dp(const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+		const MPI_Comm comm,
+		T* _RESTRICT sum, T* _RESTRICT dp);
+
+	template< typename T, typename CType >
+	void mpi_sqr_sum_and_dp_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y, 
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		T* _RESTRICT sum, T* _RESTRICT dp);
+	template< typename T, typename CType >
+	void mpi_sqr_sum_and_dp_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y,
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		const MPI_Comm comm,
+		T* _RESTRICT sum, T* _RESTRICT dp);
+
+	template< memType mem = memCPU, typename T >
+	T mpi_lnorm(const T* _RESTRICT const x, const int n);
+	template< memType mem = memCPU, typename T >
+	T mpi_lnorm(const T* _RESTRICT const x, const int n,
+		const MPI_Comm comm);
+
+	template< typename T, typename CType >
+	T mpi_lnorm_ifeq(const T* _RESTRICT const x,
+		const CType* _RESTRICT const mask, const CType check, const int n);
+	template< typename T, typename CType >
+	T mpi_lnorm_ifeq(const T* _RESTRICT const x,
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		const MPI_Comm comm);
+
+	template< memType mem = memCPU, typename T >
+	void mpi_lnorm_and_dp(const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+		T* _RESTRICT norm, T* _RESTRICT dp);
+	template< memType mem = memCPU, typename T >
+	void mpi_lnorm_and_dp(const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+		const MPI_Comm comm,
+		T* _RESTRICT norm, T* _RESTRICT dp);
+
+	template< typename T, typename CType >
+	void mpi_lnorm_and_dp_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y, 
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		T* _RESTRICT norm, T* _RESTRICT dp);
+	template< typename T, typename CType >
+	void mpi_lnorm_and_dp_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y,
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		const MPI_Comm comm,
+		T* _RESTRICT norm, T* _RESTRICT dp);
+
+	template< memType mem = memCPU, typename T >
+	void mpi_lnorm_and_sqr_sum(const T* _RESTRICT const x, const int n,
+		T* _RESTRICT norm, T* _RESTRICT sum);
+	template< memType mem = memCPU, typename T >
+	void mpi_lnorm_and_sqr_sum(const T* _RESTRICT const x, const int n,
+		const MPI_Comm comm,
+		T* _RESTRICT norm, T* _RESTRICT sum);
+
+	template< typename T, typename CType >
+	void mpi_lnorm_and_sqr_sum_ifeq(const T* _RESTRICT const x, 
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		T* _RESTRICT norm, T* _RESTRICT sum);
+	template< typename T, typename CType >
+	void mpi_lnorm_and_sqr_sum_ifeq(const T* _RESTRICT const x, 
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		const MPI_Comm comm,
+		T* _RESTRICT norm, T* _RESTRICT sum);
+
+	template< memType mem = memCPU, typename T >
+	T mpi_cnorm(const T* _RESTRICT const x, const int n);
+	template< memType mem = memCPU, typename T >
+	T mpi_cnorm(const T* _RESTRICT const x, const int n,
+		const MPI_Comm comm);
+
+	template< typename T, typename CType >
+	T mpi_cnorm_ifeq(const T* _RESTRICT const x,
+		const CType* _RESTRICT const mask, const CType check, const int n);
+	template< typename T, typename CType >
+	T mpi_cnorm_ifeq(const T* _RESTRICT const x,
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		const MPI_Comm comm);
+
+	template< memType mem = memCPU, typename T >
+	void mpi_cnorm_and_dp(const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+		T* _RESTRICT norm, T* _RESTRICT dp);
+	template< memType mem = memCPU, typename T >
+	void mpi_cnorm_and_dp(const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+		const MPI_Comm comm,
+		T* _RESTRICT norm, T* _RESTRICT dp);
+
+	template< typename T, typename CType >
+	void mpi_cnorm_and_dp_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y, 
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		T* _RESTRICT norm, T* _RESTRICT dp);
+	template< typename T, typename CType >
+	void mpi_cnorm_and_dp_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y,
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		const MPI_Comm comm,
+		T* _RESTRICT norm, T* _RESTRICT dp);
+
+	template< memType mem = memCPU, typename T >
+	void mpi_cnorm_and_sqr_sum(const T* _RESTRICT const x, const int n,
+		T* _RESTRICT norm, T* _RESTRICT sum);
+	template< memType mem = memCPU, typename T >
+	void mpi_cnorm_and_sqr_sum(const T* _RESTRICT const x, const int n,
+		const MPI_Comm comm,
+		T* _RESTRICT norm, T* _RESTRICT sum);
+
+	template< typename T, typename CType >
+	void mpi_cnorm_and_sqr_sum_ifeq(const T* _RESTRICT const x, 
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		T* _RESTRICT norm, T* _RESTRICT sum);
+	template< typename T, typename CType >
+	void mpi_cnorm_and_sqr_sum_ifeq(const T* _RESTRICT const x, 
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		const MPI_Comm comm,
+		T* _RESTRICT norm, T* _RESTRICT sum);
+
+	template< memType mem = memCPU, typename T >
+	T mpi_l1norm(const T* _RESTRICT const x, const int n);
+	template< memType mem = memCPU, typename T >
+	T mpi_l1norm(const T* _RESTRICT const x, const int n,
+		const MPI_Comm comm);
+
+	template< typename T >
+	T mpi_max_deviation(const T* _RESTRICT const x, const T* _RESTRICT const sqrx, const int n);
+	template< typename T >
+	T mpi_max_deviation(const T* _RESTRICT const x, const T* _RESTRICT const sqrx, const int n,
+		const MPI_Comm comm);
+	// ----------------------------------------------------------------------------------------- //
+
+	// * check for finite values * //
+	// True only when the arrays of ALL participating ranks are finite.
+	template< memType mem = memCPU, typename T >
+	bool mpi_is_finite(const T* _RESTRICT const x, const int n);
+	template< memType mem = memCPU, typename T >
+	bool mpi_is_finite(const T* _RESTRICT const x, const int n,
+		const MPI_Comm comm);
+	// ----------------------------------------------------------------------------------------- //
+}
+
+// ----------------------------------------------------------------------------------------- //
+// * Implementation *
+// ----------------------------------------------------------------------------------------- //
+// Global minimum over a distributed array: each rank reduces its local part,
+// the partials are combined with MPI_MIN over MPI_COMM_WORLD.
+template< nse::memType mem, typename T >
+inline T nse::mpi_min(
+	const T* _RESTRICT const x, const int n)
+{
+	const T local_min = min<mem>(x, n);
+	return mpi_allreduce(local_min, MPI_MIN, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline T nse::mpi_min(
+	const T* _RESTRICT const x, const int n, const MPI_Comm comm)
+{
+	const T local_min = min<mem>(x, n);
+	return mpi_allreduce(local_min, MPI_MIN, comm);
+}
+
+// Global maximum over a distributed array (MPI_MAX over MPI_COMM_WORLD).
+template< nse::memType mem, typename T >
+inline T nse::mpi_max(
+	const T* _RESTRICT const x, const int n)
+{
+	const T local_max = max<mem>(x, n);
+	return mpi_allreduce(local_max, MPI_MAX, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline T nse::mpi_max(
+	const T* _RESTRICT const x, const int n, const MPI_Comm comm)
+{
+	const T local_max = max<mem>(x, n);
+	return mpi_allreduce(local_max, MPI_MAX, comm);
+}
+
+// Global sum of all array elements: per-rank partial sums combined with
+// MPI_SUM over MPI_COMM_WORLD.
+template< nse::memType mem, typename T >
+inline T nse::mpi_sum(
+	const T* _RESTRICT const x, const int n)
+{
+	const T local_sum = sum<mem>(x, n);
+	return mpi_allreduce(local_sum, MPI_SUM, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline T nse::mpi_sum(
+	const T* _RESTRICT const x, const int n, const MPI_Comm comm)
+{
+	const T local_sum = sum<mem>(x, n);
+	return mpi_allreduce(local_sum, MPI_SUM, comm);
+}
+
+// Global dot product <x, y>: per-rank partials combined with MPI_SUM.
+template< nse::memType mem, typename T >
+inline T nse::mpi_dot_product(
+	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n)
+{
+	const T local_dp = dot_product<mem>(x, y, n);
+	return mpi_allreduce(local_dp, MPI_SUM, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline T nse::mpi_dot_product(
+	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+	const MPI_Comm comm)
+{
+	const T local_dp = dot_product<mem>(x, y, n);
+	return mpi_allreduce(local_dp, MPI_SUM, comm);
+}
+
+// Masked global dot product: only elements with mask[i] == check contribute;
+// per-rank partials combined with MPI_SUM over MPI_COMM_WORLD.
+template< typename T, typename CType >
+inline T nse::mpi_dot_product_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const CType* _RESTRICT const mask, const CType check, const int n)
+{
+	const T local_dp = dot_product_ifeq(x, y, mask, check, n);
+	return mpi_allreduce(local_dp, MPI_SUM, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< typename T, typename CType >
+inline T nse::mpi_dot_product_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	const MPI_Comm comm)
+{
+	const T local_dp = dot_product_ifeq(x, y, mask, check, n);
+	return mpi_allreduce(local_dp, MPI_SUM, comm);
+}
+
+// Global sum of squares: per-rank partials combined with MPI_SUM over
+// MPI_COMM_WORLD.
+template< nse::memType mem, typename T >
+inline T nse::mpi_sqr_sum(
+	const T* _RESTRICT const x, const int n)
+{
+	const T local_ss = sqr_sum<mem>(x, n);
+	return mpi_allreduce(local_ss, MPI_SUM, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline T nse::mpi_sqr_sum(
+	const T* _RESTRICT const x, const int n, const MPI_Comm comm)
+{
+	const T local_ss = sqr_sum<mem>(x, n);
+	return mpi_allreduce(local_ss, MPI_SUM, comm);
+}
+
+// Masked variant: only elements with mask[i] == check contribute.
+template< typename T, typename CType >
+inline T nse::mpi_sqr_sum_ifeq(const T* _RESTRICT const x,
+	const CType* _RESTRICT const mask, const CType check, const int n)
+{
+	const T local_ss = sqr_sum_ifeq(x, mask, check, n);
+	return mpi_allreduce(local_ss, MPI_SUM, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< typename T, typename CType >
+inline T nse::mpi_sqr_sum_ifeq(const T* _RESTRICT const x,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	const MPI_Comm comm)
+{
+	const T local_ss = sqr_sum_ifeq(x, mask, check, n);
+	return mpi_allreduce(local_ss, MPI_SUM, comm);
+}
+
+// Computes, in one pass, the global sum of squares of [x] (-> *sum) and the
+// global dot product <x, y> (-> *dp); both partials go through a single
+// two-value MPI_SUM reduction to save one communication round-trip.
+template< nse::memType mem, typename T >
+inline void nse::mpi_sqr_sum_and_dp(
+	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+	T* _RESTRICT sum, T* _RESTRICT dp)
+{
+	sqr_sum_and_dp<mem>(x, y, n, sum, dp);
+	mpi_allreduce(sum, dp, MPI_SUM, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline void nse::mpi_sqr_sum_and_dp(
+	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+	const MPI_Comm comm,
+	T* _RESTRICT sum, T* _RESTRICT dp)
+{
+	sqr_sum_and_dp<mem>(x, y, n, sum, dp);
+	mpi_allreduce(sum, dp, MPI_SUM, comm);
+}
+
+// Masked variant: only elements with mask[i] == check contribute.
+template< typename T, typename CType >
+inline void nse::mpi_sqr_sum_and_dp_ifeq(
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	T* _RESTRICT sum, T* _RESTRICT dp)
+{
+	sqr_sum_and_dp_ifeq(x, y, mask, check, n, sum, dp);
+	mpi_allreduce(sum, dp, MPI_SUM, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< typename T, typename CType >
+inline void nse::mpi_sqr_sum_and_dp_ifeq(
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	const MPI_Comm comm,
+	T* _RESTRICT sum, T* _RESTRICT dp)
+{
+	sqr_sum_and_dp_ifeq(x, y, mask, check, n, sum, dp);
+	mpi_allreduce(sum, dp, MPI_SUM, comm);
+}
+
+// Global L2 norm: square root of the globally reduced sum of squares.
+template< nse::memType mem, typename T >
+inline T nse::mpi_lnorm(
+	const T* _RESTRICT const x, const int n)
+{
+	const T local_ss = sqr_sum<mem>(x, n);
+	return sqrt(mpi_allreduce(local_ss, MPI_SUM, MPI_COMM_WORLD));
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline T nse::mpi_lnorm(
+	const T* _RESTRICT const x, const int n,
+	const MPI_Comm comm)
+{
+	const T local_ss = sqr_sum<mem>(x, n);
+	return sqrt(mpi_allreduce(local_ss, MPI_SUM, comm));
+}
+
+// Masked variant: only elements with mask[i] == check contribute.
+template< typename T, typename CType >
+inline T nse::mpi_lnorm_ifeq(const T* _RESTRICT const x,
+	const CType* _RESTRICT const mask, const CType check, const int n)
+{
+	const T local_ss = sqr_sum_ifeq(x, mask, check, n);
+	return sqrt(mpi_allreduce(local_ss, MPI_SUM, MPI_COMM_WORLD));
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< typename T, typename CType >
+inline T nse::mpi_lnorm_ifeq(const T* _RESTRICT const x,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	const MPI_Comm comm)
+{
+	const T local_ss = sqr_sum_ifeq(x, mask, check, n);
+	return sqrt(mpi_allreduce(local_ss, MPI_SUM, comm));
+}
+
+// Computes the global L2 norm of [x] (-> *norm) and the global dot product
+// <x, y> (-> *dp) using one fused two-value MPI_SUM reduction; the square
+// root is taken locally after the reduction.
+template< nse::memType mem, typename T >
+inline void nse::mpi_lnorm_and_dp(
+	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+	T* _RESTRICT norm, T* _RESTRICT dp)
+{
+	sqr_sum_and_dp<mem>(x, y, n, norm, dp);
+	mpi_allreduce(norm, dp, MPI_SUM, MPI_COMM_WORLD);
+
+	(*norm) = sqrt((*norm));
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline void nse::mpi_lnorm_and_dp(
+	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+	const MPI_Comm comm,
+	T* _RESTRICT norm, T* _RESTRICT dp)
+{
+	sqr_sum_and_dp<mem>(x, y, n, norm, dp);
+	mpi_allreduce(norm, dp, MPI_SUM, comm);
+
+	(*norm) = sqrt((*norm));
+}
+
+// Masked variant: only elements with mask[i] == check contribute.
+template< typename T, typename CType >
+inline void nse::mpi_lnorm_and_dp_ifeq(
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	T* _RESTRICT norm, T* _RESTRICT dp)
+{
+	sqr_sum_and_dp_ifeq(x, y, mask, check, n, norm, dp);
+	mpi_allreduce(norm, dp, MPI_SUM, MPI_COMM_WORLD);
+
+	(*norm) = sqrt((*norm));
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< typename T, typename CType >
+inline void nse::mpi_lnorm_and_dp_ifeq(
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	const MPI_Comm comm,
+	T* _RESTRICT norm, T* _RESTRICT dp)
+{
+	sqr_sum_and_dp_ifeq(x, y, mask, check, n, norm, dp);
+	mpi_allreduce(norm, dp, MPI_SUM, comm);
+
+	(*norm) = sqrt((*norm));
+}
+
+// Returns both the global sum of squares (-> *sum) and the L2 norm
+// (*norm) = sqrt(*sum), using a single MPI_SUM reduction.
+template< nse::memType mem, typename T >
+inline void nse::mpi_lnorm_and_sqr_sum(
+	const T* _RESTRICT const x, const int n,
+	T* _RESTRICT norm, T* _RESTRICT sum)
+{
+	(*sum) = sqr_sum<mem>(x, n);
+	mpi_allreduce(sum, MPI_SUM, MPI_COMM_WORLD);
+
+	(*norm) = sqrt((*sum));
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline void nse::mpi_lnorm_and_sqr_sum(
+	const T* _RESTRICT const x, const int n,
+	const MPI_Comm comm,
+	T* _RESTRICT norm, T* _RESTRICT sum)
+{
+	(*sum) = sqr_sum<mem>(x, n);
+	mpi_allreduce(sum, MPI_SUM, comm);
+
+	(*norm) = sqrt((*sum));
+}
+
+// Masked variant: only elements with mask[i] == check contribute.
+template< typename T, typename CType >
+inline void nse::mpi_lnorm_and_sqr_sum_ifeq(
+	const T* _RESTRICT const x,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	T* _RESTRICT norm, T* _RESTRICT sum)
+{
+	(*sum) = sqr_sum_ifeq(x, mask, check, n);
+	mpi_allreduce(sum, MPI_SUM, MPI_COMM_WORLD);
+
+	(*norm) = sqrt((*sum));
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< typename T, typename CType >
+inline void nse::mpi_lnorm_and_sqr_sum_ifeq(
+	const T* _RESTRICT const x,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	const MPI_Comm comm,
+	T* _RESTRICT norm, T* _RESTRICT sum)
+{
+	(*sum) = sqr_sum_ifeq(x, mask, check, n);
+	mpi_allreduce(sum, MPI_SUM, comm);
+
+	(*norm) = sqrt((*sum));
+}
+
+// Global C (max) norm: per-rank cnorm partials combined with MPI_MAX over
+// MPI_COMM_WORLD.
+template< nse::memType mem, typename T >
+inline T nse::mpi_cnorm(
+	const T* _RESTRICT const x, const int n)
+{
+	const T local_cn = cnorm<mem>(x, n);
+	return mpi_allreduce(local_cn, MPI_MAX, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline T nse::mpi_cnorm(
+	const T* _RESTRICT const x, const int n,
+	const MPI_Comm comm)
+{
+	const T local_cn = cnorm<mem>(x, n);
+	return mpi_allreduce(local_cn, MPI_MAX, comm);
+}
+
+// Masked variant: only elements with mask[i] == check contribute.
+template< typename T, typename CType >
+inline T nse::mpi_cnorm_ifeq(const T* _RESTRICT const x,
+	const CType* _RESTRICT const mask, const CType check, const int n)
+{
+	const T local_cn = cnorm_ifeq(x, mask, check, n);
+	return mpi_allreduce(local_cn, MPI_MAX, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< typename T, typename CType >
+inline T nse::mpi_cnorm_ifeq(const T* _RESTRICT const x,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	const MPI_Comm comm)
+{
+	const T local_cn = cnorm_ifeq(x, mask, check, n);
+	return mpi_allreduce(local_cn, MPI_MAX, comm);
+}
+
+// Computes the global C (max) norm of [x] (-> *norm) and the global dot
+// product <x, y> (-> *dp). Two separate reductions are required because the
+// combining operations differ (MPI_MAX for the norm, MPI_SUM for the dot
+// product).
+template< nse::memType mem, typename T >
+inline void nse::mpi_cnorm_and_dp(
+	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+	T* _RESTRICT norm, T* _RESTRICT dp)
+{
+	cnorm_and_dp<mem>(x, y, n, norm, dp);
+
+	mpi_allreduce(norm, MPI_MAX, MPI_COMM_WORLD);
+	mpi_allreduce(dp, MPI_SUM, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline void nse::mpi_cnorm_and_dp(
+	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+	const MPI_Comm comm,
+	T* _RESTRICT norm, T* _RESTRICT dp)
+{
+	cnorm_and_dp<mem>(x, y, n, norm, dp);
+
+	mpi_allreduce(norm, MPI_MAX, comm);
+	mpi_allreduce(dp, MPI_SUM, comm);
+}
+
+// Masked variant: only elements with mask[i] == check contribute.
+template< typename T, typename CType >
+inline void nse::mpi_cnorm_and_dp_ifeq(
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	T* _RESTRICT norm, T* _RESTRICT dp)
+{
+	cnorm_and_dp_ifeq(x, y, mask, check, n, norm, dp);
+
+	mpi_allreduce(norm, MPI_MAX, MPI_COMM_WORLD);
+	mpi_allreduce(dp, MPI_SUM, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< typename T, typename CType >
+inline void nse::mpi_cnorm_and_dp_ifeq(
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	const MPI_Comm comm,
+	T* _RESTRICT norm, T* _RESTRICT dp)
+{
+	cnorm_and_dp_ifeq(x, y, mask, check, n, norm, dp);
+
+	mpi_allreduce(norm, MPI_MAX, comm);
+	mpi_allreduce(dp, MPI_SUM, comm);
+}
+
+// Computes the global C (max) norm of [x] (-> *norm) and the global sum of
+// squares (-> *sum); two reductions since the operations differ (MPI_MAX /
+// MPI_SUM).
+template< nse::memType mem, typename T >
+inline void nse::mpi_cnorm_and_sqr_sum(
+	const T* _RESTRICT const x, const int n,
+	T* _RESTRICT norm, T* _RESTRICT sum)
+{
+	cnorm_and_sqr_sum<mem>(x, n, norm, sum);
+
+	mpi_allreduce(norm, MPI_MAX, MPI_COMM_WORLD);
+	mpi_allreduce(sum, MPI_SUM, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline void nse::mpi_cnorm_and_sqr_sum(
+	const T* _RESTRICT const x, const int n,
+	const MPI_Comm comm,
+	T* _RESTRICT norm, T* _RESTRICT sum)
+{
+	cnorm_and_sqr_sum<mem>(x, n, norm, sum);
+
+	mpi_allreduce(norm, MPI_MAX, comm);
+	mpi_allreduce(sum, MPI_SUM, comm);
+}
+
+// Masked variant: only elements with mask[i] == check contribute.
+template< typename T, typename CType >
+inline void nse::mpi_cnorm_and_sqr_sum_ifeq(
+	const T* _RESTRICT const x,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	T* _RESTRICT norm, T* _RESTRICT sum)
+{
+	cnorm_and_sqr_sum_ifeq(x, mask, check, n, norm, sum);
+
+	mpi_allreduce(norm, MPI_MAX, MPI_COMM_WORLD);
+	mpi_allreduce(sum, MPI_SUM, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< typename T, typename CType >
+inline void nse::mpi_cnorm_and_sqr_sum_ifeq(
+	const T* _RESTRICT const x,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	const MPI_Comm comm,
+	T* _RESTRICT norm, T* _RESTRICT sum)
+{
+	cnorm_and_sqr_sum_ifeq(x, mask, check, n, norm, sum);
+
+	mpi_allreduce(norm, MPI_MAX, comm);
+	mpi_allreduce(sum, MPI_SUM, comm);
+}
+
+// Global L1 norm: per-rank l1norm partials combined with MPI_SUM over
+// MPI_COMM_WORLD.
+template< nse::memType mem, typename T >
+inline T nse::mpi_l1norm(
+	const T* _RESTRICT const x, const int n)
+{
+	const T local_l1 = l1norm<mem>(x, n);
+	return mpi_allreduce(local_l1, MPI_SUM, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline T nse::mpi_l1norm(
+	const T* _RESTRICT const x, const int n,
+	const MPI_Comm comm)
+{
+	const T local_l1 = l1norm<mem>(x, n);
+	return mpi_allreduce(local_l1, MPI_SUM, comm);
+}
+
+// Global maximum of the per-rank max_deviation(x, sqrx, n) values
+// (see vecmath.h for the local definition), combined with MPI_MAX.
+template< typename T >
+inline T nse::mpi_max_deviation(
+	const T* _RESTRICT const x, const T* _RESTRICT const sqrx, const int n)
+{
+	const T local_dev = max_deviation(x, sqrx, n);
+	return mpi_allreduce(local_dev, MPI_MAX, MPI_COMM_WORLD);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< typename T >
+inline T nse::mpi_max_deviation(
+	const T* _RESTRICT const x, const T* _RESTRICT const sqrx, const int n,
+	const MPI_Comm comm)
+{
+	const T local_dev = max_deviation(x, sqrx, n);
+	return mpi_allreduce(local_dev, MPI_MAX, comm);
+}
+
+// Returns true only when every rank's array contains exclusively finite
+// values: each rank contributes 1 if its local check fails, and the global
+// MPI_SUM must come out as zero.
+template< nse::memType mem, typename T >
+inline bool nse::mpi_is_finite(const T* _RESTRICT const x, const int n)
+{
+	const int local_fail = is_finite<mem>(x, n) ? 0 : 1;
+	return (mpi_allreduce(local_fail, MPI_SUM, MPI_COMM_WORLD) == 0);
+}
+
+// Same as above, restricted to the ranks of [comm].
+template< nse::memType mem, typename T >
+inline bool nse::mpi_is_finite(const T* _RESTRICT const x, const int n,
+	const MPI_Comm comm)
+{
+	const int local_fail = is_finite<mem>(x, n) ? 0 : 1;
+	return (mpi_allreduce(local_fail, MPI_SUM, comm) == 0);
+}
diff --git a/mtrand.cpp b/mtrand.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9b074668bd2b9684c4ad940e410c40d138d88d1a
--- /dev/null
+++ b/mtrand.cpp
@@ -0,0 +1,124 @@
+// mtrand.cpp, see include file mtrand.h for information
+
+#include "mtrand.h"
+// non-inline function definitions and static member definitions cannot
+// reside in header file because of the risk of multiple declarations
+
+// initialization of static private members
+// NOTE: state/p/init are static, so ALL MTRand_int32 instances (and the
+// derived MTRand* wrapper classes) share a single generator state; seeding
+// any one instance reseeds every other.
+unsigned long MTRand_int32::state[n] = { 0x0UL };
+int MTRand_int32::p = 0;
+bool MTRand_int32::init = false;
+
+void MTRand_int32::gen_state() { // generate new state vector
+	// MT19937 state transition: each new word mixes the word m = 397
+	// positions ahead with the "twiddle" of two consecutive words; the three
+	// loops differ only in how the (i + m) and (i + 1) indices wrap mod n = 624
+	for (int i = 0; i < (n - m); ++i)
+		state[i] = state[i + m] ^ twiddle(state[i], state[i + 1]);
+	for (int i = n - m; i < (n - 1); ++i)
+		state[i] = state[i + m - n] ^ twiddle(state[i], state[i + 1]);
+	state[n - 1] = state[m - 1] ^ twiddle(state[n - 1], state[0]);
+	p = 0; // reset position
+}
+
+// init by 32 bit seed: fills the whole (shared, static) state array from a
+// single word using the Knuth-style linear recurrence below
+void MTRand_int32::seed(unsigned long s) {  // init by 32 bit seed
+	state[0] = s & 0xFFFFFFFFUL; // for > 32 bit machines
+	for (int i = 1; i < n; ++i) {
+		state[i] = 1812433253UL * (state[i - 1] ^ (state[i - 1] >> 30)) + i;
+		// see Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier
+		// in the previous versions, MSBs of the seed affect only MSBs of the array state
+		// 2002/01/09 modified by Makoto Matsumoto
+		state[i] &= 0xFFFFFFFFUL; // for > 32 bit machines
+	}
+	p = n; // force gen_state() to be called for next random number
+}
+
+// init by array: standard MT19937 array-seeding (init_by_array); first seeds
+// with a fixed constant, then folds the user array in with two nonlinear
+// mixing passes.
+// NOTE(review): `j %= size` divides by `size`, so size must be >= 1 — confirm
+// callers never pass an empty array.
+void MTRand_int32::seed(const unsigned long* array, int size) { // init by array
+	seed(19650218UL);
+	int i = 1, j = 0;
+	for (int k = ((n > size) ? n : size); k; --k) {
+		state[i] = (state[i] ^ ((state[i - 1] ^ (state[i - 1] >> 30)) * 1664525UL))
+			+ array[j] + j; // non linear
+		state[i] &= 0xFFFFFFFFUL; // for > 32 bit machines
+		++j; j %= size;
+		if ((++i) == n) { state[0] = state[n - 1]; i = 1; }
+	}
+	for (int k = n - 1; k; --k) {
+		state[i] = (state[i] ^ ((state[i - 1] ^ (state[i - 1] >> 30)) * 1566083941UL)) - i;
+		state[i] &= 0xFFFFFFFFUL; // for > 32 bit machines
+		if ((++i) == n) { state[0] = state[n - 1]; i = 1; }
+	}
+	state[0] = 0x80000000UL; // MSB is 1; assuring non-zero initial array
+	p = n; // force gen_state() to be called for next random number
+}
+
+
+// Gauss random-number generator //
+namespace nse
+{
+	// Constructors: a default-constructed generator is inert (mean = variance
+	// = 0, seed = 0) until set() is called. Initializer lists are written in
+	// declaration order (mean, variance, seed) — members are initialized in
+	// that order regardless of list order, so this silences -Wreorder and
+	// matches what actually happens.
+	GaussRand::GaussRand() : mean((double)0), variance((double)0), seed(0) {
+	}
+	GaussRand::~GaussRand() {
+	}
+	GaussRand::GaussRand(const GaussRand& gen) :
+		mean(gen.mean), variance(gen.variance),
+		seed(gen.seed)
+	{
+		// NOTE: MTRand_int32 state is static, so this reseeds the generator
+		// shared by every instance, not a per-copy stream
+		mt.seed(seed);
+	}
+
+	// Record the distribution parameters and (re)seed both streams: the
+	// simple Lehmer stream (seed) and the shared Mersenne Twister (mt).
+	void GaussRand::set(const double _mean, const double _variance,
+		const long int _seed)
+	{
+		mean = _mean;
+		variance = _variance;
+		seed = _seed;
+
+		mt.seed(_seed);
+	}
+
+	// Gaussian draw via the Marsaglia polar method, fed by the simple
+	// Lehmer uniform stream (uni_rand). Returns mean + sqrt(variance)*z
+	// where z ~ N(0,1).
+	double GaussRand::s_rand()
+	{
+		double u, v;
+		double sqr_sum;
+
+		do
+		{
+			u = 2 * uni_rand() - 1;
+			v = 2 * uni_rand() - 1;
+
+			sqr_sum = u * u + v * v;
+		// reject points outside the unit disk AND the origin: sqr_sum == 0
+		// (u = v = 0 is reachable) would give log(0)/0 = NaN below
+		} while ((sqr_sum >= (double) 1.0) || (sqr_sum == (double) 0.0));
+
+		return mean + sqrt(variance) * sqrt(-(double)2.0 * log(sqr_sum) / sqr_sum) * u;
+	}
+
+	// Gaussian draw via the Marsaglia polar method, fed by the shared
+	// Mersenne Twister stream (mt, closed interval [0,1]).
+	double GaussRand::mt_rand()
+	{
+		double u, v;
+		double sqr_sum;
+
+		do
+		{
+			u = 2 * mt() - 1;
+			v = 2 * mt() - 1;
+
+			sqr_sum = u * u + v * v;
+		// reject points outside the unit disk AND the origin: mt() spans the
+		// closed [0,1], so u = v = 0 is reachable and sqr_sum == 0 would
+		// give log(0)/0 = NaN below
+		} while ((sqr_sum >= (double) 1.0) || (sqr_sum == (double) 0.0));
+
+		return mean + sqrt(variance) * sqrt(-(double)2.0 * log(sqr_sum) / sqr_sum) * u;
+	}
+
+	// "minimal standard" Lehmer generator: seed <- (a * seed) mod m with
+	// a = 48271, m = 2^31 - 1, computed overflow-free via Schrage's
+	// decomposition m = a*q + r. Returns seed/m, i.e. a uniform in (0, 1]
+	// for seed in [1, m-1].
+	// NOTE(review): the recurrence expects seed in [1, m-1]; a
+	// default-constructed seed of 0 maps to m and then stays there
+	// (returning 1.0 forever) — confirm set() is always called first.
+	double GaussRand::uni_rand()
+	{
+		const long int a = 48271;
+		const long int m = 2147483647;
+		const long int q = (m / a);
+		const long int r = (m % a);
+
+		long int hi = seed / q;
+		long int lo = seed % q;
+		long int test = a * lo - r * hi; // == (a*seed) mod m, up to one +m
+		if (test > 0)
+			seed = test;
+		else
+			seed = test + m;
+		return (double)seed / m;
+	}
+}
diff --git a/mtrand.h b/mtrand.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3bc3f2f0178b1446034fbea95a86ab4da698d5c
--- /dev/null
+++ b/mtrand.h
@@ -0,0 +1,190 @@
+// mtrand.h
+// C++ include file for MT19937, with initialization improved 2002/1/26.
+// Coded by Takuji Nishimura and Makoto Matsumoto.
+// Ported to C++ by Jasper Bedaux 2003/1/1 (see http://www.bedaux.net/mtrand/).
+// The generators returning floating point numbers are based on
+// a version by Isaku Wada, 2002/01/09
+//
+// Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// 3. The names of its contributors may not be used to endorse or promote
+//    products derived from this software without specific prior written
+//    permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Any feedback is very welcome.
+// http://www.math.keio.ac.jp/matumoto/emt.html
+// email: matumoto@math.keio.ac.jp
+//
+// Feedback about the C++ port should be sent to Jasper Bedaux,
+// see http://www.bedaux.net/mtrand/ for e-mail address and info.
+
+#ifndef MTRAND_H
+#define MTRAND_H
+
+#include <math.h>
+
+class MTRand_int32 { // Mersenne Twister random number generator
+public:
+	// default constructor: uses default seed only if this is the first instance
+	MTRand_int32() { if (!init) seed(5489UL); init = true; }
+	// constructor with 32 bit int as seed
+	MTRand_int32(unsigned long s) { seed(s); init = true; }
+	// constructor with array of size 32 bit ints as seed
+	MTRand_int32(const unsigned long* array, int size) { seed(array, size); init = true; }
+	// the two seed functions
+	void seed(unsigned long); // seed with 32 bit integer
+	void seed(const unsigned long*, int size); // seed with array
+	// overload operator() to make this a generator (functor)
+	unsigned long operator()() { return rand_int32(); }
+	// 2007-02-11: made the destructor virtual; thanks "double more" for pointing this out
+	virtual ~MTRand_int32() {} // destructor
+protected: // used by derived classes, otherwise not accessible; use the ()-operator
+	unsigned long rand_int32(); // generate 32 bit random integer
+private:
+	static const int n = 624, m = 397; // compile time constants
+	// NOTE: the members below are static (no duplicates can exist), so every
+	// instance — including the derived MTRand* wrappers — shares one
+	// generator state; seeding any instance reseeds them all
+	static unsigned long state[n]; // state vector array
+	static int p; // position in state array
+	static bool init; // true if init function is called
+	// private functions used to generate the pseudo random numbers
+	unsigned long twiddle(unsigned long, unsigned long); // used by gen_state()
+	void gen_state(); // generate new state
+	// make copy constructor and assignment operator unavailable, they don't make sense
+	MTRand_int32(const MTRand_int32&); // copy constructor not defined
+	void operator=(const MTRand_int32&); // assignment operator not defined
+};
+
+// inline for speed, must therefore reside in header file
+// inline for speed, must therefore reside in header file
+// MT19937 "twiddle": combine the MSB of u with the 31 LSBs of v, shift, and
+// conditionally XOR the matrix constant 0x9908B0DF when v is odd
+inline unsigned long MTRand_int32::twiddle(unsigned long u, unsigned long v) {
+	return (((u & 0x80000000UL) | (v & 0x7FFFFFFFUL)) >> 1)
+		^ ((v & 1UL) * 0x9908B0DFUL);
+	// 2013-07-22: line above modified for performance according to http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/Ierymenko.html
+	// thanks Vitaliy FEOKTISTOV for pointing this out
+}
+
+inline unsigned long MTRand_int32::rand_int32() { // generate 32 bit random int
+	if (p == n) gen_state(); // new state vector needed
+							 // gen_state() is split off to be non-inline, because it is only called once
+							 // in every 624 calls and otherwise irand() would become too big to get inlined
+	unsigned long x = state[p++];
+	// MT19937 tempering transform: improves equidistribution of the raw word
+	x ^= (x >> 11);
+	x ^= (x << 7) & 0x9D2C5680UL;
+	x ^= (x << 15) & 0xEFC60000UL;
+	return x ^ (x >> 18);
+}
+
+// generates double floating point numbers in the half-open interval [0, 1)
+// generates double floating point numbers in the half-open interval [0, 1)
+// (raw 32-bit word scaled by 1/2^32; the scaling constant must stay exact)
+class MTRand : public MTRand_int32 {
+public:
+	MTRand() : MTRand_int32() {}
+	MTRand(unsigned long seed) : MTRand_int32(seed) {}
+	MTRand(const unsigned long* seed, int size) : MTRand_int32(seed, size) {}
+	~MTRand() {}
+	double operator()() {
+		return static_cast<double>(rand_int32()) * (1. / 4294967296.);
+	} // divided by 2^32
+private:
+	MTRand(const MTRand&); // copy constructor not defined
+	void operator=(const MTRand&); // assignment operator not defined
+};
+
+// generates double floating point numbers in the closed interval [0, 1]
+// generates double floating point numbers in the closed interval [0, 1]
+// (raw 32-bit word scaled by 1/(2^32 - 1), so both endpoints are reachable)
+class MTRand_closed : public MTRand_int32 {
+public:
+	MTRand_closed() : MTRand_int32() {}
+	MTRand_closed(unsigned long seed) : MTRand_int32(seed) {}
+	MTRand_closed(const unsigned long* seed, int size) : MTRand_int32(seed, size) {}
+	~MTRand_closed() {}
+	double operator()() {
+		return static_cast<double>(rand_int32()) * (1. / 4294967295.);
+	} // divided by 2^32 - 1
+private:
+	MTRand_closed(const MTRand_closed&); // copy constructor not defined
+	void operator=(const MTRand_closed&); // assignment operator not defined
+};
+
+// generates double floating point numbers in the open interval (0, 1)
+// generates double floating point numbers in the open interval (0, 1)
+// (the +0.5 offset before scaling excludes both endpoints)
+class MTRand_open : public MTRand_int32 {
+public:
+	MTRand_open() : MTRand_int32() {}
+	MTRand_open(unsigned long seed) : MTRand_int32(seed) {}
+	MTRand_open(const unsigned long* seed, int size) : MTRand_int32(seed, size) {}
+	~MTRand_open() {}
+	double operator()() {
+		return (static_cast<double>(rand_int32()) + .5) * (1. / 4294967296.);
+	} // divided by 2^32
+private:
+	MTRand_open(const MTRand_open&); // copy constructor not defined
+	void operator=(const MTRand_open&); // assignment operator not defined
+};
+
+// generates 53 bit resolution doubles in the half-open interval [0, 1)
+// generates 53 bit resolution doubles in the half-open interval [0, 1)
+// (two raw draws combined: 27 high bits * 2^26 + 26 low bits, over 2^53)
+class MTRand53 : public MTRand_int32 {
+public:
+	MTRand53() : MTRand_int32() {}
+	MTRand53(unsigned long seed) : MTRand_int32(seed) {}
+	MTRand53(const unsigned long* seed, int size) : MTRand_int32(seed, size) {}
+	~MTRand53() {}
+	double operator()() {
+		return (static_cast<double>(rand_int32() >> 5) * 67108864. +
+			static_cast<double>(rand_int32() >> 6)) * (1. / 9007199254740992.);
+	}
+private:
+	MTRand53(const MTRand53&); // copy constructor not defined
+	void operator=(const MTRand53&); // assignment operator not defined
+};
+
+
+// Gauss Random-number generator //
+// Gauss Random-number generator //
+// Produces N(mean, variance) samples via the Marsaglia polar method
+// (see mtrand.cpp), driven by either a simple Lehmer stream or the
+// Mersenne Twister.
+namespace nse {
+	class GaussRand
+	{
+	public:
+
+		// define distribution parameters and (re)seed both uniform streams
+		void set(const double mean, const double variance,
+			const long int seed);
+
+		double s_rand();	// use simple uniform distribution in [0,1]
+		double mt_rand();	// use Mersenne Twister uniform distribution in [0,1]
+
+		GaussRand(const GaussRand& grand);
+		GaussRand();
+		~GaussRand();
+
+	private:
+		double uni_rand();		// simple uniform distribution in [0,1]
+
+
+		double mean, variance;	// distribution parameters
+		long int seed;			// state of the Lehmer stream
+
+		// NOTE: MTRand state is static/shared across all instances
+		MTRand_closed mt;	// Mersenne Twister uniform distribution in [0,1]
+	};
+}
+
+
+#endif // MTRAND_H
diff --git a/nse-alloc.h b/nse-alloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..a392f0b7766402f7365ae495ccc514e004171627
--- /dev/null
+++ b/nse-alloc.h
@@ -0,0 +1,657 @@
+#pragma once
+
+// [nse-alloc.h]: memory allocation
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include <new>
+#include <omp.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef EXCLUDE_GPU_BRANCH
+#include "cuda-alloc.cuh"
+#endif
+
+
+// * allocation & memory copy calls * //
+namespace nse
+{
+	// memory space selector: host (CPU) or device (GPU) allocation
+	enum memType { memCPU = 0, memGPU = 1 };
+
+	// raw (untyped) allocation of n bytes; returns false on failure
+	template< memType mem = memCPU >
+	bool allocate_void(void** x, const int n);
+	template< memType mem = memCPU >
+	void deallocate_void(void* x);
+
+	// typed allocation of n elements; returns false on failure
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, const int n);
+
+	// multi-array overloads: allocate all arrays or none — on failure every
+	// array allocated so far is released before returning false
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, T** y, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, T** y, const int nx, const int ny);
+	
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, T** y, T** z, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, T** y, T** z, const int nx, const int ny, const int nz);
+
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, T** y, T** z, T** p, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, T** y, T** z, T** p, T** q, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, T** y, T** z, 
+		T** p, T** q, T** s, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, T** y, T** z,
+		T** p, T** q, T** s, T** u, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, T** y, T** z,
+		T** p, T** q, T** s, T** u, T** v, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, T** y, T** z,
+		T** p, T** q, T** s, T** u, T** v, T** w, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, T** y, T** z, T** p, T** q, T** s, 
+		T** u, T** v, T** w, T** a, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, T** y, T** z, T** p, T** q, T** s,
+		T** u, T** v, T** w, T** a, T** b, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate(T** x, T** y, T** z, T** p, T** q, T** s, 
+		T** u, T** v, T** w, T** a, T** b, T** c, const int n);
+
+	// release overloads (1..12 arrays)
+	template< memType mem = memCPU, typename T >
+	void deallocate(T* x);
+	template< memType mem = memCPU, typename T >
+	void deallocate(T* x, T* y);
+	template< memType mem = memCPU, typename T >
+	void deallocate(T* x, T* y, T* z);
+	template< memType mem = memCPU, typename T >
+	void deallocate(T* x, T* y, T* z, T* p);
+	template< memType mem = memCPU, typename T >
+	void deallocate(T* x, T* y, T* z, T* p, T* q);
+	template< memType mem = memCPU, typename T >
+	void deallocate(T* x, T* y, T* z, 
+		T* p, T* q, T* s);
+	template< memType mem = memCPU, typename T >
+	void deallocate(T* x, T* y, T* z, 
+		T* p, T* q, T* s, T* u);
+	template< memType mem = memCPU, typename T >
+	void deallocate(T* x, T* y, T* z,
+		T* p, T* q, T* s, T* u, T* v);
+	template< memType mem = memCPU, typename T >
+	void deallocate(T* x, T* y, T* z,
+		T* p, T* q, T* s, T* u, T* v, T* w);
+	template< memType mem = memCPU, typename T >
+	void deallocate(T* x, T* y, T* z, T* p, T* q, T* s, 
+		T* u, T* v, T* w, T* a);
+	template< memType mem = memCPU, typename T >
+	void deallocate(T* x, T* y, T* z, T* p, T* q, T* s,
+		T* u, T* v, T* w, T* a, T* b);
+	template< memType mem = memCPU, typename T >
+	void deallocate(T* x, T* y, T* z, T* p, T* q, T* s,
+		T* u, T* v, T* w, T* a, T* b, T* c);
+
+	// copy n elements, dispatching on source/destination memory spaces
+	template< memType memDEST = memCPU, memType memSRC = memCPU, typename T >
+	void mcopy(T* _RESTRICT dest, const T* _RESTRICT const src, const int n);
+	// worksharing copy; must be called from inside an OpenMP parallel region
+	template< typename T >
+	void mcopy_omp(T* _RESTRICT dest, const T* _RESTRICT const src, const int n);
+
+	// masked copy: dest[i] = src[i] only where mask[i] == check
+	template< typename T, typename CType >
+	void mcopy_ifeq(T* _RESTRICT dest, const T* _RESTRICT const src,
+		const CType* _RESTRICT mask, const CType check, const int n);
+
+	// * resize * //
+	template< typename T >
+	bool reallocate(T** x, const int oldsize, const int newsize);
+	template< typename T >
+	bool reallocate(T** x, const int usedsize, const int oldsize, const int newsize);
+
+	// * swap  * //
+	template< typename T > 
+	void swap_vars(T& a, T& b);
+}
+// ----------------------------------------------------------------------------------------- //
+
+// ----------------------------------------------------------------------------------------- //
+// * Implementation *
+// ----------------------------------------------------------------------------------------- //
+// allocate n raw bytes in the requested memory space; false on failure.
+// The CPU path honors the optional ALIGN_ALLOCATION byte alignment via the
+// compiler-specific aligned allocator (_mm_malloc / _aligned_malloc /
+// posix_memalign).
+template< nse::memType mem >
+inline bool nse::allocate_void(
+	void** x, const int n)
+{
+	// CPU-allocation //
+	if (mem == memCPU) {
+#ifndef ALIGN_ALLOCATION
+		(*x) = malloc(n);
+		if (*x == NULL) return false;
+#else
+#if defined(__INTEL_COMPILER)
+		(*x) = _mm_malloc(n, ALIGN_ALLOCATION);
+		if (*x == NULL) return false;
+#elif defined(_MSC_VER)
+		(*x) = _aligned_malloc(n, ALIGN_ALLOCATION);
+		if (*x == NULL) return false;
+#else // die or use posix //
+		if (posix_memalign(x, ALIGN_ALLOCATION, n) != 0)
+			return false;
+#endif
+#endif
+	}
+
+#ifndef EXCLUDE_GPU_BRANCH
+	// GPU-allocation //
+	if (mem == memGPU) {
+		if (!nse_gpu::allocate_void(x, n)) return false;
+	}
+#endif
+
+	return true;
+}
+
+// allocate n elements of T in the requested memory space; false on failure.
+// The plain new[] path now uses the nothrow form so that allocation failure
+// reports through the bool return — matching every other branch of this
+// function — instead of throwing std::bad_alloc past callers that only
+// check the return value.
+template< nse::memType mem, typename T >
+inline bool nse::allocate(
+	T** x, const int n)
+{
+	// CPU-allocation //
+	if (mem == memCPU) {
+#ifndef ALIGN_ALLOCATION
+		(*x) = new (std::nothrow) T[n];
+		if (*x == NULL) return false;
+#else
+#if defined(__INTEL_COMPILER)
+		(*x) = (T*)_mm_malloc(
+			n * sizeof(T), ALIGN_ALLOCATION);
+		if (*x == NULL) return false;
+#elif defined(_MSC_VER)
+		(*x) = (T*)_aligned_malloc(
+			n * sizeof(T), ALIGN_ALLOCATION);
+		if (*x == NULL) return false;
+#else // die or use posix //
+		if (posix_memalign((void**)x, ALIGN_ALLOCATION, n * sizeof(T)) != 0)
+			return false;
+#endif
+#endif
+	}
+
+#ifndef EXCLUDE_GPU_BRANCH
+	// GPU-allocation //
+	if (mem == memGPU) {
+		if (!nse_gpu::allocate(x, n)) return false;
+	}
+#endif
+
+	return true;
+}
+
+// allocate two arrays of n elements; all-or-nothing (frees x if y fails)
+template< nse::memType mem, typename T >
+inline bool nse::allocate(
+	T** x, T** y, const int n)
+{
+	if (!allocate<mem>(x, n)) return false;
+	if (allocate<mem>(y, n)) return true;
+
+	deallocate<mem>(*x);
+	return false;
+}
+
+// allocate two arrays of independent sizes; all-or-nothing
+template< nse::memType mem, typename T >
+inline bool nse::allocate(
+	T** x, T** y, const int nx, const int ny)
+{
+	if (!allocate<mem>(x, nx)) return false;
+	if (allocate<mem>(y, ny)) return true;
+
+	deallocate<mem>(*x);
+	return false;
+}
+
+// allocate three arrays of n elements; all-or-nothing
+template< nse::memType mem, typename T >
+inline bool nse::allocate(
+	T** x, T** y, T** z, const int n)
+{
+	if (!allocate<mem>(x, y, n)) return false;
+	if (allocate<mem>(z, n)) return true;
+
+	deallocate<mem>(*x, *y);
+	return false;
+}
+
+// allocate three arrays of independent sizes; all-or-nothing
+template< nse::memType mem, typename T >
+inline bool nse::allocate(
+	T** x, T** y, T** z, const int nx, const int ny, const int nz)
+{
+	if (!allocate<mem>(x, y, nx, ny)) return false;
+	if (allocate<mem>(z, nz)) return true;
+
+	deallocate<mem>(*x, *y);
+	return false;
+}
+
+// allocate four arrays of n elements; all-or-nothing
+template< nse::memType mem, typename T >
+inline bool nse::allocate(
+	T** x, T** y, T** z, T** p, const int n)
+{
+	if (!allocate<mem>(x, y, z, n)) return false;
+	if (allocate<mem>(p, n)) return true;
+
+	deallocate<mem>(*x, *y, *z);
+	return false;
+}
+
+// allocate five arrays of n elements; all-or-nothing
+template< nse::memType mem, typename T >
+inline bool nse::allocate(
+	T** x, T** y, T** z, T** p, T** q, const int n)
+{
+	if (!allocate<mem>(x, y, z, n)) return false;
+	if (allocate<mem>(p, q, n)) return true;
+
+	deallocate<mem>(*x, *y, *z);
+	return false;
+}
+
+// allocate six arrays of n elements; all-or-nothing
+template< nse::memType mem, typename T >
+inline bool nse::allocate(T** x, T** y, T** z,
+	T** p, T** q, T** s, const int n)
+{
+	if (allocate<mem>(x, y, z, n)) {
+		if (allocate<mem>(p, q, s, n)) return true;
+		deallocate<mem>(*x, *y, *z);
+	}
+	return false;
+}
+
+// allocate seven arrays of n elements; all-or-nothing
+template< nse::memType mem, typename T >
+inline bool nse::allocate(T** x, T** y, T** z,
+	T** p, T** q, T** s, T** u, const int n)
+{
+	if (allocate<mem>(x, y, z, p, q, s, n)) {
+		if (allocate<mem>(u, n)) return true;
+		deallocate<mem>(*x, *y, *z, *p, *q, *s);
+	}
+	return false;
+}
+
+// allocate eight arrays of n elements; all-or-nothing
+template< nse::memType mem, typename T >
+inline bool nse::allocate(T** x, T** y, T** z,
+	T** p, T** q, T** s, T** u, T** v, const int n)
+{
+	if (allocate<mem>(x, y, z, p, q, s, n)) {
+		if (allocate<mem>(u, v, n)) return true;
+		deallocate<mem>(*x, *y, *z, *p, *q, *s);
+	}
+	return false;
+}
+
+// allocate nine arrays of n elements; all-or-nothing
+template< nse::memType mem, typename T >
+inline bool nse::allocate(T** x, T** y, T** z,
+	T** p, T** q, T** s, T** u, T** v, T** w, const int n)
+{
+	if (allocate<mem>(x, y, z, p, q, s, n)) {
+		if (allocate<mem>(u, v, w, n)) return true;
+		deallocate<mem>(*x, *y, *z, *p, *q, *s);
+	}
+	return false;
+}
+
+// allocate ten arrays of n elements; all-or-nothing
+template< nse::memType mem, typename T >
+inline bool nse::allocate(
+	T** x, T** y, T** z, T** p, T** q, T** s, 
+	T** u, T** v, T** w, T** a, const int n)
+{
+	if (allocate<mem>(x, y, z, p, q, s, u, v, w, n)) {
+		if (allocate<mem>(a, n)) return true;
+		deallocate<mem>(*x, *y, *z, *p, *q, *s, *u, *v, *w);
+	}
+	return false;
+}
+
+// allocate eleven arrays of n elements; all-or-nothing
+template< nse::memType mem, typename T >
+inline bool nse::allocate(
+	T** x, T** y, T** z, T** p, T** q, T** s,
+	T** u, T** v, T** w, T** a, T** b, const int n)
+{
+	if (allocate<mem>(x, y, z, p, q, s, u, v, w, n)) {
+		if (allocate<mem>(a, b, n)) return true;
+		deallocate<mem>(*x, *y, *z, *p, *q, *s, *u, *v, *w);
+	}
+	return false;
+}
+
+// allocate twelve arrays of n elements; all-or-nothing
+template< nse::memType mem, typename T >
+inline bool nse::allocate(
+	T** x, T** y, T** z, T** p, T** q, T** s,
+	T** u, T** v, T** w, T** a, T** b, T** c, const int n)
+{
+	if (allocate<mem>(x, y, z, p, q, s, u, v, w, n)) {
+		if (allocate<mem>(a, b, c, n)) return true;
+		deallocate<mem>(*x, *y, *z, *p, *q, *s, *u, *v, *w);
+	}
+	return false;
+}
+
+// release a raw buffer obtained from allocate_void(); the CPU branch must
+// mirror the allocator actually used there (aligned vs plain malloc)
+template< nse::memType mem >
+inline void nse::deallocate_void(
+	void* x)
+{
+	// CPU-deallocation //
+	if (mem == memCPU) {
+#ifndef ALIGN_ALLOCATION
+		free(x);
+#else
+#if defined(__INTEL_COMPILER)
+		_mm_free(x);
+#elif defined(_MSC_VER)
+		_aligned_free(x);
+#else	// die or use posix //
+		free(x);
+#endif
+#endif
+	}
+
+#ifndef EXCLUDE_GPU_BRANCH
+	// GPU-deallocation //
+	if (mem == memGPU) {
+		nse_gpu::deallocate_void(x);
+	}
+#endif
+}
+
+// release a typed array obtained from allocate(); the CPU branch must mirror
+// the allocator used there: delete[] pairs with new[], the free-family calls
+// pair with the aligned allocators (which run no destructors — as allocate()
+// ran no constructors on those paths)
+template< nse::memType mem, typename T >
+inline void nse::deallocate(
+	T* x)
+{
+	// CPU-deallocation //
+	if (mem == memCPU) {
+#ifndef ALIGN_ALLOCATION
+		delete[] x;
+#else
+#if defined(__INTEL_COMPILER)
+		_mm_free((void*)x);
+#elif defined(_MSC_VER)
+		_aligned_free((void*)x);
+#else	// die or use posix //
+		free((void*)x);
+#endif
+#endif
+	}
+
+#ifndef EXCLUDE_GPU_BRANCH
+	// GPU-deallocation //
+	if (mem == memGPU) {
+		nse_gpu::deallocate(x);
+	}
+#endif
+}
+
+// release two arrays (the two frees are independent)
+template< nse::memType mem, typename T >
+inline void nse::deallocate(
+	T* x, T* y)
+{
+	deallocate<mem>(y);
+	deallocate<mem>(x);
+}
+
+// release three arrays
+template< nse::memType mem, typename T >
+inline void nse::deallocate(
+	T* x, T* y, T* z)
+{
+	deallocate<mem>(x, y);
+	deallocate<mem>(z);
+}
+
+// release four arrays
+template< nse::memType mem, typename T >
+inline void nse::deallocate(
+	T* x, T* y, T* z, T* p)
+{
+	deallocate<mem>(x, y);
+	deallocate<mem>(z, p);
+}
+
+// release five arrays
+template< nse::memType mem, typename T >
+inline void nse::deallocate(
+	T* x, T* y, T* z, T* p, T* q)
+{
+	deallocate<mem>(x, y, z);
+	deallocate<mem>(p, q);
+}
+
+// release six arrays
+template< nse::memType mem, typename T >
+inline void nse::deallocate(
+	T* x, T* y, T* z, T* p, T* q, T* s)
+{
+	deallocate<mem>(x, y);
+	deallocate<mem>(z, p);
+	deallocate<mem>(q, s);
+}
+
+// release seven arrays
+template< nse::memType mem, typename T >
+inline void nse::deallocate(
+	T* x, T* y, T* z, 
+	T* p, T* q, T* s, T* u)
+{
+	deallocate<mem>(x, y, z);
+	deallocate<mem>(p, q, s, u);
+}
+
+// release eight arrays
+template< nse::memType mem, typename T >
+inline void nse::deallocate(
+	T* x, T* y, T* z,
+	T* p, T* q, T* s, T* u, T* v)
+{
+	deallocate<mem>(x, y, z, p);
+	deallocate<mem>(q, s, u, v);
+}
+
+// release nine arrays
+template< nse::memType mem, typename T >
+inline void nse::deallocate(
+	T* x, T* y, T* z,
+	T* p, T* q, T* s, T* u, T* v, T* w)
+{
+	deallocate<mem>(x, y, z, p, q);
+	deallocate<mem>(s, u, v, w);
+}
+
+// release ten arrays
+template< nse::memType mem, typename T >
+inline void nse::deallocate(
+	T* x, T* y, T* z, T* p, T* q, T* s, 
+	T* u, T* v, T* w, T* a)
+{
+	deallocate<mem>(x, y, z, p, q);
+	deallocate<mem>(s, u, v, w, a);
+}
+
+// release eleven arrays
+template< nse::memType mem, typename T >
+inline void nse::deallocate(
+	T* x, T* y, T* z, T* p, T* q, T* s,
+	T* u, T* v, T* w, T* a, T* b)
+{
+	deallocate<mem>(x, y, z, p, q, s);
+	deallocate<mem>(u, v, w, a, b);
+}
+
+// release twelve arrays
+template< nse::memType mem, typename T >
+inline void nse::deallocate(
+	T* x, T* y, T* z, T* p, T* q, T* s,
+	T* u, T* v, T* w, T* a, T* b, T* c)
+{
+	deallocate<mem>(x, y, z, p, q, s);
+	deallocate<mem>(u, v, w, a, b, c);
+}
+
+// copy n elements of src into dest, dispatching on the compile-time
+// source/destination memory spaces
+template< nse::memType memDEST, nse::memType memSRC, typename T >
+inline void nse::mcopy(T* _RESTRICT dest, const T* _RESTRICT const src, const int n)
+{
+	if ((memDEST == memCPU) && (memSRC == memCPU)) {
+		if (omp_in_parallel()) {
+			// already inside a parallel region: share the copy across the team
+			mcopy_omp(dest, src, n);
+		}
+		else
+		{
+			if (n > MIN_MEMCPY_BLOCK) memcpy(dest, src, n * sizeof(T));
+			else
+			{
+				// NOTE(review): this branch spawns a parallel region for the
+				// *smaller* sizes (n <= MIN_MEMCPY_BLOCK) and uses plain
+				// memcpy for the larger ones — confirm the threshold
+				// direction is intended
+#pragma omp parallel shared(dest)
+				{
+					mcopy_omp(dest, src, n);
+				}
+			}
+		}
+	}
+
+#ifndef EXCLUDE_GPU_BRANCH
+	// host<->device and device<->device transfers go through the CUDA helpers
+	if ((memDEST == memGPU) && (memSRC == memCPU)) nse_gpu::copy_HostToDev(dest, src, n);
+	if ((memDEST == memCPU) && (memSRC == memGPU)) nse_gpu::copy_DevToHost(dest, src, n);
+	if ((memDEST == memGPU) && (memSRC == memGPU)) nse_gpu::copy_DevToDev(dest, src, n);
+#endif
+}
+
+// worksharing element copy; must be invoked from inside an OpenMP parallel
+// region. The 4-wide main loop is split across the team ("omp for"); one
+// thread handles the remainder ("omp single"). Both constructs are nowait,
+// so callers must synchronize before reading dest.
+template< typename T >
+inline void nse::mcopy_omp(T* _RESTRICT dest, const T* _RESTRICT const src, const int n)
+{
+	int i;
+
+#pragma omp for nowait
+	for (i = 0; i < n - (n % 4); i += 4) {
+		dest[i] = src[i];
+		dest[i + 1] = src[i + 1];
+		dest[i + 2] = src[i + 2];
+		dest[i + 3] = src[i + 3];
+	}
+
+	// tail elements (at most 3), executed by a single thread
+#pragma omp single nowait
+	for (i = n - (n % 4); i < n; i++)
+		dest[i] = src[i];
+}
+
+// masked worksharing copy: dest[i] = src[i] only where mask[i] == check.
+// Same structure and calling contract as mcopy_omp: must run inside an
+// OpenMP parallel region; both constructs are nowait, so callers synchronize.
+template< typename T, typename CType >
+inline void nse::mcopy_ifeq(T* _RESTRICT dest, const T* _RESTRICT const src,
+	const CType* _RESTRICT mask, const CType check, const int n)
+{
+	int i;
+
+#pragma omp for nowait
+	for (i = 0; i < n - (n % 4); i += 4) {
+		if (mask[i] == check) dest[i] = src[i];
+		if (mask[i + 1] == check) dest[i + 1] = src[i + 1];
+		if (mask[i + 2] == check) dest[i + 2] = src[i + 2];
+		if (mask[i + 3] == check) dest[i + 3] = src[i + 3];
+	}
+
+	// tail elements (at most 3), executed by a single thread
+#pragma omp single nowait
+	for (i = n - (n % 4); i < n; i++)
+		if (mask[i] == check) dest[i] = src[i];
+}
+
+// * resize * //
+// * resize * //
+// grow or shrink (*x) from oldsize to newsize elements, preserving
+// min(oldsize, newsize) leading elements; (*x) is untouched on failure
+template< typename T >
+inline bool nse::reallocate(
+	T** x, const int oldsize, const int newsize)
+{
+	T *buf;
+	if (!allocate(&buf, newsize)) return false;
+
+	const int ncopy = (oldsize < newsize) ? oldsize : newsize;
+	mcopy(buf, *x, ncopy);
+
+	if (oldsize > 0) deallocate(*x);
+	(*x) = buf;
+	return true;
+}
+
+// resize (*x) to newsize elements, copying only the first usedsize
+// (capped at newsize) elements; (*x) is untouched on failure
+template< typename T >
+inline bool nse::reallocate(
+	T** x, const int usedsize, const int oldsize, const int newsize)
+{
+	T *buf;
+	if (!allocate(&buf, newsize)) return false;
+
+	const int ncopy = (usedsize < newsize) ? usedsize : newsize;
+	mcopy(buf, *x, ncopy);
+
+	if (oldsize > 0) deallocate(*x);
+	(*x) = buf;
+	return true;
+}
+
+// * swap  * //
+// * swap  * //
+// classic three-assignment exchange of two values
+template< typename T > 
+inline void nse::swap_vars(T& a, T& b)
+{
+	const T tmp = a;
+	a = b;
+	b = tmp;
+}
+// ----------------------------------------------------------------------------------------- //
diff --git a/nse-avg-vec.h b/nse-avg-vec.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f7a84c06e6b7599113eb9faf8b5412bc9554eba
--- /dev/null
+++ b/nse-avg-vec.h
@@ -0,0 +1,271 @@
+#pragma once
+
+// [nse-avg-vec.h]: nse average arrays container
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-defines.h"
+#include "grid3d.h"
+#include "vecmath.h"
+
+
// Plain container of time/plane-averaged statistics arrays for the NSE solver:
// z-profiles of velocity/pressure (and, with STRATIFICATION, temperature)
// moments, fluxes and dissipation terms, plus optional (y,z)-plane arrays
// (COMPUTE_XT_AVERAGES). Bracketed tags in the comments denote the staggered
// node each array lives on (C = cell center, U/V/W = face nodes, etc.).
// Arrays are raw pointers: valid only between init() and clear().
template< typename T >
struct nseAvgVec {

	T *U, *V, *W, *P;					// velocity-pressure averages: [C, C, W, C]

	T *U2_u, *V2_v, *W2_w;				// Ui^2: [C, C, W]
	T *U2_uw, *V2_vw, *W2_c;			// Ui^2: [W, W, C]
	T *W2_u, *W2_v, *W2_uw, *W2_vw;		// W^2: [C, C, W, W]

	T *UV, *UW, *VW;					// Ui*Uj: [C, W, W]
	T *UV_uvw, *UW_uvw, *VW_uvw;		// Ui*Uj: [W, W, W]
	T *PU, *PV, *PW;					// P*Ui: [C, C, W]

	T *UW_bottom, *UW_top;				// special approximations to U*W: [C (W -- C), C (W -- C)]
	T *VW_bottom, *VW_top;				// special approximations to V*W: [C (W -- C), C (W -- C)]
	T *UW_bottom_uv, *UW_top_uv;		// special approximations to U*W: [C (W -- C), C (W -- C)]
	T *VW_bottom_uv, *VW_top_uv;		// special approximations to V*W: [C (W -- C), C (W -- C)]
	T *UW_bottom_uw, *UW_top_uw;		// special approximations to U*W: [W (C -- W), W (C -- W)]
	T *VW_bottom_vw, *VW_top_vw;		// special approximations to V*W: [W (C -- W), W (C -- W)]
	T *UW_adv, *VW_adv;					// W*dU/dz, W*dV/dz: [W, W]

	T *U2W, *V2W, *W2W;					// Ui*Ui*W: [W, W, C]
	T *UVW, *UWW, *VWW;					// Ui*Uj*W: [W, C, C]

	T *U_diss, *V_diss, *W_diss;					// dissipation: [C, C, W]
	T *UV_diss, *UW_diss, *VW_diss;					// Ui*Uj dissipation: [C, W, W]

	T *U_iso_diss, *V_iso_diss, *W_iso_diss;		// isotropic dissipation: [C, C, W]
	T *UV_iso_diss, *UW_iso_diss, *VW_iso_diss;		// Ui*Uj iso-dissipation: [C, W, W]

	T *PSuu, *PSvv, *PSww;			// P*dUi/dxi = P*Sii: [C, C, C]
	T *P2Suv, *P2Suw, *P2Svw;		// P*2*Sij: [C, W, W]

#ifdef COMPUTE_XT_AVERAGES
	T *Uyz, *Vyz, *Wyz;				// large-scale motions: [C, V, W]
	T *U2yz, *V2yz, *W2yz;			// (y,z) squares: [C, V, W]
	T *UVyz, *UWyz, *VWyz;			// products(2nd order): [V, W, VW]
#endif


#ifdef STRATIFICATION
	T *Tc;							// temperature-average: [C]
	T *Tsh;							// average (with removed linear profile): [C]

	T *T2_c, *T2_w;					// T*T: [C, W]

	T *TU, *TV, *TW,				// T*Ui: [C, C, W]
		*TU_uw, *TV_vw,				// T*U, T*V: [W, W]
		*TW_uw, *TW_vw;				// T*W: [W, W]
	T *TP;							// T*P: [C]

	T *TW_bottom, *TW_top,			// special approximations to T*W: [C (W -- C), C (W -- C)]
		*TW_bottom_u, *TW_top_u,	// special approximations to T*W: [C (W -- C), C (W -- C)]
		*TW_bottom_v, *TW_top_v,	// special approximations to T*W: [C (W -- C), C (W -- C)]
		*TW_bottom_w, *TW_top_w;	// special approximations to T*W: [W (C -- W), W (C -- W)]
	T *TW_adv;						// W*dT/dz: [W]

	T *T2W;							// T*T*W: [W]
	T *TUW, *TVW, *TWW;				// T*Ui*W: [W, W, C]

	T *T_diss;							// T dissipation: [C]	
	T *TU_diss, *TV_diss, *TW_diss;		// T*Ui dissipation: [C, C, W]

	T *T_iso_diss;						// isotropic dissipation: [C]	

	T *T_dPdx, *T_dPdy, *T_dPdz;	// T*grad(P): [C, C, W]

#ifdef COMPUTE_XT_AVERAGES
	T *Tyz;								// large-scale motions: [C]
	T *T2yz;							// square: [C]
	T *TWyz;							// products: [W]
#endif
#endif


	bool status;		// allocation status, default: [false]


	// allocate all arrays (zero-filled) sized for the given local grid
	template< typename GType >
	void init(const nse::Grid3d< GType >& grid);
	// release all arrays (no-op unless init() has been called)
	void clear();

	nseAvgVec();
	~nseAvgVec();
};
+// -------------------------------------------------------------------------------------------- //
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+nseAvgVec< T >::nseAvgVec() : status(false) {}
+template< typename T >
+nseAvgVec< T >::~nseAvgVec() { clear(); }
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+template< typename GType >
+void nseAvgVec< T >::init(const nse::Grid3d< GType >& grid)
+{
+	nse::allocate_vnull(&U, &V, &W, grid.nz);
+	nse::allocate_vnull(&P, grid.nz);
+
+	nse::allocate_vnull(&U2_u, &V2_v, &W2_w, grid.nz);
+	nse::allocate_vnull(&U2_uw, &V2_vw, &W2_c, grid.nz);
+	nse::allocate_vnull(&W2_u, &W2_v, grid.nz);
+	nse::allocate_vnull(&W2_uw, &W2_vw, grid.nz);
+
+	nse::allocate_vnull(&UV, &UW, &VW, grid.nz);
+	nse::allocate_vnull(&UV_uvw, &UW_uvw, &VW_uvw, grid.nz);
+	nse::allocate_vnull(&PU, &PV, &PW, grid.nz);
+	
+	nse::allocate_vnull(&UW_bottom, &UW_top, grid.nz);
+	nse::allocate_vnull(&VW_bottom, &VW_top, grid.nz);
+	nse::allocate_vnull(&UW_bottom_uv, &UW_top_uv, grid.nz);
+	nse::allocate_vnull(&VW_bottom_uv, &VW_top_uv, grid.nz);
+	nse::allocate_vnull(&UW_bottom_uw, &UW_top_uw, grid.nz);
+	nse::allocate_vnull(&VW_bottom_vw, &VW_top_vw, grid.nz);
+	nse::allocate_vnull(&UW_adv, &VW_adv, grid.nz);
+	
+	nse::allocate_vnull(&U2W, &V2W, &W2W, grid.nz);
+	nse::allocate_vnull(&UVW, &UWW, &VWW, grid.nz);
+
+	nse::allocate_vnull(&U_diss, &V_diss, &W_diss, grid.nz);
+	nse::allocate_vnull(&UV_diss, &UW_diss, &VW_diss, grid.nz);
+
+	nse::allocate_vnull(&U_iso_diss, &V_iso_diss, &W_iso_diss, grid.nz);
+	nse::allocate_vnull(&UV_iso_diss, &UW_iso_diss, &VW_iso_diss, grid.nz);
+
+	nse::allocate_vnull(&PSuu, &PSvv, &PSww, grid.nz);
+	nse::allocate_vnull(&P2Suv, &P2Suw, &P2Svw, grid.nz);
+
+#ifdef COMPUTE_XT_AVERAGES
+	nse::allocate_vnull(&Uyz, &Vyz, &Wyz, grid.nyz);
+	nse::allocate_vnull(&U2yz, &V2yz, &W2yz, grid.nyz);
+	nse::allocate_vnull(&UVyz, &UWyz, &VWyz, grid.nyz);
+#endif
+
+
+#ifdef STRATIFICATION
+	nse::allocate_vnull(&Tc, grid.nz);
+	nse::allocate_vnull(&Tsh, grid.nz);
+
+	nse::allocate_vnull(&T2_c, &T2_w, grid.nz);
+
+	nse::allocate_vnull(&TU, &TV, &TW, grid.nz);
+	nse::allocate_vnull(&TU_uw, &TV_vw, grid.nz);
+	nse::allocate_vnull(&TW_uw, &TW_vw, grid.nz);
+	nse::allocate_vnull(&TP, grid.nz);
+
+	nse::allocate_vnull(&TW_bottom, &TW_top, grid.nz);
+	nse::allocate_vnull(
+		&TW_bottom_u, &TW_top_u, 
+		&TW_bottom_v, &TW_top_v, 
+		&TW_bottom_w, &TW_top_w, grid.nz);
+	nse::allocate_vnull(&TW_adv, grid.nz);
+
+	nse::allocate_vnull(&T2W, grid.nz);
+	nse::allocate_vnull(&TUW, &TVW, &TWW, grid.nz);
+
+	nse::allocate_vnull(&T_diss, grid.nz); 
+	nse::allocate_vnull(&TU_diss, &TV_diss, &TW_diss, grid.nz);
+
+	nse::allocate_vnull(&T_iso_diss, grid.nz);
+
+	nse::allocate_vnull(&T_dPdx, &T_dPdy, &T_dPdz, grid.nz);
+
+#ifdef COMPUTE_XT_AVERAGES
+	nse::allocate_vnull(&Tyz, grid.nyz);
+	nse::allocate_vnull(&T2yz, grid.nyz);
+	nse::allocate_vnull(&TWyz, grid.nyz);
+#endif
+#endif
+
+	status = true;
+}
+// -------------------------------------------------------------------------------------------- //
+
// Release every array allocated by init(); the deallocation list mirrors the
// allocation list in init() one-for-one (same conditional compilation blocks).
// No-op when status == false, so it is safe to call on a never-initialized
// or already-cleared object.
template< typename T >
void nseAvgVec< T >::clear()
{
	if (status) {
		nse::deallocate(U, V, W);
		nse::deallocate(P);

		nse::deallocate(U2_u, V2_v, W2_w);
		nse::deallocate(U2_uw, V2_vw, W2_c);
		nse::deallocate(W2_u, W2_v);
		nse::deallocate(W2_uw, W2_vw);

		nse::deallocate(UV, UW, VW);
		nse::deallocate(UV_uvw, UW_uvw, VW_uvw);
		nse::deallocate(PU, PV, PW);

		nse::deallocate(UW_bottom, UW_top);
		nse::deallocate(VW_bottom, VW_top);
		nse::deallocate(UW_bottom_uv, UW_top_uv);
		nse::deallocate(VW_bottom_uv, VW_top_uv);
		nse::deallocate(UW_bottom_uw, UW_top_uw);
		nse::deallocate(VW_bottom_vw, VW_top_vw);
		nse::deallocate(UW_adv, VW_adv);

		nse::deallocate(U2W, V2W, W2W);
		nse::deallocate(UVW, UWW, VWW);

		nse::deallocate(U_diss, V_diss, W_diss);
		nse::deallocate(UV_diss, UW_diss, VW_diss);

		nse::deallocate(U_iso_diss, V_iso_diss, W_iso_diss);
		nse::deallocate(UV_iso_diss, UW_iso_diss, VW_iso_diss);

		nse::deallocate(PSuu, PSvv, PSww);
		nse::deallocate(P2Suv, P2Suw, P2Svw);

#ifdef COMPUTE_XT_AVERAGES
		nse::deallocate(Uyz, Vyz, Wyz);
		nse::deallocate(U2yz, V2yz, W2yz);
		nse::deallocate(UVyz, UWyz, VWyz);
#endif


#ifdef STRATIFICATION
		nse::deallocate(Tc);
		nse::deallocate(Tsh);

		nse::deallocate(T2_c, T2_w);

		nse::deallocate(TU, TV, TW);
		nse::deallocate(TU_uw, TV_vw);
		nse::deallocate(TW_uw, TW_vw);
		nse::deallocate(TP);

		nse::deallocate(TW_bottom, TW_top);
		nse::deallocate(
			TW_bottom_u, TW_top_u,
			TW_bottom_v, TW_top_v,
			TW_bottom_w, TW_top_w);
		nse::deallocate(TW_adv);

		nse::deallocate(T2W);
		nse::deallocate(TUW, TVW, TWW);

		nse::deallocate(T_diss);
		nse::deallocate(TU_diss, TV_diss, TW_diss);

		nse::deallocate(T_iso_diss);

		nse::deallocate(T_dPdx, T_dPdy, T_dPdz);

#ifdef COMPUTE_XT_AVERAGES
		nse::deallocate(Tyz);
		nse::deallocate(T2yz);
		nse::deallocate(TWyz);
#endif
#endif


		// arrays are gone: allow a subsequent init() / make clear() idempotent
		status = false;
	}
}
+// ------------------------------------------------------------------------------------------------ //
diff --git a/nse-bc3d.cpp b/nse-bc3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fcf5d68ef27395fba6c53096e7b68b37ec065a2a
--- /dev/null
+++ b/nse-bc3d.cpp
@@ -0,0 +1,1902 @@
+#include "nse-bc3d.h"
+
+using namespace nse::nse_const3d;
+
+// Implementation
+// ------------------------------------------------------------------------ //
+
+
+// Dirichlet Type Boundary Conditions
+// ------------------------------------------------------------------------ //
// Apply a constant Dirichlet boundary condition X = Rhs on one domain side.
// Only MPI ranks that own the requested physical boundary act (checked via
// mpi_com rank/size along the side's axis).
// - if the side's normal velocity node is requested (nodeU on west/east,
//   nodeV on south/north, nodeW on bottom/top), the value is written
//   directly onto the boundary face;
// - otherwise a ghost-cell value is set so that interpolation to the
//   boundary equals Rhs: linear (2*Rhs - interior) by default, quadratic
//   with USE_QUAD_EXTRAPOLATION_IN_BC.
// Unless BLOCK_GHOST_ZONE_EXTRAPOLATION is defined, ghost_extrapolation()
// is then called to fill the remaining ghost zone along this side.
template< typename T >
void nse::dirichlet_bc(T* _RESTRICT X, const T Rhs,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
	const wstGrid3d< T >& grid)
{
	// quadratic-extrapolation weights (used only with USE_QUAD_EXTRAPOLATION_IN_BC)
	const T c1 = (T) 8.0 / (T) 3.0,
		c2 = -(T) 2.0,
		c3 = (T) 1.0 / (T) 3.0;

	// does this rank own the requested physical boundary?
	const int is_west = (side == westSide) && (grid.mpi_com.rank_x == 0);
	const int is_east = (side == eastSide) && (grid.mpi_com.rank_x == grid.mpi_com.size_x - 1);
	const int is_south = (side == southSide) && (grid.mpi_com.rank_y == 0);
	const int is_north = (side == northSide) && (grid.mpi_com.rank_y == grid.mpi_com.size_y - 1);
	const int is_bottom = (side == bottomSide) && (grid.mpi_com.rank_z == 0);
	const int is_top = (side == topSide) && (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1);

	// * west side * //
	if (is_west) {

		int j, k, idx;
		if (node == nodeU)	// staggered node //
		{
			// U face lies on the boundary: set the value directly at i = gcx
#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = grid.gcx * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = Rhs;
				}
			}
		}
		else
		{
			// cell-centered node: write the ghost cell at i = gcx - 1
#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.gcx - 1) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
					X[idx] = c1 * Rhs +
						c2 * X[idx + grid.nyz] + c3 * X[idx + (grid.nyz << 1)];
#else
					X[idx] = (T)2.0 * Rhs - X[idx + grid.nyz];
#endif
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * east side * //
	if (is_east) {

		int j, k, idx;
		if (node == nodeU)	// staggered node //
		{
#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.nx - grid.gcx) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = Rhs;
				}
			}
		}
		else
		{
			// first ghost cell at i = nx - gcx, interior lies at smaller i
#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.nx - grid.gcx) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
					X[idx] = c1 * Rhs +
						c2 * X[idx - grid.nyz] + c3 * X[idx - (grid.nyz << 1)];
#else
					X[idx] = (T)2.0 * Rhs - X[idx - grid.nyz];
#endif
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * south side * //
	if (is_south) {

		int i, k, idx;
		if (node == nodeV)	// staggered node //
		{
#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = Rhs;
				}
			}
		}
		else
		{
#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.gcy - 1) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
					X[idx] = c1 * Rhs +
						c2 * X[idx + grid.nz] + c3 * X[idx + (grid.nz << 1)];
#else
					X[idx] = (T)2.0 * Rhs - X[idx + grid.nz];
#endif
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * north side * //
	if (is_north) {

		int i, k, idx;
		if (node == nodeV)	// staggered node //
		{
#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.ny - grid.gcy) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = Rhs;
				}
			}
		}
		else
		{
#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.ny - grid.gcy) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
					X[idx] = c1 * Rhs +
						c2 * X[idx - grid.nz] + c3 * X[idx - (grid.nz << 1)];
#else
					X[idx] = (T)2.0 * Rhs - X[idx - grid.nz];
#endif
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * bottom side * //
	if (is_bottom) {

		int i, j, idx;
		if (node == nodeW)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + grid.gcz;
					X[idx] = Rhs;
				}
		}
		else
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.gcz - 1);
#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
					X[idx] = c1 * Rhs +
						c2 * X[idx + 1] + c3 * X[idx + 2];
#else
					X[idx] = (T)2.0 * Rhs - X[idx + 1];
#endif
				}
		}
	}
	// --------------------------------------------------------------------- //

	// * top side * //
	if (is_top) {

		int i, j, idx;
		if (node == nodeW)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.nz - grid.gcz);
					X[idx] = Rhs;
				}
		}
		else
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.nz - grid.gcz);
#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
					X[idx] = c1 * Rhs +
						c2 * X[idx - 1] + c3 * X[idx - 2];
#else
					X[idx] = (T)2.0 * Rhs - X[idx - 1];
#endif
				}
		}
	}
	// --------------------------------------------------------------------- //

#ifndef BLOCK_GHOST_ZONE_EXTRAPOLATION
	// NOTE(review): fills the remaining ghost layers (exchange width 1,1,1)
	// along this side — presumably by extrapolation; see ghost_extrapolation
	ghost_extrapolation< T >(X, side, node,
		grid.nx, grid.ny, grid.nz, grid.gcx, grid.gcy, grid.gcz, 1, 1, 1,
		grid.mpi_com);
#endif
}
+// ------------------------------------------------------------------------ //
+
+// ------------------------------------------------------------------------ //
// Apply a spatially varying Dirichlet boundary condition X = Rhs(.) on one
// domain side; same structure as the constant-Rhs overload above, but the
// boundary value is read from a 2D plane array. Rhs is indexed with the full
// local array strides (ghost offsets included):
//   west/east:   Rhs[j * nz + k]   (size ny * nz)
//   south/north: Rhs[i * nz + k]   (size nx * nz)
//   bottom/top:  Rhs[i * ny + j]   (size nx * ny)
// Staggered normal-velocity nodes get the value on the boundary face;
// other nodes get a ghost-cell value via linear (default) or quadratic
// (USE_QUAD_EXTRAPOLATION_IN_BC) extrapolation.
template< typename T >
void nse::dirichlet_bc(T* _RESTRICT X, const T* _RESTRICT Rhs,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
	const wstGrid3d< T >& grid)
{
	// quadratic-extrapolation weights (used only with USE_QUAD_EXTRAPOLATION_IN_BC)
	const T c1 = (T) 8.0 / (T) 3.0,
		c2 = -(T) 2.0,
		c3 = (T) 1.0 / (T) 3.0;

	// does this rank own the requested physical boundary?
	const int is_west = (side == westSide) && (grid.mpi_com.rank_x == 0);
	const int is_east = (side == eastSide) && (grid.mpi_com.rank_x == grid.mpi_com.size_x - 1);
	const int is_south = (side == southSide) && (grid.mpi_com.rank_y == 0);
	const int is_north = (side == northSide) && (grid.mpi_com.rank_y == grid.mpi_com.size_y - 1);
	const int is_bottom = (side == bottomSide) && (grid.mpi_com.rank_z == 0);
	const int is_top = (side == topSide) && (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1);

	// * west side * //
	if (is_west) {

		int j, k, idx;
		if (node == nodeU)	// staggered node //
		{
#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = grid.gcx * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = Rhs[j * grid.nz + k];
				}
			}
		}
		else
		{
			// ghost cell at i = gcx - 1
#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.gcx - 1) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
					X[idx] = c1 * Rhs[j * grid.nz + k] +
						c2 * X[idx + grid.nyz] + c3 * X[idx + (grid.nyz << 1)];
#else
					X[idx] = (T)2.0 * Rhs[j * grid.nz + k] - X[idx + grid.nyz];
#endif
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * east side * //
	if (is_east) {

		int j, k, idx;
		if (node == nodeU)	// staggered node //
		{
#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.nx - grid.gcx) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = Rhs[j * grid.nz + k];
				}
			}
		}
		else
		{
#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.nx - grid.gcx) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
					X[idx] = c1 * Rhs[j * grid.nz + k] +
						c2 * X[idx - grid.nyz] + c3 * X[idx - (grid.nyz << 1)];
#else
					X[idx] = (T)2.0 * Rhs[j * grid.nz + k] - X[idx - grid.nyz];
#endif
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * south side * //
	if (is_south) {

		int i, k, idx;
		if (node == nodeV)	// staggered node //
		{
#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = Rhs[i * grid.nz + k];
				}
			}
		}
		else
		{
#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.gcy - 1) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
					X[idx] = c1 * Rhs[i * grid.nz + k] +
						c2 * X[idx + grid.nz] + c3 * X[idx + (grid.nz << 1)];
#else
					X[idx] = (T)2.0 * Rhs[i * grid.nz + k] - X[idx + grid.nz];
#endif
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * north side * //
	if (is_north) {

		int i, k, idx;
		if (node == nodeV)	// staggered node //
		{
#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.ny - grid.gcy) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = Rhs[i * grid.nz + k];
				}
			}
		}
		else
		{
#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.ny - grid.gcy) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
					X[idx] = c1 * Rhs[i * grid.nz + k] +
						c2 * X[idx - grid.nz] + c3 * X[idx - (grid.nz << 1)];
#else
					X[idx] = (T)2.0 * Rhs[i * grid.nz + k] - X[idx - grid.nz];
#endif
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * bottom side * //
	if (is_bottom) {

		int i, j, idx;
		if (node == nodeW)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + grid.gcz;
					X[idx] = Rhs[i * grid.ny + j];
				}
		}
		else
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.gcz - 1);
#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
					X[idx] = c1 * Rhs[i * grid.ny + j] +
						c2 * X[idx + 1] + c3 * X[idx + 2];
#else
					X[idx] = (T)2.0 * Rhs[i * grid.ny + j] - X[idx + 1];
#endif
				}
		}
	}
	// --------------------------------------------------------------------- //

	// * top side * //
	if (is_top) {

		int i, j, idx;
		if (node == nodeW)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.nz - grid.gcz);
					X[idx] = Rhs[i * grid.ny + j];
				}
		}
		else
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.nz - grid.gcz);
#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
					X[idx] = c1 * Rhs[i * grid.ny + j] +
						c2 * X[idx - 1] + c3 * X[idx - 2];
#else
					X[idx] = (T)2.0 * Rhs[i * grid.ny + j] - X[idx - 1];
#endif
				}
		}
	}
	// --------------------------------------------------------------------- //

#ifndef BLOCK_GHOST_ZONE_EXTRAPOLATION
	// NOTE(review): fills the remaining ghost layers (exchange width 1,1,1)
	// along this side — presumably by extrapolation; see ghost_extrapolation
	ghost_extrapolation< T >(X, side, node,
		grid.nx, grid.ny, grid.nz, grid.gcx, grid.gcy, grid.gcz, 1, 1, 1,
		grid.mpi_com);
#endif
}
+// ------------------------------------------------------------------------ //
+
+// ------------------------------------------------------------------------ //
+template< typename T >
+void nse::dirichlet_bc(T* _RESTRICT X, const T x_max,
+	const T p1_min, const T p1_max,
+	const T p2_min, const T p2_max,
+	const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
+	const wstGrid3d< T >& grid)
+{
+	const T c1 = (T) 8.0 / (T) 3.0,
+		c2 = -(T) 2.0,
+		c3 = (T) 1.0 / (T) 3.0;
+
+	const T p_value = (T) 16.0 * x_max /
+		((p1_max - p1_min) * (p1_max - p1_min) *
+		(p2_max - p2_min) * (p2_max - p2_min));
+	T Rhs;
+
+	const int is_west = (side == westSide) && (grid.mpi_com.rank_x == 0);
+	const int is_east = (side == eastSide) && (grid.mpi_com.rank_x == grid.mpi_com.size_x - 1);
+	const int is_south = (side == southSide) && (grid.mpi_com.rank_y == 0);
+	const int is_north = (side == northSide) && (grid.mpi_com.rank_y == grid.mpi_com.size_y - 1);
+	const int is_bottom = (side == bottomSide) && (grid.mpi_com.rank_z == 0);
+	const int is_top = (side == topSide) && (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1);
+
+	// * west side * //
+	if (is_west) {
+
+		int j, k, idx;
+		T *cy = (node == nodeV) ? grid.ey : grid.py;
+		T *cz = (node == nodeW) ? grid.ez : grid.pz;
+		if (node == nodeU)	// staggered node //
+		{
+#pragma omp parallel for private( j, k, idx, Rhs ) shared( X )
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = grid.gcx * grid.nyz + j * grid.nz + grid.gcz;
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					Rhs = (
+						(cy[j] < p1_min) || (cy[j] > p1_max) ||
+						(cz[k] < p2_min) || (cz[k] > p2_max)) ? (T) 0.0 :
+						p_value *
+						(cy[j] - p1_min) * (p1_max - cy[j]) *
+						(cz[k] - p2_min) * (p2_max - cz[k]);
+
+					X[idx] = Rhs;
+				}
+			}
+		}
+		else
+		{
+#pragma omp parallel for private( j, k, idx, Rhs ) shared( X )
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = (grid.gcx - 1) * grid.nyz + j * grid.nz + grid.gcz;
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					Rhs = (
+						(cy[j] < p1_min) || (cy[j] > p1_max) ||
+						(cz[k] < p2_min) || (cz[k] > p2_max)) ? (T) 0.0 :
+						p_value *
+						(cy[j] - p1_min) * (p1_max - cy[j]) *
+						(cz[k] - p2_min) * (p2_max - cz[k]);
+
+#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
+					X[idx] = c1 * Rhs +
+						c2 * X[idx + grid.nyz] + c3 * X[idx + (grid.nyz << 1)];
+#else
+					X[idx] = (T)2.0 * Rhs - X[idx + grid.nyz];
+#endif
+				}
+			}
+		}
+	}
+	// --------------------------------------------------------------------- //
+
+	// * east side * //
+	if (is_east) {
+
+		int j, k, idx;
+		T *cy = (node == nodeV) ? grid.ey : grid.py;
+		T *cz = (node == nodeW) ? grid.ez : grid.pz;
+		if (node == nodeU)	// staggered node //
+		{
+#pragma omp parallel for private( j, k, idx, Rhs ) shared( X )
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = (grid.nx - grid.gcx) * grid.nyz + j * grid.nz + grid.gcz;
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					Rhs = (
+						(cy[j] < p1_min) || (cy[j] > p1_max) ||
+						(cz[k] < p2_min) || (cz[k] > p2_max)) ? (T) 0.0 :
+						p_value *
+						(cy[j] - p1_min) * (p1_max - cy[j]) *
+						(cz[k] - p2_min) * (p2_max - cz[k]);
+
+					X[idx] = Rhs;
+				}
+			}
+		}
+		else
+		{
+#pragma omp parallel for private( j, k, idx, Rhs ) shared( X )
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = (grid.nx - grid.gcx) * grid.nyz + j * grid.nz + grid.gcz;
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					Rhs = (
+						(cy[j] < p1_min) || (cy[j] > p1_max) ||
+						(cz[k] < p2_min) || (cz[k] > p2_max)) ? (T) 0.0 :
+						p_value *
+						(cy[j] - p1_min) * (p1_max - cy[j]) *
+						(cz[k] - p2_min) * (p2_max - cz[k]);
+
+#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
+					X[idx] = c1 * Rhs +
+						c2 * X[idx - grid.nyz] + c3 * X[idx - (grid.nyz << 1)];
+#else
+					X[idx] = (T)2.0 * Rhs - X[idx - grid.nyz];
+#endif
+				}
+			}
+		}
+	}
+	// --------------------------------------------------------------------- //
+
+	// * south side * //
+	if (is_south) {
+
+		int i, k, idx;
+		T *cx = (node == nodeU) ? grid.ex : grid.px;
+		T *cz = (node == nodeW) ? grid.ez : grid.pz;
+		if (node == nodeV)	// staggered node //
+		{
+#pragma omp parallel for private( i, k, idx, Rhs ) shared( X )
+			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+			{
+				idx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					Rhs = (
+						(cx[i] < p1_min) || (cx[i] > p1_max) ||
+						(cz[k] < p2_min) || (cz[k] > p2_max)) ? (T) 0.0 :
+						p_value *
+						(cx[i] - p1_min) * (p1_max - cx[i]) *
+						(cz[k] - p2_min) * (p2_max - cz[k]);
+
+					X[idx] = Rhs;
+				}
+			}
+		}
+		else
+		{
+#pragma omp parallel for private( i, k, idx, Rhs ) shared( X )
+			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+			{
+				idx = i * grid.nyz + (grid.gcy - 1) * grid.nz + grid.gcz;
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					Rhs = (
+						(cx[i] < p1_min) || (cx[i] > p1_max) ||
+						(cz[k] < p2_min) || (cz[k] > p2_max)) ? (T) 0.0 :
+						p_value *
+						(cx[i] - p1_min) * (p1_max - cx[i]) *
+						(cz[k] - p2_min) * (p2_max - cz[k]);
+
+#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
+					X[idx] = c1 * Rhs +
+						c2 * X[idx + grid.nz] + c3 * X[idx + (grid.nz << 1)];
+#else
+					X[idx] = (T)2.0 * Rhs - X[idx + grid.nz];
+#endif
+				}
+			}
+		}
+	}
+	// --------------------------------------------------------------------- //
+
+	// * north side * //
+	if (is_north) {
+
+		int i, k, idx;
+		T *cx = (node == nodeU) ? grid.ex : grid.px;
+		T *cz = (node == nodeW) ? grid.ez : grid.pz;
+		if (node == nodeV)	// staggered node //
+		{
+#pragma omp parallel for private( i, k, idx, Rhs ) shared( X )
+			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+			{
+				idx = i * grid.nyz + (grid.ny - grid.gcy) * grid.nz + grid.gcz;
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					Rhs = (
+						(cx[i] < p1_min) || (cx[i] > p1_max) ||
+						(cz[k] < p2_min) || (cz[k] > p2_max)) ? (T) 0.0 :
+						p_value *
+						(cx[i] - p1_min) * (p1_max - cx[i]) *
+						(cz[k] - p2_min) * (p2_max - cz[k]);
+
+					X[idx] = Rhs;
+				}
+			}
+		}
+		else
+		{
+#pragma omp parallel for private( i, k, idx, Rhs ) shared( X )
+			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+			{
+				idx = i * grid.nyz + (grid.ny - grid.gcy) * grid.nz + grid.gcz;
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					Rhs = (
+						(cx[i] < p1_min) || (cx[i] > p1_max) ||
+						(cz[k] < p2_min) || (cz[k] > p2_max)) ? (T) 0.0 :
+						p_value *
+						(cx[i] - p1_min) * (p1_max - cx[i]) *
+						(cz[k] - p2_min) * (p2_max - cz[k]);
+
+#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
+					X[idx] = c1 * Rhs +
+						c2 * X[idx - grid.nz] + c3 * X[idx - (grid.nz << 1)];
+#else
+					X[idx] = (T)2.0 * Rhs - X[idx - grid.nz];
+#endif
+				}
+			}
+		}
+	}
+	// --------------------------------------------------------------------- //
+
+	// * bottom side * //
+	if (is_bottom) {
+
+		int i, j, idx;
+		T *cx = (node == nodeU) ? grid.ex : grid.px;
+		T *cy = (node == nodeV) ? grid.ey : grid.py;
+		if (node == nodeW)	// staggered node //
+		{
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, idx, Rhs ) shared( X ) collapse( 2 )
+#else
+#pragma omp parallel for private( i, j, idx, Rhs ) shared( X )
+#endif
+			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
+
+					idx = i * grid.nyz + j * grid.nz + grid.gcz;
+
+					Rhs = (
+						(cx[i] < p1_min) || (cx[i] > p1_max) ||
+						(cy[j] < p2_min) || (cy[j] > p2_max)) ? (T) 0.0 :
+						p_value *
+						(cx[i] - p1_min) * (p1_max - cx[i]) *
+						(cy[j] - p2_min) * (p2_max - cy[j]);
+
+					X[idx] = Rhs;
+				}
+		}
+		else
+		{
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, idx, Rhs ) shared( X ) collapse( 2 )
+#else
+#pragma omp parallel for private( i, j, idx, Rhs ) shared( X )
+#endif
+			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
+
+					idx = i * grid.nyz + j * grid.nz + (grid.gcz - 1);
+
+					Rhs = (
+						(cx[i] < p1_min) || (cx[i] > p1_max) ||
+						(cy[j] < p2_min) || (cy[j] > p2_max)) ? (T) 0.0 :
+						p_value *
+						(cx[i] - p1_min) * (p1_max - cx[i]) *
+						(cy[j] - p2_min) * (p2_max - cy[j]);
+
+#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
+					X[idx] = c1 * Rhs +
+						c2 * X[idx + 1] + c3 * X[idx + 2];
+#else
+					X[idx] = (T)2.0 * Rhs - X[idx + 1];
+#endif
+				}
+		}
+	}
+	// --------------------------------------------------------------------- //
+
+	// * top side * //
+	if (is_top) {
+
+		int i, j, idx;
+		T *cx = (node == nodeU) ? grid.ex : grid.px;
+		T *cy = (node == nodeV) ? grid.ey : grid.py;
+		if (node == nodeW)	// staggered node //
+		{
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, idx, Rhs ) shared( X ) collapse( 2 )
+#else
+#pragma omp parallel for private( i, j, idx, Rhs ) shared( X )
+#endif
+			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
+
+					idx = i * grid.nyz + j * grid.nz + (grid.nz - grid.gcz);
+
+					Rhs = (
+						(cx[i] < p1_min) || (cx[i] > p1_max) ||
+						(cy[j] < p2_min) || (cy[j] > p2_max)) ? (T) 0.0 :
+						p_value *
+						(cx[i] - p1_min) * (p1_max - cx[i]) *
+						(cy[j] - p2_min) * (p2_max - cy[j]);
+
+					X[idx] = Rhs;
+				}
+		}
+		else
+		{
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, idx, Rhs ) shared( X ) collapse( 2 )
+#else
+#pragma omp parallel for private( i, j, idx, Rhs ) shared( X )
+#endif
+			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
+
+					idx = i * grid.nyz + j * grid.nz + (grid.nz - grid.gcz);
+
+					Rhs = (
+						(cx[i] < p1_min) || (cx[i] > p1_max) ||
+						(cy[j] < p2_min) || (cy[j] > p2_max)) ? (T) 0.0 :
+						p_value *
+						(cx[i] - p1_min) * (p1_max - cx[i]) *
+						(cy[j] - p2_min) * (p2_max - cy[j]);
+
+#ifdef USE_QUAD_EXTRAPOLATION_IN_BC
+					X[idx] = c1 * Rhs +
+						c2 * X[idx - 1] + c3 * X[idx - 2];
+#else
+					X[idx] = (T)2.0 * Rhs - X[idx - 1];
+#endif
+				}
+		}
+	}
+	// --------------------------------------------------------------------- //
+
+#ifndef BLOCK_GHOST_ZONE_EXTRAPOLATION
+	ghost_extrapolation< T >(X, side, node,
+		grid.nx, grid.ny, grid.nz, grid.gcx, grid.gcy, grid.gcz, 1, 1, 1,
+		grid.mpi_com);
+#endif
+}
+// ------------------------------------------------------------------------ //
+
+
+// Neumann Type Boundary Conditions
+// ------------------------------------------------------------------------ //
// neumann_bc: impose a Neumann condition, dX/dn = Rhs (constant over the
// side), on one external side of the MPI-decomposed domain; X is updated
// in place and only the ranks that own the requested side do any work.
//
// Two discretizations, selected by the node placement of X relative to
// the side normal:
//  - staggered node (U on x-sides, V on y-sides, W on z-sides): the value
//    lies exactly on the boundary plane and is set from the second-order
//    one-sided difference (-3*X0 + 4*X1 - X2)/(2*h) = Rhs solved for X0,
//    i.e. X0 = (4/3)*X1 - (1/3)*X2 -/+ (2/3)*h*Rhs (sign flipped between
//    "min" and "max" sides);
//  - any other node: the first ghost cell is set from the two-point
//    formula (X_interior - X_ghost)/h = Rhs.
//
// Unless BLOCK_GHOST_ZONE_EXTRAPOLATION is defined, the remaining ghost
// layers are afterwards filled by ghost_extrapolation().
template< typename T >
void nse::neumann_bc(T* _RESTRICT X, const T Rhs,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
	const wstGrid3d< T >& grid)
{
	// second-order one-sided extrapolation coefficients (staggered branch)
	const T c1 = (T) 2.0 / (T) 3.0,
		c2 = (T) 4.0 / (T) 3.0,
		c3 = -(T) 1.0 / (T) 3.0;

	// a side is external only on ranks at the matching face of the
	// MPI process grid
	const int is_west = (side == westSide) && (grid.mpi_com.rank_x == 0);
	const int is_east = (side == eastSide) && (grid.mpi_com.rank_x == grid.mpi_com.size_x - 1);
	const int is_south = (side == southSide) && (grid.mpi_com.rank_y == 0);
	const int is_north = (side == northSide) && (grid.mpi_com.rank_y == grid.mpi_com.size_y - 1);
	const int is_bottom = (side == bottomSide) && (grid.mpi_com.rank_z == 0);
	const int is_top = (side == topSide) && (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1);

	// * west side * //
	if (is_west) {

		int j, k, idx;
		if (node == nodeU)	// staggered node //
		{
#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				// boundary plane i = gcx: X0 = (4*X1 - X2 - 2*dx*Rhs)/3
				idx = grid.gcx * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = -c1 * Rhs * grid.dx +
						c2 * X[idx + grid.nyz] + c3 * X[idx + (grid.nyz << 1)];
				}
			}
		}
		else
		{
#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				// ghost cell i = gcx - 1: (X_interior - X_ghost)/dx = Rhs
				idx = (grid.gcx - 1) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = X[idx + grid.nyz] - grid.dx * Rhs;
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * east side * //
	if (is_east) {

		int j, k, idx;
		if (node == nodeU)	// staggered node //
		{
#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.nx - grid.gcx) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = c1 * Rhs * grid.dx +
						c2 * X[idx - grid.nyz] + c3 * X[idx - (grid.nyz << 1)];
				}
			}
		}
		else
		{
#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.nx - grid.gcx) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = X[idx - grid.nyz] + grid.dx * Rhs;
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * south side * //
	if (is_south) {

		int i, k, idx;
		if (node == nodeV)	// staggered node //
		{
#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = -c1 * Rhs * grid.dy +
						c2 * X[idx + grid.nz] + c3 * X[idx + (grid.nz << 1)];
				}
			}
		}
		else
		{
#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.gcy - 1) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = X[idx + grid.nz] - grid.dy * Rhs;
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * north side * //
	if (is_north) {

		int i, k, idx;
		if (node == nodeV)	// staggered node //
		{
#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.ny - grid.gcy) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = c1 * Rhs * grid.dy +
						c2 * X[idx - grid.nz] + c3 * X[idx - (grid.nz << 1)];
				}
			}
		}
		else
		{
#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.ny - grid.gcy) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = X[idx - grid.nz] + grid.dy * Rhs;
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * bottom side * //
	// z spacing is nonuniform: the staggered branch uses dz at the first
	// interior cell, the ghost-cell branch dz at the ghost cell itself
	if (is_bottom) {

		int i, j, idx;
		if (node == nodeW)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + grid.gcz;

					X[idx] = -c1 * Rhs * grid.dz[grid.gcz] +
						c2 * X[idx + 1] + c3 * X[idx + 2];
				}
		}
		else
		{

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.gcz - 1);

					X[idx] = X[idx + 1] - grid.dz[grid.gcz - 1] * Rhs;
				}
		}
	}
	// --------------------------------------------------------------------- //

	// * top side * //
	if (is_top) {

		int i, j, idx;
		if (node == nodeW)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.nz - grid.gcz);

					X[idx] = c1 * Rhs * grid.dz[grid.nz - grid.gcz] +
						c2 * X[idx - 1] + c3 * X[idx - 2];
				}
		}
		else
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.nz - grid.gcz);

					X[idx] = X[idx - 1] + grid.dz[grid.nz - grid.gcz] * Rhs;
				}
		}
	}
	// --------------------------------------------------------------------- //

	// fill the remaining ghost layers of this side by extrapolation
#ifndef BLOCK_GHOST_ZONE_EXTRAPOLATION
	ghost_extrapolation< T >(X, side, node,
		grid.nx, grid.ny, grid.nz, grid.gcx, grid.gcy, grid.gcz, 1, 1, 1,
		grid.mpi_com);
#endif
}
+// ------------------------------------------------------------------------ //
+
+// Convective Type Boundary Conditions
+// ------------------------------------------------------------------------ //
// convective_bc: impose a convective (radiation / open-boundary) condition
// of the form dX/dt + c_velocity * dX/ds = 0 (s = side-normal coordinate)
// on one external side, discretized implicitly in the new boundary value.
//
//   X          - field at the new time level (interior already advanced),
//                boundary/ghost cells updated in place
//   X_p        - field at the previous time level
//   c_velocity - convection (phase) speed through the boundary
//   dt         - time step; courant = c_velocity * dt / h is the local
//                1-D Courant number in the side-normal direction
//
//  - staggered node: the boundary-plane value is advanced with a
//    second-order one-sided space derivative (denominator 2 -/+ 3*courant);
//  - other nodes: the ghost value is advanced through the ghost/interior
//    face average (denominator 2*courant - 1 resp. 1 + 2*courant).
//
// NOTE(review): the "min"-side denominators vanish at courant = 2/3
// (staggered) and courant = 1/2 (non-staggered) -- presumably the caller
// keeps the boundary Courant number well below these values; confirm.
//
// Unless BLOCK_GHOST_ZONE_EXTRAPOLATION is defined, the remaining ghost
// layers are afterwards filled by ghost_extrapolation().
template< typename T >
void nse::convective_bc(T* _RESTRICT X, const T* _RESTRICT const X_p, const T c_velocity,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
	const wstGrid3d< T >& grid, const T dt)
{
	// a side is external only on ranks at the matching face of the
	// MPI process grid
	const int is_west = (side == westSide) && (grid.mpi_com.rank_x == 0);
	const int is_east = (side == eastSide) && (grid.mpi_com.rank_x == grid.mpi_com.size_x - 1);
	const int is_south = (side == southSide) && (grid.mpi_com.rank_y == 0);
	const int is_north = (side == northSide) && (grid.mpi_com.rank_y == grid.mpi_com.size_y - 1);
	const int is_bottom = (side == bottomSide) && (grid.mpi_com.rank_z == 0);
	const int is_top = (side == topSide) && (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1);

	// * west side * //
	if (is_west) {

		const T courant = c_velocity * dt / grid.dx;
		int j, k, idx;
		if (node == nodeU)	// staggered node //
		{
			// (X0 - X0_p)/dt + c * (-3*X0 + 4*X1 - X2)/(2*dx) = 0, solved for X0
			const T del = (T) 1.0 / (
				(T) 2.0 - (T) 3.0 * courant);

#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = grid.gcx * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = ((T) 2.0 * X_p[idx] -
						(T) 4.0 * courant * X[idx + grid.nyz] +
						courant * X[idx + (grid.nyz << 1)]) * del;
				}
			}
		}
		else
		{
			// advances the ghost/interior face average, solved for the ghost value
			const T del = (T) 1.0 / (
				(T) 2.0 * courant - (T) 1.0);

#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.gcx - 1) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = (((T) 1.0 + (T) 2.0 * courant) * X[idx + grid.nyz] -
						X_p[idx] - X_p[idx + grid.nyz]) * del;
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * east side * //
	if (is_east) {

		const T courant = c_velocity * dt / grid.dx;
		int j, k, idx;
		if (node == nodeU)	// staggered node //
		{
			const T del = (T) 1.0 / (
				(T) 2.0 + (T) 3.0 * courant);

#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.nx - grid.gcx) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = ((T) 2.0 * X_p[idx] +
						(T) 4.0 * courant * X[idx - grid.nyz] -
						courant * X[idx - (grid.nyz << 1)]) * del;
				}
			}
		}
		else
		{
			const T del = (T) 1.0 / (
				(T) 1.0 + (T) 2.0 * courant);

#pragma omp parallel for private( j, k, idx ) shared( X )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.nx - grid.gcx) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = (((T) 2.0 * courant - (T) 1.0) * X[idx - grid.nyz] +
						X_p[idx] + X_p[idx - grid.nyz]) * del;
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * south side * //
	if (is_south) {

		const T courant = c_velocity * dt / grid.dy;
		int i, k, idx;
		if (node == nodeV)	// staggered node //
		{
			const T del = (T) 1.0 / (
				(T) 2.0 - (T) 3.0 * courant);

#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = ((T) 2.0 * X_p[idx] -
						(T) 4.0 * courant * X[idx + grid.nz] +
						courant * X[idx + (grid.nz << 1)]) * del;
				}
			}
		}
		else
		{
			const T del = (T) 1.0 / (
				(T) 2.0 * courant - (T) 1.0);

#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.gcy - 1) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = (((T) 1.0 + (T) 2.0 * courant) * X[idx + grid.nz] -
						X_p[idx] - X_p[idx + grid.nz]) * del;
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * north side * //
	if (is_north) {

		const T courant = c_velocity * dt / grid.dy;
		int i, k, idx;
		if (node == nodeV)	// staggered node //
		{
			const T del = (T) 1.0 / (
				(T) 2.0 + (T) 3.0 * courant);

#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.ny - grid.gcy) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = ((T) 2.0 * X_p[idx] +
						(T) 4.0 * courant * X[idx - grid.nz] -
						courant * X[idx - (grid.nz << 1)]) * del;
				}
			}
		}
		else
		{
			const T del = (T) 1.0 / (
				(T) 1.0 + (T) 2.0 * courant);

#pragma omp parallel for private( i, k, idx ) shared( X )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.ny - grid.gcy) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					X[idx] = (((T) 2.0 * courant - (T) 1.0) * X[idx - grid.nz] +
						X_p[idx] + X_p[idx - grid.nz]) * del;
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * bottom side * //
	// z spacing is nonuniform: courant uses dz at the ghost (bottom)
	// resp. first external (top) cell
	if (is_bottom) {

		const T courant = c_velocity * dt / grid.dz[grid.gcz - 1];
		int i, j, idx;
		if (node == nodeW)	// staggered node //
		{
			const T del = (T) 1.0 / (
				(T) 2.0 - (T) 3.0 * courant);

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + grid.gcz;

					X[idx] = ((T) 2.0 * X_p[idx] -
						(T) 4.0 * courant * X[idx + 1] +
						courant * X[idx + 2]) * del;
				}
		}
		else
		{
			const T del = (T) 1.0 / (
				(T) 2.0 * courant - (T) 1.0);

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.gcz - 1);

					X[idx] = (((T) 1.0 + (T) 2.0 * courant) * X[idx + 1] -
						X_p[idx] - X_p[idx + 1]) * del;
				}
		}
	}
	// --------------------------------------------------------------------- //

	// * top side * //
	if (is_top) {

		const T courant = c_velocity * dt / grid.dz[grid.nz - grid.gcz];
		int i, j, idx;
		if (node == nodeW)	// staggered node //
		{
			const T del = (T) 1.0 / (
				(T) 2.0 + (T) 3.0 * courant);

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.nz - grid.gcz);

					X[idx] = ((T) 2.0 * X_p[idx] +
						(T) 4.0 * courant * X[idx - 1] -
						courant * X[idx - 2]) * del;
				}
		}
		else
		{
			const T del = (T) 1.0 / (
				(T) 1.0 + (T) 2.0 * courant);

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) shared( X )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.nz - grid.gcz);

					X[idx] = (((T) 2.0 * courant - (T) 1.0) * X[idx - 1] +
						X_p[idx] + X_p[idx - 1]) * del;
				}
		}
	}
	// --------------------------------------------------------------------- //

	// fill the remaining ghost layers of this side by extrapolation
#ifndef BLOCK_GHOST_ZONE_EXTRAPOLATION
	ghost_extrapolation< T >(X, side, node,
		grid.nx, grid.ny, grid.nz, grid.gcx, grid.gcy, grid.gcz, 1, 1, 1,
		grid.mpi_com);
#endif
}
+// ------------------------------------------------------------------------ //
+
+// External boundary average
+// ------------------------------------------------------------------------ //
// external_average: area-weighted mean of X extrapolated beyond the given
// external side; the MPI_SUM reduction makes the same value available on
// every rank of the communicator.
//
//  - staggered node (U/V/W matching the side normal): quadratic
//    extrapolation 3*(X1 - X2) + X3 to the external (ghost) plane --
//    the same stencil ghost_extrapolation() uses;
//  - other nodes: quadratic half-cell extrapolation to the boundary face
//    through the three nearest cell centers, coefficients
//    c1 = 15/8, c2 = -5/4, c3 = 3/8.
//
// Each sample is weighted by its face area (dy*dz, dx*dz or dx*dy); after
// the global reduction the sum is normalized by the total side area
// (grid.mpi_length/mpi_width/mpi_height are presumably the global
// physical domain extents -- confirm against the grid definition).
template< typename T >
T nse::external_average(const T* _RESTRICT const X,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
	const wstGrid3d< T >& grid)
{
	// half-cell quadratic extrapolation coefficients (non-staggered branch)
	const T c1 = (T) 15.0 / (T) 8.0,
		c2 = -(T) 5.0 / (T) 4.0,
		c3 = (T) 3.0 / (T) 8.0;

	T avg = (T) 0.0;

	// a side is external only on ranks at the matching face of the
	// MPI process grid; all other ranks contribute zero to the reduction
	const int is_west = (side == westSide) && (grid.mpi_com.rank_x == 0);
	const int is_east = (side == eastSide) && (grid.mpi_com.rank_x == grid.mpi_com.size_x - 1);
	const int is_south = (side == southSide) && (grid.mpi_com.rank_y == 0);
	const int is_north = (side == northSide) && (grid.mpi_com.rank_y == grid.mpi_com.size_y - 1);
	const int is_bottom = (side == bottomSide) && (grid.mpi_com.rank_z == 0);
	const int is_top = (side == topSide) && (grid.mpi_com.rank_z == grid.mpi_com.size_z - 1);

	// * west side * //
	if (is_west) {

		int j, k, idx;
		if (node == nodeU)	// staggered node //
		{
#pragma omp parallel for private( j, k, idx ) reduction( + : avg )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.gcx - 1) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
				{
					avg += ((T) 3.0 * (X[idx + grid.nyz] - X[idx + (grid.nyz << 1)])
						+ X[idx + (grid.nyz << 1) + grid.nyz]) * grid.dy * grid.dz[k];
				}
			}
		}
		else
		{
#pragma omp parallel for private( j, k, idx ) reduction( + : avg )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.gcx - 1) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
				{
					avg += (c1 * X[idx + grid.nyz] +
						c2 * X[idx + (grid.nyz << 1)] +
						c3 * X[idx + (grid.nyz << 1) + grid.nyz]) * grid.dy * grid.dz[k];
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * east side * //
	if (is_east) {

		int j, k, idx;
		if (node == nodeU)	// staggered node //
		{
#pragma omp parallel for private( j, k, idx ) reduction( + : avg )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.nx - grid.gcx) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
				{
					avg += ((T) 3.0 * (X[idx - grid.nyz] - X[idx - (grid.nyz << 1)])
						+ X[idx - (grid.nyz << 1) - grid.nyz]) * grid.dy * grid.dz[k];
				}
			}
		}
		else
		{
#pragma omp parallel for private( j, k, idx ) reduction( + : avg )
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = (grid.nx - grid.gcx) * grid.nyz + j * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
				{
					avg += (c1 * X[idx - grid.nyz] +
						c2 * X[idx - (grid.nyz << 1)] +
						c3 * X[idx - (grid.nyz << 1) - grid.nyz]) * grid.dy * grid.dz[k];
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * south side * //
	if (is_south) {

		int i, k, idx;
		if (node == nodeV)	// staggered node //
		{
#pragma omp parallel for private( i, k, idx ) reduction( + : avg )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.gcy - 1) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
				{
					avg += ((T) 3.0 * (X[idx + grid.nz] - X[idx + (grid.nz << 1)])
						+ X[idx + (grid.nz << 1) + grid.nz]) * grid.dx * grid.dz[k];
				}
			}
		}
		else
		{
#pragma omp parallel for private( i, k, idx ) reduction( + : avg )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.gcy - 1) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
				{
					avg += (c1 * X[idx + grid.nz] +
						c2 * X[idx + (grid.nz << 1)] +
						c3 * X[idx + (grid.nz << 1) + grid.nz]) * grid.dx * grid.dz[k];
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * north side * //
	if (is_north) {

		int i, k, idx;
		if (node == nodeV)	// staggered node //
		{
#pragma omp parallel for private( i, k, idx ) reduction( + : avg )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.ny - grid.gcy) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
				{
					avg += ((T) 3.0 * (X[idx - grid.nz] - X[idx - (grid.nz << 1)])
						+ X[idx - (grid.nz << 1) - grid.nz]) * grid.dx * grid.dz[k];
				}
			}
		}
		else
		{
#pragma omp parallel for private( i, k, idx ) reduction( + : avg )
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
			{
				idx = i * grid.nyz + (grid.ny - grid.gcy) * grid.nz + grid.gcz;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
				{
					avg += (c1 * X[idx - grid.nz] +
						c2 * X[idx - (grid.nz << 1)] +
						c3 * X[idx - (grid.nz << 1) - grid.nz]) * grid.dx * grid.dz[k];
				}
			}
		}
	}
	// --------------------------------------------------------------------- //

	// * bottom side * //
	if (is_bottom) {

		int i, j, idx;
		if (node == nodeW)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) reduction( + : avg ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) reduction( + : avg )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.gcz - 1);

					avg += ((T) 3.0 * (X[idx + 1] - X[idx + 2])
						+ X[idx + 3]) * grid.dx * grid.dy;
				}
		}
		else
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) reduction( + : avg ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) reduction( + : avg )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.gcz - 1);

					avg += (c1 * X[idx + 1] +
						c2 * X[idx + 2] +
						c3 * X[idx + 3]) * grid.dx * grid.dy;
				}
		}
	}
	// --------------------------------------------------------------------- //

	// * top side * //
	if (is_top) {

		int i, j, idx;
		if (node == nodeW)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) reduction( + : avg ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) reduction( + : avg )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.nz - grid.gcz);

					avg += ((T) 3.0 * (X[idx - 1] - X[idx - 2])
						+ X[idx - 3]) * grid.dx * grid.dy;
				}
		}
		else
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, idx ) reduction( + : avg ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, idx ) reduction( + : avg )
#endif
			for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
				for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
					idx = i * grid.nyz + j * grid.nz + (grid.nz - grid.gcz);

					avg += (c1 * X[idx - 1] +
						c2 * X[idx - 2] +
						c3 * X[idx - 3]) * grid.dx * grid.dy;
				}
		}
	}
	// --------------------------------------------------------------------- //


	// reduction for average boundary conditions
	// global sum over all ranks, then normalization by the side area
	avg = mpi_allreduce(avg, MPI_SUM, grid.mpi_com.comm);
	if ((side == westSide) || (side == eastSide))
		avg /= (T) (grid.mpi_width * grid.mpi_height);
	if ((side == southSide) || (side == northSide))
		avg /= (T) (grid.mpi_length * grid.mpi_height);
	if ((side == bottomSide) || (side == topSide))
		avg /= (T) (grid.mpi_length * grid.mpi_width);

	return avg;
}
+// ------------------------------------------------------------------------ //
+
+
+#ifndef BLOCK_GHOST_ZONE_EXTRAPOLATION
+// Ghost nodes extrapolation
+// ------------------------------------------------------------------------ //
// ghost_extrapolation: fill the ghost-zone layers of one external side of X
// with quadratic extrapolation from the three nodes adjacent to each layer:
//   X_ghost = 3*(X1 - X2) + X3.
//
//   nx, ny, nz              - local array dimensions (nyz = ny * nz stride)
//   gcx, gcy, gcz           - ghost-zone widths
//   gc*_sh                  - first ghost layer (offset from the boundary)
//                             to fill; layers gc*_sh .. gc* - 1 are written
//
// Offsets differ by one between node placements because the layer nearest
// the boundary is already set by the b.c. routine: for the side-normal
// staggered node the boundary plane itself holds the b.c. value, for other
// nodes the first ghost cell does.
//
// NOTE(review): each successive ghost layer reads the layer extrapolated
// just before it. On the x/y sides the layer loop is one of the two loops
// collapsed into the OpenMP "parallel for collapse(2)" region, so when
// more than one layer is filled (ghost width > 2 with gc*_sh = 1) those
// iterations may race -- confirm the ghost width, or serialize the layer
// loop on those sides. On the z sides the layer loop is innermost
// (serial per thread) and is safe.
template< typename T >
void nse::ghost_extrapolation(T* _RESTRICT X,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int gcx_sh, const int gcy_sh, const int gcz_sh,
	const mpiCom3d& mpi_com)
{
	int i, j, k, idx;
	const int nyz = ny * nz;

	// a side is external only on ranks at the matching face of the
	// MPI process grid
	const int is_west = (side == westSide) && (mpi_com.rank_x == 0);
	const int is_east = (side == eastSide) && (mpi_com.rank_x == mpi_com.size_x - 1);
	const int is_south = (side == southSide) && (mpi_com.rank_y == 0);
	const int is_north = (side == northSide) && (mpi_com.rank_y == mpi_com.size_y - 1);
	const int is_bottom = (side == bottomSide) && (mpi_com.rank_z == 0);
	const int is_top = (side == topSide) && (mpi_com.rank_z == mpi_com.size_z - 1);

	// * west side * //
	if (is_west) {
		if (node == nodeU)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, k, idx ) shared( X )
#endif
			// layer i sits at plane (gcx - i): boundary plane gcx is kept
			for (i = gcx_sh; i < gcx; i++)
				for (j = gcy; j < ny - gcy; j++) {
					idx = (gcx - i) * nyz + j * nz + gcz;
					for (k = gcz; k < nz - gcz; k++, idx++) {
						X[idx] = (T) 3.0 * (X[idx + nyz] - X[idx + (nyz << 1)])
							+ X[idx + (nyz << 1) + nyz];
					}
				}
		}
		else
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, k, idx ) shared( X )
#endif
			// layer i sits at plane (gcx - i - 1): ghost plane gcx - 1
			// was already set by the b.c. routine
			for (i = gcx_sh; i < gcx; i++)
				for (j = gcy; j < ny - gcy; j++) {
					idx = (gcx - i - 1) * nyz + j * nz + gcz;
					for (k = gcz; k < nz - gcz; k++, idx++) {
						X[idx] =
							(T) 3.0 * (X[idx + nyz] - X[idx + (nyz << 1)])
							+ X[idx + (nyz << 1) + nyz];
					}
				}
		}
	}
	// --------------------------------------------------------------------- //

	// * east side * //
	if (is_east) {
		if (node == nodeU)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, k, idx ) shared( X )
#endif
			for (i = gcx_sh; i < gcx; i++)
				for (j = gcy; j < ny - gcy; j++) {
					idx = (nx - gcx + i) * nyz + j * nz + gcz;
					for (k = gcz; k < nz - gcz; k++, idx++) {
						X[idx] = (T) 3.0 * (X[idx - nyz] - X[idx - (nyz << 1)])
							+ X[idx - (nyz << 1) - nyz];
					}
				}
		}
		else
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, k, idx ) shared( X )
#endif
			for (i = gcx_sh; i < gcx; i++)
				for (j = gcy; j < ny - gcy; j++) {
					idx = (nx - gcx + i) * nyz + j * nz + gcz;
					for (k = gcz; k < nz - gcz; k++, idx++) {
						X[idx] =
							(T) 3.0 * (X[idx - nyz] - X[idx - (nyz << 1)])
							+ X[idx - (nyz << 1) - nyz];
					}
				}
		}
	}
	// --------------------------------------------------------------------- //

	// * south side * //
	if (is_south) {
		if (node == nodeV)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, k, idx ) shared( X )
#endif
			for (i = gcx; i < nx - gcx; i++)
				for (j = gcy_sh; j < gcy; j++) {
					idx = i * nyz + (gcy - j) * nz + gcz;
					for (k = gcz; k < nz - gcz; k++, idx++) {
						X[idx] =
							(T) 3.0 * (X[idx + nz] - X[idx + (nz << 1)])
							+ X[idx + (nz << 1) + nz];
					}
				}
		}
		else
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, k, idx ) shared( X )
#endif
			for (i = gcx; i < nx - gcx; i++)
				for (j = gcy_sh; j < gcy; j++) {
					idx = i * nyz + (gcy - j - 1) * nz + gcz;
					for (k = gcz; k < nz - gcz; k++, idx++) {
						X[idx] =
							(T) 3.0 * (X[idx + nz] - X[idx + (nz << 1)])
							+ X[idx + (nz << 1) + nz];
					}
				}
		}
	}
	// --------------------------------------------------------------------- //

	// * north side * //
	if (is_north)
	{
		if (node == nodeV)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, k, idx ) shared( X )
#endif
			for (i = gcx; i < nx - gcx; i++)
				for (j = gcy_sh; j < gcy; j++) {
					idx = i * nyz + (ny - gcy + j) * nz + gcz;
					for (k = gcz; k < nz - gcz; k++, idx++) {
						X[idx] =
							(T) 3.0 * (X[idx - nz] - X[idx - (nz << 1)])
							+ X[idx - (nz << 1) - nz];
					}
				}
		}
		else
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, k, idx ) shared( X )
#endif
			for (i = gcx; i < nx - gcx; i++)
				for (j = gcy_sh; j < gcy; j++) {
					idx = i * nyz + (ny - gcy + j) * nz + gcz;
					for (k = gcz; k < nz - gcz; k++, idx++) {
						X[idx] =
							(T) 3.0 * (X[idx - nz] - X[idx - (nz << 1)])
							+ X[idx - (nz << 1) - nz];
					}
				}
		}
	}
	// --------------------------------------------------------------------- //

	// * bottom side * //
	if (is_bottom) {
		if (node == nodeW)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, k, idx ) shared( X )
#endif
			// layer loop (k) is innermost here: serial within each thread
			for (i = gcx; i < nx - gcx; i++)
				for (j = gcy; j < ny - gcy; j++)
					for (k = gcz_sh; k < gcz; k++) {
						idx = i * nyz + j * nz + (gcz - k);

						X[idx] =
							(T) 3.0 * (X[idx + 1] - X[idx + 2])
							+ X[idx + 3];
					}
		}
		else
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, k, idx ) shared( X )
#endif
			for (i = gcx; i < nx - gcx; i++)
				for (j = gcy; j < ny - gcy; j++)
					for (k = gcz_sh; k < gcz; k++) {
						idx = i * nyz + j * nz + (gcz - k - 1);

						X[idx] =
							(T) 3.0 * (X[idx + 1] - X[idx + 2])
							+ X[idx + 3];
					}
		}
	}
	// --------------------------------------------------------------------- //

	// * top side * //
	if (is_top)
	{
		if (node == nodeW)	// staggered node //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, k, idx ) shared( X )
#endif
			for (i = gcx; i < nx - gcx; i++)
				for (j = gcy; j < ny - gcy; j++)
					for (k = gcz_sh; k < gcz; k++) {
						idx = i * nyz + j * nz + (nz - gcz + k);

						X[idx] =
							(T) 3.0 * (X[idx - 1] - X[idx - 2])
							+ X[idx - 3];

					}
		}
		else
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
#else
#pragma omp parallel for private( i, j, k, idx ) shared( X )
#endif
			for (i = gcx; i < nx - gcx; i++)
				for (j = gcy; j < ny - gcy; j++)
					for (k = gcz_sh; k < gcz; k++) {
						idx = i * nyz + j * nz + (nz - gcz + k);

						X[idx] =
							(T) 3.0 * (X[idx - 1] - X[idx - 2])
							+ X[idx - 3];
					}
		}
	}
	// --------------------------------------------------------------------- //
}
+// ------------------------------------------------------------------------ //
+#endif
+
+
+// ------------------------------------------------------------------------ //
+// ------------------------------------------------------------------------ //
+
+
// Explicit template instantiations (float / double) of the boundary-condition
// routines declared in nse-bc3d.h; keep this list in sync with that header.

// * instantiate: dirichlet boundary conditions * //
template void nse::dirichlet_bc(float* _RESTRICT X, const float rhs,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node, const wstGrid3d< float >& grid);
template void nse::dirichlet_bc(double* _RESTRICT X, const double rhs,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node, const wstGrid3d< double >& grid);

// -- dirichlet b.c. with pointwise (array) right-hand side
template void nse::dirichlet_bc(float* _RESTRICT X, const float* _RESTRICT rhs,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node, const wstGrid3d< float >& grid);
template void nse::dirichlet_bc(double* _RESTRICT X, const double* _RESTRICT rhs,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node, const wstGrid3d< double >& grid);

// -- dirichlet b.c. with parabolic profile defined by (p1,p2) extents
template void nse::dirichlet_bc(float* _RESTRICT X, const float x_max,
	const float p1_min, const float p1_max,
	const float p2_min, const float p2_max,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node, const wstGrid3d< float >& grid);
template void nse::dirichlet_bc(double* _RESTRICT X, const double x_max,
	const double p1_min, const double p1_max,
	const double p2_min, const double p2_max,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node, const wstGrid3d< double >& grid);
// ---------------------------------------------------------------------  //

// * instantiate: neumann boundary conditions * //
template void nse::neumann_bc(float* _RESTRICT X, const float rhs,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node, const wstGrid3d< float >& grid);
template void nse::neumann_bc(double* _RESTRICT X, const double rhs,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node, const wstGrid3d< double >& grid);
// ---------------------------------------------------------------------  //

// * instantiate: convective boundary conditions * //
template void nse::convective_bc(float* _RESTRICT X,
	const float* _RESTRICT const X_p, const float c_velocity,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
	const wstGrid3d< float >& grid, const float dt);
template void nse::convective_bc(double* _RESTRICT X,
	const double* _RESTRICT const X_p, const double c_velocity,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
	const wstGrid3d< double >& grid, const double dt);
// ---------------------------------------------------------------------  //

// * instantiate: domain boundary average * //
template float nse::external_average(const float* _RESTRICT const X,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node, const wstGrid3d< float >& grid);
template double nse::external_average(const double* _RESTRICT const X,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node, const wstGrid3d< double >& grid);
// ---------------------------------------------------------------------  //

#ifndef BLOCK_GHOST_ZONE_EXTRAPOLATION
// * instantiate: ghost nodes extrapolation * //
// (only compiled when wide-ghost-zone extrapolation is not blocked in nse-bc3d.h)
template void nse::ghost_extrapolation(float* _RESTRICT X,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int gcx_sh, const int gcy_sh, const int gcz_sh,
	const mpiCom3d& mpi_com);
template void nse::ghost_extrapolation(double* _RESTRICT X,
	const nse_const3d::domainSideType side, const nse_const3d::nodeType node,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int gcx_sh, const int gcy_sh, const int gcz_sh,
	const mpiCom3d& mpi_com);
// ---------------------------------------------------------------------  //
#endif
diff --git a/nse-bc3d.h b/nse-bc3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..b422c8f23f0592ca175c583543a3d6423eb902e9
--- /dev/null
+++ b/nse-bc3d.h
@@ -0,0 +1,79 @@
+#pragma once
+
+// [nse-bc3d.h(cpp)]: 3D Navier-Stokes boundary conditions module
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "wstgrid3d.h"
+
+#define BLOCK_GHOST_ZONE_EXTRAPOLATION		// block extrapolation to wide ghost zone [ng > 1]
+//#define USE_QUAD_EXTRAPOLATION_IN_BC		// use quadratic extrapolation for dirichlet b.c.'s
+
+
namespace nse
{
	// * dirichlet boundary condition for [U,V,W,C] grid nodes * //
	//   constant right-hand-side value applied on the given domain side
	template< typename T >
	void dirichlet_bc(T* _RESTRICT X, const T rhs,
		const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
		const wstGrid3d< T >& grid);
	// ---------------------------------------------------------------------  //

	// * dirichlet boundary condition for [U,V,W,C] grid nodes * //
	//   right-hand side supplied as an array (per-node values; layout defined
	//   in the implementation — see nse-bc3d.cpp)
	template< typename T >
	void dirichlet_bc(T* _RESTRICT X, const T* _RESTRICT rhs,
		const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
		const wstGrid3d< T >& grid);
	// ---------------------------------------------------------------------  //

	// * dirichlet boundary condition for [U,V,W,C] grid nodes * //
	//   parabolic (Poiseuille-like) profile with maximum x_max:
	// *     F( p ) = 16 * F_{max} * 
	//                            ( p1 - p1_{min} ) * ( p1_{max} - p1 ) *
	//                            ( p2 - p2_{min} ) * ( p2_{max} - p2 ) / ( L1^{2} * L2^{2} )
	//                           , where L1 = ( p1_{max} - p1_{min} )
	//                                   L2 = ( p2_{max} - p2_{min} )
	template< typename T >
	void dirichlet_bc(T* _RESTRICT X, const T x_max,
		const T p1_min, const T p1_max, const T p2_min, const T p2_max,
		const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
		const wstGrid3d< T >& grid);
	// ---------------------------------------------------------------------  //


	// * neumann boundary condition for [U,V,W,C] grid nodes * //
	//   rhs is the prescribed boundary-gradient value (constant over the side)
	template< typename T >
	void neumann_bc(T* _RESTRICT X, const T rhs,
		const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
		const wstGrid3d< T >& grid);
	// ---------------------------------------------------------------------  //


	// * convective boundary condition for [U,V,W,C] grid nodes * //
	//   X_p: previous time-level field; c_velocity: advection speed; dt: time step
	template< typename T >
	void convective_bc(T* _RESTRICT X, const T* _RESTRICT const X_p, const T c_velocity,
		const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
		const wstGrid3d< T >& grid, const T dt);
	// ---------------------------------------------------------------------  //


	// * domain boundary average for [U,V,W,C] grid nodes * //
	//   returns the average of X over the external boundary of the given side
	template< typename T >
	T external_average(const T* _RESTRICT const X,
		const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
		const wstGrid3d< T >& grid);
	// ---------------------------------------------------------------------  //


#ifndef BLOCK_GHOST_ZONE_EXTRAPOLATION
	// * ghost cell extrapolation for [U,V,W,C] grid nodes * //
	//   fills the outer ghost layers [gc*_sh .. gc*) by extrapolation;
	//   disabled by default via BLOCK_GHOST_ZONE_EXTRAPOLATION above
	template< typename T >
	void ghost_extrapolation(T* _RESTRICT X,
		const nse_const3d::domainSideType side, const nse_const3d::nodeType node,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,
		const int gcx_sh, const int gcy_sh, const int gcz_sh,
		const mpiCom3d& mpi_com);
	// ---------------------------------------------------------------------  //
#endif
}
diff --git a/nse-dump.h b/nse-dump.h
new file mode 100644
index 0000000000000000000000000000000000000000..14d4648bc4ca1cee1686d17795d82b1aa38cbfa3
--- /dev/null
+++ b/nse-dump.h
@@ -0,0 +1,112 @@
+#pragma once
+
+// [nse-dump.h]: dump data structure for main flow fields
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-defines.h"
+#include "str-com.h"
+
+#include <string>
+
template< typename T >
struct nseDump {

	// Filename bundle + scheduling state for writing solver restart dumps.
	// NOTE(review): the trailing '-' in the names below suggests a dump index
	// is inserted before the extension by the writer — confirm against callers.

	std::string DIR;	// dump directory path (set via set_filenames)

	std::string DATA_FILE;
	std::string NSE_STAMP_FILE;

	std::string NSE_SEQ_FILE;

	std::string VELOCITY_FILE;
	std::string VELOCITY_IMP_FILE;
#if (AB_TYPE==3)
	std::string VELOCITY_IMPP_FILE;	// extra time level for 3rd-order Adams-Bashforth
#endif
	std::string PRESSURE_FILE, PHI_PRESSURE_FILE;

#ifdef STRATIFICATION
	std::string TEMPERATURE_FILE;
	std::string TEMPERATURE_IMP_FILE;
#if (AB_TYPE==3)
	std::string TEMPERATURE_IMPP_FILE;
#endif
#endif

#ifdef INCLUDE_PARTICLES
	std::string PTCL_FILE;
#endif
#ifdef INCLUDE_PARTICLES_TRACKING
	std::string PTCL_TRACK_FILE;
	std::string PTCL_TRAJ_SUBDIR, PTCL_TRAJ_FILE;
#endif

	T begin, dt;	// presumably: model time of first dump and dump interval — TODO confirm

	int index;		// current dump index
	T mark;			// presumably: next scheduled dump time — TODO confirm

	bool restart;			// restart from dump flag, default: [false]
	int restart_index;		// restart dump index

	bool edge_mode;		// edge mode dump, default: [false]
	double max_run_time;	// wall-clock limit used in edge mode (units set by caller)

	nseDump();
	~nseDump();

	// sets DIR and all filenames; creates the directory (and the particle
	// trajectory subdirectory when enabled); returns false on failure
	bool set_filenames(const std::string& _DIR);
};
+// -------------------------------------------------------------------------------------------- //
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+inline nseDump< T >::nseDump() : restart(false), edge_mode(false) {}
+template< typename T >
+inline nseDump< T >::~nseDump() {}
+// -------------------------------------------------------------------------------------------- //
+
// Sets the dump directory and derives all dump filenames from it.
// Returns false if any required directory cannot be created; on success DIR
// and every *_FILE member are assigned. The trailing '-' before each extension
// presumably gets a dump index inserted by the writer — TODO confirm.
template< typename T >
inline bool nseDump< T >::set_filenames(const std::string& _DIR)
{
	// create the dump directory first; bail out before touching any member
	if (!nse::create_dir(_DIR)) return false;

	DIR = _DIR;

	DATA_FILE = DIR + "nse-data-.txt";
	NSE_STAMP_FILE = DIR + "nse-stamp-.dsq";
	NSE_SEQ_FILE = DIR + "nse-.dsq";

	VELOCITY_FILE = DIR + "velocity-dump-.nsx";
	VELOCITY_IMP_FILE = DIR + "velocity-imp-dump-.nsx";
#if (AB_TYPE==3)
	// extra time level needed by 3rd-order Adams-Bashforth
	VELOCITY_IMPP_FILE = DIR + "velocity-impp-dump-.nsx";
#endif
	PRESSURE_FILE = DIR + "pressure-dump-.nsx";
	PHI_PRESSURE_FILE = DIR + "phi-pressure-dump-.nsx";

#ifdef STRATIFICATION
	TEMPERATURE_FILE = DIR + "temperature-dump-.nsx";
	TEMPERATURE_IMP_FILE = DIR + "temperature-imp-dump-.nsx";
#if (AB_TYPE==3)
	TEMPERATURE_IMPP_FILE = DIR + "temperature-impp-dump-.nsx";
#endif
#endif

#ifdef INCLUDE_PARTICLES
	PTCL_FILE = DIR + "particles-dump-.nsx";
#endif
#ifdef INCLUDE_PARTICLES_TRACKING
	PTCL_TRACK_FILE = DIR + "particles-track-dump-.nsx";

	// particle trajectories live in a dedicated subdirectory
	PTCL_TRAJ_SUBDIR = "traj/";
	if (!nse::create_dir(DIR + PTCL_TRAJ_SUBDIR)) return false;

	PTCL_TRAJ_FILE = DIR + PTCL_TRAJ_SUBDIR + "traj-dump-.psx";
#endif

	return true;
}
+// -------------------------------------------------------------------------------------------- //
diff --git a/nse-fops3d-x2.cpp b/nse-fops3d-x2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..60eaff487217809631602a5ad313276b7e33c132
--- /dev/null
+++ b/nse-fops3d-x2.cpp
@@ -0,0 +1,2505 @@
+#include "nse-fops3d-x2.h"
+
+using namespace nse::nse_const3d;
+
+
+// * node-add(sub) routines * //
+// ------------------------------------------------------------------------ //
+template< typename T > // X = X + Q [Q = U, V, W, C]
+void nse::c_add(T* _RESTRICT X, const T* _RESTRICT const Q,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	const int opidx = (node == nodeU) ? grid.nyz :
+		(node == nodeV) ? grid.nz :
+		(node == nodeW) ? 1 :
+		0;	// default: [nodeC]
+
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( X )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+				X[idx] += (T) 0.5 * (Q[idx] + Q[idx + opidx]);
+		}
+	}
+}
+
+template< typename T > // X = X - Q [Q = U, V, W, C]
+void nse::c_sub(T* _RESTRICT X, const T* _RESTRICT const Q,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	const int opidx = (node == nodeU) ? grid.nyz :
+		(node == nodeV) ? grid.nz :
+		(node == nodeW) ? 1 :
+		0;	// default: [nodeC]
+
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( X )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+				X[idx] -= (T) 0.5 * (Q[idx] + Q[idx + opidx]);
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// * field products * //
+// ------------------------------------------------------------------------ //
+template< typename T >	// = U * U [node: C, U, UV, UW]
+void nse::u_square(T* _RESTRICT U2, const T* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	const int opidx = (node == nodeC) ? grid.nyz :
+		(node == nodeUV) ? -grid.nz :
+		(node == nodeUW) ? -1 :
+		0;	// default: [nodeU]
+
+	const int ish = ((node == nodeU) || (node == nodeUV) || (node == nodeUW)) ? 1 : 0;
+	const int ksh = (node == nodeUW) ? 1 : 0;
+
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( U2 ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx + ish; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( U2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx + ish; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz + ksh; k++, idx++)
+				U2[idx] = U[idx + opidx] * U[idx];
+		}
+	}
+}
+
+template< typename T >	// = V * V [node: C, V, UV, VW]
+void nse::v_square(T* _RESTRICT V2, const T* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	const int opidx = (node == nodeC) ? grid.nz :
+		(node == nodeUV) ? -grid.nyz :
+		(node == nodeVW) ? -1 :
+		0;	// default: [nodeV]
+
+	const int jsh = ((node == nodeV) || (node == nodeUV) || (node == nodeVW)) ? 1 : 0;
+	const int ksh = (node == nodeVW) ? 1 : 0;
+
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( V2 ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy + jsh; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( V2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy + jsh; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz + ksh; k++, idx++)
+				V2[idx] = V[idx + opidx] * V[idx];
+		}
+	}
+}
+
// Computes the pointwise product W*W interpolated to the requested node.
// Supported nodes (per the tag below): C, W, and the edge nodes U, V, UW, VW.
// Any other node value falls through every branch and leaves W2 untouched.
template< typename T >	// = W * W [node: C, W, UW, VW]
void nse::w_square(T* _RESTRICT W2, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index, advanced incrementally in the 1D-parallel form
#endif

	// -- C and W nodes: product of the two z-adjacent samples (C),
	//    or the plain square with one extra z level (W)
	if ((node == nodeC) || (node == nodeW)) {

		const int opidx = (node == nodeC) ? 1 :
			0;	// default: [nodeW]

		const int ksh = (node == nodeW) ? 1 : 0;	// W node: one extra face in z

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( W2 ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz + ksh; k++, idx++)
					W2[idx] = W[idx + opidx] * W[idx];
			}
		}

		return;
	}

	// -- U node: product of x-averaged W at levels k and k+1
	if (node == nodeU) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( W2 ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					W2[idx] = (T)0.25 * 
					(W[idx] + W[idx - grid.nyz]) * 
					(W[idx + 1] + W[idx - grid.nyz + 1]);
			}
		}

		return;
	}

	// -- V node: product of y-averaged W at levels k and k+1
	if (node == nodeV) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( W2 ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					W2[idx] = (T)0.25 *
					(W[idx] + W[idx - grid.nz]) *
					(W[idx + 1] + W[idx - grid.nz + 1]);
			}
		}

		return;
	}

	// -- UW node: square of the x-averaged W
	if (node == nodeUW) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( W2 ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					W2[idx] = (T)0.25 *
					(W[idx] + W[idx - grid.nyz]) * 
					(W[idx] + W[idx - grid.nyz]);
			}
		}

		return;
	}

	// -- VW node: square of the y-averaged W
	if (node == nodeVW) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( W2 ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					W2[idx] = (T)0.25 *
					(W[idx] + W[idx - grid.nz]) *
					(W[idx] + W[idx - grid.nz]);
			}
		}

		return;
	}
}
+
+template< typename T >	// = C * C [node: C, U, V, W]
+void nse::c_square(T* _RESTRICT C2, const T* _RESTRICT const C,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	const int opidx = (node == nodeU) ? -grid.nyz :
+		(node == nodeV) ? -grid.nz :
+		(node == nodeW) ? -1 :
+		0;	// default: [nodeC]
+
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( C2 ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( C2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+				C2[idx] = C[idx + opidx] * C[idx];
+		}
+	}
+}
+
// Computes the product Ua * Ub interpolated to the requested node.
// nodeU is the native location (plain pointwise product); for the other
// supported nodes a symmetrized two-point product is used.
template< typename T >	// = Ua * Ub [node: C, U, UV, UW]
void nse::uu_product(T* _RESTRICT UU,
	const T* _RESTRICT const Ua, const T* _RESTRICT const Ub,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// offset of the paired sample used in the symmetrized product
	const int opidx = (node == nodeC) ? grid.nyz :
		(node == nodeUV) ? -grid.nz :
		(node == nodeUW) ? -1 :
		0;	// default: [nodeU]

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index, advanced incrementally in the 1D-parallel form
#endif

	// -- native node: no interpolation needed
	if (node == nodeU) {
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UU ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UU )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					UU[idx] = Ua[idx] * Ub[idx];
			}
		}
		return;
	}


	// -- staggered nodes: symmetrized cross product of the two adjacent samples
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UU ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UU )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
				UU[idx] = (T) 0.5 * (
				Ua[idx + opidx] * Ub[idx] + Ua[idx] * Ub[idx + opidx]);
		}
	}
}
+
// Computes the product Va * Vb interpolated to the requested node.
// nodeV is the native location (plain pointwise product); for the other
// supported nodes a symmetrized two-point product is used.
template< typename T >	// = Va * Vb [node: C, V, UV, VW]
void nse::vv_product(T* _RESTRICT VV,
	const T* _RESTRICT const Va, const T* _RESTRICT const Vb,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// offset of the paired sample used in the symmetrized product
	const int opidx = (node == nodeC) ? grid.nz :
		(node == nodeUV) ? -grid.nyz :
		(node == nodeVW) ? -1 :
		0;	// default: [nodeV]

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index, advanced incrementally in the 1D-parallel form
#endif

	// -- native node: no interpolation needed
	if (node == nodeV) {
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VV ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VV )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					VV[idx] = Va[idx] * Vb[idx];
			}
		}
		return;
	}


	// -- staggered nodes: symmetrized cross product of the two adjacent samples
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VV ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VV )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
				VV[idx] = (T) 0.5 * (
				Va[idx + opidx] * Vb[idx] + Va[idx] * Vb[idx + opidx]);
		}
	}
}
+
// Computes the product Wa * Wb interpolated to the requested node.
// nodeW is the native location (plain pointwise product); for the other
// supported nodes a symmetrized two-point product is used.
template< typename T >	// = Wa * Wb [node: C, W, UW, VW]
void nse::ww_product(T* _RESTRICT WW,
	const T* _RESTRICT const Wa, const T* _RESTRICT const Wb,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// offset of the paired sample used in the symmetrized product
	const int opidx = (node == nodeC) ? 1 :
		(node == nodeUW) ? -grid.nyz :
		(node == nodeVW) ? -grid.nz :
		0;	// default: [nodeW]

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index, advanced incrementally in the 1D-parallel form
#endif

	// -- native node: no interpolation needed
	if (node == nodeW) {
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( WW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( WW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					WW[idx] = Wa[idx] * Wb[idx];
			}
		}
		return;
	}


	// -- staggered nodes: symmetrized cross product of the two adjacent samples
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( WW ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( WW )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
				WW[idx] = (T) 0.5 * (
				Wa[idx + opidx] * Wb[idx] + Wa[idx] * Wb[idx + opidx]);
		}
	}
}
+
// Computes the product Ca * Cb interpolated to the requested node.
// nodeC is the native location (plain pointwise product); for the other
// supported nodes a symmetrized two-point product is used.
template< typename T >	// = Ca * Cb [node: C, U, V, W]
void nse::cc_product(T* _RESTRICT CC,
	const T* _RESTRICT const Ca, const T* _RESTRICT const Cb,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// offset of the paired sample used in the symmetrized product
	const int opidx = (node == nodeU) ? -grid.nyz :
		(node == nodeV) ? -grid.nz :
		(node == nodeW) ? -1 :
		0;	// default: [nodeC]

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index, advanced incrementally in the 1D-parallel form
#endif

	// -- native node: no interpolation needed
	if (node == nodeC) {
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CC ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CC )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					CC[idx] = Ca[idx] * Cb[idx];
			}
		}
		return;
	}


	// -- staggered nodes: symmetrized cross product of the two adjacent samples
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CC ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CC )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
				CC[idx] = (T) 0.5 * (
				Ca[idx + opidx] * Cb[idx] + Ca[idx] * Cb[idx + opidx]);
		}
	}
}
+
// Computes the mixed product U * V interpolated to the requested node.
// Each supported node uses its own averaging stencil (documented per branch);
// unsupported node values fall through and leave UV untouched.
template< typename T >	// = U * V [node: C, UV, UVW]
void nse::uv_product(T* _RESTRICT UV,
	const T* _RESTRICT const U, const T* _RESTRICT const V,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index, advanced incrementally in the 1D-parallel form
#endif

	if (node == nodeC) {	// 0.25 * (U[ijk] + U[i+1jk]) * (V[ijk] + V[ij+1k])
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UV ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UV )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					UV[idx] = (T) 0.25 *
					(U[idx] + U[idx + grid.nyz]) * (V[idx] + V[idx + grid.nz]);
			}
		}
		return;
	}

	if (node == nodeUV) {	// 0.25 * (U[ijk] + U[ij-1k]) * (V[ijk] + V[i-1jk])
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UV ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UV )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					UV[idx] = (T) 0.25 *
					(U[idx] + U[idx - grid.nz]) * (V[idx] + V[idx - grid.nyz]);
			}
		}
		return;
	}

	if (node == nodeUVW) {	// 0.25 * (U[ijk] + U[ij-1k] + U[ijk-1] + U[ij-1k-1]) * 
							// 0.25 * (V[ijk] + V[i-1jk] + V[ijk-1] + V[i-1jk-1])
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UV ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UV )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					UV[idx] = (T) 0.25 * (T)0.25 *
					(U[idx] + U[idx - grid.nz] + U[idx - 1] + U[idx - grid.nz - 1]) * 
					(V[idx] + V[idx - grid.nyz] + V[idx - 1] + V[idx - grid.nyz - 1]);
			}
		}
		return;
	}
}
+
+template< typename T >	// = U * W [node: C, UW, UVW]
+void nse::uw_product(T* _RESTRICT UW,
+	const T* _RESTRICT const U, const T* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeC) {	// 0.25 * (U[ijk] + U[i+1jk]) * (W[ijk] + W[ijk+1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( UW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					UW[idx] = (T) 0.25 *
+					(U[idx] + U[idx + grid.nyz]) * (W[idx] + W[idx + 1]);
+			}
+		}
+		return;
+	}
+
+	if (node == nodeUW) {	// 0.25 * (U[ijk] + U[ijk-1]) * (W[ijk] + W[i-1jk])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( UW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					UW[idx] = (T) 0.25 *
+					(U[idx] + U[idx - 1]) * (W[idx] + W[idx - grid.nyz]);
+			}
+		}
+		return;
+	}
+
+	if (node == nodeUVW) {	// 0.25 * (U[ijk] + U[ijk-1] + U[ij-1k] + U[ij-1k-1]) * 
+							// 0.25 * (W[ijk] + W[i-1jk] + W[ij-1k] + W[i-1j-1k])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( UW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					UW[idx] = (T)0.25 * (T)0.25 *
+					(U[idx] + U[idx - 1] + U[idx - grid.nz] + U[idx - grid.nz - 1]) * 
+					(W[idx] + W[idx - grid.nyz] + W[idx - grid.nz] + W[idx - grid.nyz - grid.nz]);
+			}
+		}
+		return;
+	}
+}
+
+template< typename T >	// = V * W [node: C, VW, UVW]
+void nse::vw_product(T* _RESTRICT VW,
+	const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeC) {	// 0.25 * (V[ijk] + V[ij+1k]) * (W[ijk] + W[ijk+1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( VW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					VW[idx] = (T) 0.25 *
+					(V[idx] + V[idx + grid.nz]) * (W[idx] + W[idx + 1]);
+			}
+		}
+		return;
+	}
+
+	if (node == nodeVW) {	// 0.25 * (V[ijk] + V[ijk-1]) * (W[ijk] + W[ij-1k])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( VW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					VW[idx] = (T) 0.25 *
+					(V[idx] + V[idx - 1]) * (W[idx] + W[idx - grid.nz]);
+			}
+		}
+		return;
+	}
+
+	if (node == nodeUVW) {	// 0.25 * (V[ijk] + V[ijk-1] + V[i-1jk] + V[i-1jk-1]) * 
+							// 0.25 * (W[ijk] + W[ij-1k] + W[i-1jk] + W[i-1j-1k])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( VW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					VW[idx] = (T)0.25 * (T)0.25 *
+					(V[idx] + V[idx - 1] + V[idx - grid.nyz] + V[idx - grid.nyz - 1]) * 
+					(W[idx] + W[idx - grid.nz] + W[idx - grid.nyz] + W[idx - grid.nyz - grid.nz]);
+			}
+		}
+		return;
+	}
+}
+
+template< typename T > // = C * U [node: C, U, UW]
+void nse::cu_product(T* _RESTRICT CU,
+	const T* _RESTRICT const C, const T* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeC) {	// 0.5 * C[ijk] * (U[ijk] + U[i+1jk])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CU ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CU )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CU[idx] = (T) 0.5 * C[idx] * (U[idx] + U[idx + grid.nyz]);
+			}
+		}
+		return;
+	}
+
+	if (node == nodeU) {	// 0.5 * U[ijk] * (C[ijk] + C[i-1jk])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CU ) collapse( 2 )
+		for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CU )
+		for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CU[idx] = (T) 0.5 * U[idx] * (C[idx] + C[idx - grid.nyz]);
+			}
+		}
+		return;
+	}
+
+	if (node == nodeUW) {	// 0.125 * (U[ijk] + U[ijk-1]) * (C[ijk] + C[i-1jk] + C[ijk-1] + C[i-1jk-1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CU ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CU )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CU[idx] = (T) 0.125 * (U[idx] + U[idx - 1]) *
+					(C[idx] + C[idx - grid.nyz] + C[idx - 1] + C[idx - grid.nyz - 1]);
+			}
+		}
+		return;
+	}
+}
+
+template< typename T > // = C * V [node: C, V, VW]
+void nse::cv_product(T* _RESTRICT CV,
+	const T* _RESTRICT const C, const T* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeC) {	// 0.5 * C[ijk] * (V[ijk] + V[ij+1k])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CV ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CV )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CV[idx] = (T) 0.5 * C[idx] * (V[idx] + V[idx + grid.nz]);
+			}
+		}
+		return;
+	}
+
+	if (node == nodeV) {	// 0.5 * V[ijk] * (C[ijk] + C[ij-1k])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CV ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j <= grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CV )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j <= grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CV[idx] = (T) 0.5 * V[idx] * (C[idx] + C[idx - grid.nz]);
+			}
+		}
+		return;
+	}
+
+	if (node == nodeVW) {	// 0.125 * (V[ijk] + V[ijk-1]) * (C[ijk] + C[ij-1k] + C[ijk-1] + C[ij-1k-1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CV ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CV )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CV[idx] = (T) 0.125 * (V[idx] + V[idx - 1]) *
+					(C[idx] + C[idx - grid.nz] + C[idx - 1] + C[idx - grid.nz - 1]);
+			}
+		}
+		return;
+	}
+}
+
+template< typename T > // = C * W [node: C, W, UW, VW]
+void nse::cw_product(T* _RESTRICT CW,
+	const T* _RESTRICT const C, const T* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeC) {	// 0.5 * C[ijk] * (W[ijk] + W[ijk+1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CW[idx] = (T) 0.5 * C[idx] * (W[idx] + W[idx + 1]);
+			}
+		}
+		return;
+	}
+
+	if (node == nodeW) {	// 0.5 * W[ijk] * (C[ijk] + C[ijk-1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++)
+					CW[idx] = (T) 0.5 * W[idx] * (C[idx] + C[idx - 1]);
+			}
+		}
+		return;
+	}
+
+	if (node == nodeUW) {	// 0.125 * (W[ijk] + W[i-1jk]) * (C[i-1jk] + C[i-1jk-1] + C[ijk] + C[ijk-1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CW[idx] = (T) 0.125 *
+					(W[idx] + W[idx - grid.nyz]) *
+					(C[idx - grid.nyz] + C[idx - grid.nyz - 1] + C[idx] + C[idx - 1]);
+			}
+		}
+		return;
+	}
+
+	if (node == nodeVW) {	// 0.125 * (W[ijk] + W[ij-1k]) * (C[ij-1k] + C[ij-1k-1] + C[ijk] + C[ijk-1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CW[idx] = (T) 0.125 *
+					(W[idx] + W[idx - grid.nz]) *
+					(C[idx - grid.nz] + C[idx - grid.nz - 1] + C[idx] + C[idx - 1]);
+			}
+		}
+		return;
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * square(field) - field products * //
+// ------------------------------------------------------------------------ //
+template< typename T >	// = U * U * W [node: UW]
+void nse::u2w_product(T* _RESTRICT U2W,
+	const T* _RESTRICT const U, const T* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeUW) {	// 0.5 * U[ijk-1] * U[ijk] * (W[ijk] + W[i-1jk])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( U2W ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( U2W )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					U2W[idx] = (T) 0.5 * U[idx - 1] * U[idx] *
+					(W[idx] + W[idx - grid.nyz]);
+			}
+		}
+		return;
+	}
+}
+
+template< typename T >	// = U * U * V [node: UV]
+void nse::u2v_product(T* _RESTRICT U2V,
+	const T* _RESTRICT const U, const T* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeUV) {	// 0.5 * U[ij-1k] * U[ijk] * (V[ijk] + V[i-1jk])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( U2V ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( U2V )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					U2V[idx] = (T) 0.5 * U[idx - grid.nz] * U[idx] *
+					(V[idx] + V[idx - grid.nyz]);
+			}
+		}
+		return;
+	}
+}
+
+template< typename T >	// = U * U * U [node: C]
+void nse::u2u_product(T* _RESTRICT U2U,
+	const T* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeC) {	// 0.5 * U[i+1jk] * U[ijk] * (U[ijk] + U[i+1jk])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( U2U ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( U2U )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					U2U[idx] = (T) 0.5 * U[idx + grid.nyz] * U[idx] *
+					(U[idx] + U[idx + grid.nyz]);
+			}
+		}
+		return;
+			}
+		}
+
+template< typename T >	// = V * V * U [node: UV]
+void nse::v2u_product(T* _RESTRICT V2U,
+	const T* _RESTRICT const V, const T* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeUV) {	// 0.5 * V[i-1jk] * V[ijk] * (U[ijk] + U[ij-1k])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( V2U ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( V2U )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					V2U[idx] = (T) 0.5 * V[idx - grid.nyz] * V[idx] *
+					(U[idx] + U[idx - grid.nz]);
+			}
+		}
+		return;
+	}
+}
+
+template< typename T >	// = V * V * W [node: VW]
+void nse::v2w_product(T* _RESTRICT V2W,
+	const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeVW) {	// 0.5 * V[ijk-1] * V[ijk] * (W[ijk] + W[ij-1k])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( V2W ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( V2W )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					V2W[idx] = (T) 0.5 * V[idx - 1] * V[idx] *
+					(W[idx] + W[idx - grid.nz]);
+			}
+		}
+		return;
+	}
+}
+
+template< typename T >	// = V * V * V [node: C]
+void nse::v2v_product(T* _RESTRICT V2V,
+	const T* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeC) {	// 0.5 * V[ij+1k] * V[ijk] * (V[ijk] + V[ij+1k])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( V2V ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( V2V )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					V2V[idx] = (T) 0.5 * V[idx + grid.nz] * V[idx] *
+					(V[idx] + V[idx + grid.nz]);
+			}
+		}
+		return;
+			}
+		}
+
+template< typename T >	// = W * W * U [node: UW]
+void nse::w2u_product(T* _RESTRICT W2U,
+	const T* _RESTRICT const W, const T* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeUW) {	// 0.5 * W[i-1jk] * W[ijk] * (U[ijk] + U[ijk-1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( W2U ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W2U )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					W2U[idx] = (T) 0.5 * W[idx - grid.nyz] * W[idx] *
+					(U[idx] + U[idx - 1]);
+			}
+		}
+		return;
+	}
+}
+
+template< typename T >	// = W * W * V [node: VW]
+void nse::w2v_product(T* _RESTRICT W2V,
+	const T* _RESTRICT const W, const T* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeVW) {	// 0.5 * W[ij-1k] * W[ijk] * (V[ijk] + V[ijk-1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( W2V ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W2V )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					W2V[idx] = (T) 0.5 * W[idx - grid.nz] * W[idx] *
+					(V[idx] + V[idx - 1]);
+			}
+		}
+		return;
+	}
+}
+
+template< typename T >	// = W * W * W [node: C]
+void nse::w2w_product(T* _RESTRICT W2W,
+	const T* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeC) {	// 0.5 * W[ijk+1] * W[ijk] * (W[ijk] + W[ijk+1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( W2W ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W2W )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					W2W[idx] = (T) 0.5 * W[idx + 1] * W[idx] *
+					(W[idx] + W[idx + 1]);
+			}
+		}
+		return;
+	}
+}
+
template< typename T >	// = U * W * W [node: U]
void nse::uww_product(T* _RESTRICT UWW,
	const T* _RESTRICT const U, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// Triple product U*W*W interpolated to the U node, assembled as the sum of
	// the two symmetrized halves sketched in the stencil diagrams below.
	// Only interior cells (outside the ghost-cell frame) are written; any node
	// type other than nodeU leaves UWW untouched.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// start index of the current (i, j) pencil along z
#endif


	if (node == nodeU) {
		//         __________1x  ___1z
		//         ___1z  ___1z  ___1z
		// 1/2 * [  W   *  W   *  U    ] +
		//
		//         __________1z  ___1z
		//         ___1x  ___1z  ___1x
		// 1/2 * [  W   *  U   *  W    ]

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UWW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UWW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					// first term of the diagram above
					UWW[idx] =
						(T)0.25 * (
						(T)0.25 * (W[idx] + W[idx + 1]) * (W[idx] + W[idx + 1]) +
						(T)0.25 * (W[idx - grid.nyz] + W[idx - grid.nyz + 1]) * (W[idx - grid.nyz] + W[idx - grid.nyz + 1])
						) * (T)0.25 * (U[idx + 1] + (T)2.0 * U[idx] + U[idx - 1])

						+

						// second term of the diagram above
						(T)0.25 * (
						(T)0.25 * (W[idx + 1] + W[idx - grid.nyz + 1]) * (U[idx] + U[idx + 1]) +
						(T)0.25 * (W[idx] + W[idx - grid.nyz]) * (U[idx] + U[idx - 1])
						) * (T)0.25 * (W[idx] + W[idx + 1] + W[idx - grid.nyz] + W[idx - grid.nyz + 1]);
				}
			}
		}
		return;
	}
}
+
template< typename T >	// = V * W * W [node: V]
void nse::vww_product(T* _RESTRICT VWW,
	const T* _RESTRICT const V, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// Triple product V*W*W interpolated to the V node, assembled as the sum of
	// the two symmetrized halves sketched in the stencil diagrams below.
	// Only interior cells (outside the ghost-cell frame) are written; any node
	// type other than nodeV leaves VWW untouched.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// start index of the current (i, j) pencil along z
#endif

	if (node == nodeV) {
		//         __________1y  ___1z
		//         ___1z  ___1z  ___1z
		// 1/2 * [  W   *  W   *  V    ] +
		//
		//         __________1z  ___1z
		//         ___1y  ___1z  ___1y
		// 1/2 * [  W   *  V   *  W    ]

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VWW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VWW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					// first term of the diagram above
					VWW[idx] =
						(T)0.25 * (
						(T)0.25 * (W[idx] + W[idx + 1]) * (W[idx] + W[idx + 1]) +
						(T)0.25 * (W[idx - grid.nz] + W[idx - grid.nz + 1]) * (W[idx - grid.nz] + W[idx - grid.nz + 1])
						) * (T)0.25 * (V[idx + 1] + (T)2.0 * V[idx] + V[idx - 1])

						+

						// second term of the diagram above
						(T)0.25 * (
						(T)0.25 * (W[idx + 1] + W[idx - grid.nz + 1]) * (V[idx] + V[idx + 1]) +
						(T)0.25 * (W[idx] + W[idx - grid.nz]) * (V[idx] + V[idx - 1])
						) * (T)0.25 * (W[idx] + W[idx + 1] + W[idx - grid.nz] + W[idx - grid.nz + 1]);
				}
			}
		}
		return;
	}
}
+
template< typename T >	// = U * V * W [node: UVW]
void nse::uvw_product(T* _RESTRICT UVW,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// Triple product U*V*W interpolated to the UVW (corner) node, assembled as
	// the sum of the two symmetrized halves sketched in the diagrams below.
	// Only interior cells (outside the ghost-cell frame) are written; any node
	// type other than nodeUVW leaves UVW untouched.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// start index of the current (i, j) pencil along z
#endif

	if (node == nodeUVW) {
		//         __________1x  ___1z
		//         ___1z  ___1y  ___1y
		// 1/2 * [  V   *  W   *  U    ] +
		//
		//         __________1y  ___1z
		//         ___1z  ___1x  ___1x
		// 1/2 * [  U   *  W   *  V    ]

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UVW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UVW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					// first term of the diagram above
					UVW[idx] =
						(T)0.5 * (
						(T)0.125 * (
						(W[idx] + W[idx - grid.nz]) * (V[idx] + V[idx - 1]) +
						(W[idx - grid.nyz] + W[idx - grid.nyz - grid.nz]) * (V[idx - grid.nyz] + V[idx - grid.nyz - 1])
						) *
						(T)0.25 * (U[idx] + U[idx - grid.nz] + U[idx - 1] + U[idx - grid.nz - 1])
						)

						+

						// second term of the diagram above
						(T)0.5 * (
						(T)0.125 * (
						(W[idx] + W[idx - grid.nyz]) * (U[idx] + U[idx - 1]) +
						(W[idx - grid.nz] + W[idx - grid.nyz - grid.nz]) * (U[idx - grid.nz] + U[idx - grid.nz - 1])
						) *
						(T)0.25 * (V[idx] + V[idx - grid.nyz] + V[idx - 1] + V[idx - grid.nyz - 1])
						);
				}
			}
		}
		return;
	}
}
+
+template< typename T >	// = C * C * U [node: U]
+void nse::c2u_product(T* _RESTRICT C2U,
+	const T* _RESTRICT const C, const T* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeU) {	// C[i-1jk] * C[ijk] * U[ijk]
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( C2U ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( C2U )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					C2U[idx] = C[idx - grid.nyz] * C[idx] * U[idx];
+			}
+		}
+		return;
+	}
+}
+
+template< typename T >	// = C * C * V [node: V]
+void nse::c2v_product(T* _RESTRICT C2V,
+	const T* _RESTRICT const C, const T* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeV) {	// C[ij-1k] * C[ijk] * V[ijk]
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( C2V ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( C2V )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					C2V[idx] = C[idx - grid.nz] * C[idx] * V[idx];
+			}
+		}
+		return;
+	}
+}
+
+template< typename T >	// = C * C * W [node: W]
+void nse::c2w_product(T* _RESTRICT C2W,
+	const T* _RESTRICT const C, const T* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeW) {	// C[ijk-1] * C[ijk] * W[ijk]
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( C2W ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( C2W )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					C2W[idx] = C[idx - 1] * C[idx] * W[idx];
+			}
+		}
+		return;
+	}
+}
+
+
template< typename T >	// = C * U * W [node: UW]
// Triple product C*U*W interpolated to UW (x-z edge) nodes.
// Only node == nodeUW is handled; any other node leaves CUW untouched.
void nse::cuw_product(T* _RESTRICT CUW,
	const T* _RESTRICT const C,
	const T* _RESTRICT const U, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the 1D OpenMP decomposition
#endif

	if (node == nodeUW) {
		//                       ___1z    _______1x
		//       ___1z    ___1x  ___1x        ___1z
		// 1/2 *  U   * [  W   *  C    +  W *  C    ]
		//

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CUW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CUW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				// index shifts: -grid.nyz == (i-1), -1 == (k-1)
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					CUW[idx] = (T)0.25 * (U[idx] + U[idx - 1]) * (

					(T)0.125 * (W[idx] + W[idx - grid.nyz]) *
					(C[idx] + C[idx - 1] + C[idx - grid.nyz] + C[idx - grid.nyz - 1]) +

					(T)0.25 * (W[idx] * (C[idx] + C[idx - 1]) + W[idx - grid.nyz] * (C[idx - grid.nyz] + C[idx - grid.nyz - 1]))

					);
			}
		}
		return;
	}
}
+
template< typename T >	// = C * V * W [node: VW]
// Triple product C*V*W interpolated to VW (y-z edge) nodes.
// Only node == nodeVW is handled; any other node leaves CVW untouched.
void nse::cvw_product(T* _RESTRICT CVW,
	const T* _RESTRICT const C,
	const T* _RESTRICT const V, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the 1D OpenMP decomposition
#endif

	if (node == nodeVW) {
		//                       ___1z    _______1y
		//       ___1z    ___1y  ___1y        ___1z
		// 1/2 *  V   * [  W   *  C    +  W *  C    ]
		//

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CVW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CVW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				// index shifts: -grid.nz == (j-1), -1 == (k-1)
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					CVW[idx] = (T)0.25 * (V[idx] + V[idx - 1]) * (

					(T)0.125 * (W[idx] + W[idx - grid.nz]) *
					(C[idx] + C[idx - 1] + C[idx - grid.nz] + C[idx - grid.nz - 1]) +

					(T)0.25 * (W[idx] * (C[idx] + C[idx - 1]) + W[idx - grid.nz] * (C[idx - grid.nz] + C[idx - grid.nz - 1]))

					);
			}
		}
		return;
	}
}
+
template< typename T >	// = C * W * W [node: C]
// Triple product C*W*W interpolated to cell-center (C) nodes.
// Only node == nodeC is handled; any other node leaves CWW untouched.
void nse::cww_product(T* _RESTRICT CWW,
	const T* _RESTRICT const C, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the 1D OpenMP decomposition
#endif

	if (node == nodeC) {
		//                       ___1z    _______1z
		//       ___1z    ___1z  ___1z        ___1z
		// 1/2 *  W   * [  W   *  C    +  W *  C    ]
		//

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CWW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CWW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				// index shifts: +1/-1 == (k+1)/(k-1); W straddles the cell in z
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					CWW[idx] = (T)0.25 * (W[idx] + W[idx + 1]) * (

					(T)0.125 * (W[idx] + W[idx + 1]) * (C[idx + 1] + (T)2.0 * C[idx] + C[idx - 1]) +

					(T)0.25 * (W[idx] * (C[idx] + C[idx - 1]) + W[idx + 1] * (C[idx] + C[idx + 1]))
					);
			}
		}
		return;
	}
}
+// ------------------------------------------------------------------------ //
+
+// * product partitions * //
+// ------------------------------------------------------------------------ //
template< typename T >	// = U * W [node: U (UW -- U), UV (UVW -- UV)]
// Splits the U*W product into bottom/top z-face contributions at the
// requested node. Supported nodes: U, UV, UW; any other node leaves
// UW_Bottom/UW_Top untouched.
void nse::uw_product_partition(T* _RESTRICT UW_Bottom, T* _RESTRICT UW_Top,
	const T* _RESTRICT const U, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the 1D OpenMP decomposition
#endif

	if (node == nodeU) {
		// 1/2 * U[ijk] * (W[ijk] + W[i-1jk])
		// 1/2 * U[ijk] * (W[ijk+1] + W[i-1jk+1])

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UW_Bottom, UW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UW_Bottom, UW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					UW_Bottom[idx] = (T) 0.5 * U[idx] * (W[idx - grid.nyz] + W[idx]);
					UW_Top[idx] = (T) 0.5 * U[idx] * (W[idx - grid.nyz + 1] + W[idx + 1]);
				}
			}
		}
		return;
	}

	if (node == nodeUV) {
		// 1/8 * (U[ijk] + U[ij-1k]) * (W[ijk] + W[i-1jk] + W[ij-1k] + W[i-1j-1k])
		// 1/8 * (U[ijk] + U[ij-1k]) * (W[ijk+1] + W[i-1jk+1] + W[ij-1k+1] + W[i-1j-1k+1])

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UW_Bottom, UW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UW_Bottom, UW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					UW_Bottom[idx] = (T)0.125 * (U[idx] + U[idx - grid.nz]) *
						(W[idx] + W[idx - grid.nyz] + W[idx - grid.nz] + W[idx - grid.nyz - grid.nz]);
					UW_Top[idx] = (T)0.125 * (U[idx] + U[idx - grid.nz]) *
						(W[idx + 1] + W[idx - grid.nyz + 1] + W[idx - grid.nz + 1] + W[idx - grid.nyz - grid.nz + 1]);
				}
			}
		}
		return;
	}

	if (node == nodeUW) {
		// 1/8 * (U[ijk] + U[ijk-1]) * (W[ijk] + W[i-1jk] + W[ijk-1] + W[i-1jk-1])
		// 1/8 * (U[ijk] + U[ijk-1]) * (W[ijk] + W[i-1jk] + W[ijk+1] + W[i-1jk+1])
		//
		// U*W-[bottom] at (gcz + 1, nz - gcz) nodes
		// U*W-[top] at (gcz, nz - gcz - 1) nodes
		//

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UW_Bottom, UW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UW_Bottom, UW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				// note: inclusive upper bound -- one extra z-layer is written,
				// matching the staggered node ranges documented above
				for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

					UW_Bottom[idx] = (T)0.125 * (U[idx] + U[idx - 1]) *
						(W[idx] + W[idx - grid.nyz] + W[idx - 1] + W[idx - grid.nyz - 1]);
					UW_Top[idx] = (T)0.125 * (U[idx] + U[idx - 1]) *
						(W[idx] + W[idx - grid.nyz] + W[idx + 1] + W[idx - grid.nyz + 1]);
				}
			}
		}
		return;
	}
}
+
template< typename T >	// = V * W [node: between VW and V]
// Splits the V*W product into bottom/top z-face contributions at the
// requested node. Supported nodes: V, UV, VW; any other node leaves
// VW_Bottom/VW_Top untouched.
void nse::vw_product_partition(T* _RESTRICT VW_Bottom, T* _RESTRICT VW_Top,
	const T* _RESTRICT const V, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the 1D OpenMP decomposition
#endif

	if (node == nodeV) {
		// 1/2 * V[ijk] * (W[ijk] + W[ij-1k])
		// 1/2 * V[ijk] * (W[ijk+1] + W[ij-1k+1])

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VW_Bottom, VW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VW_Bottom, VW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					VW_Bottom[idx] = (T) 0.5 * V[idx] * (W[idx - grid.nz] + W[idx]);
					VW_Top[idx] = (T) 0.5 * V[idx] * (W[idx - grid.nz + 1] + W[idx + 1]);
				}
			}
		}
		return;
	}

	if (node == nodeUV) {
		// 1/8 * (V[ijk] + V[i-1jk]) * (W[ijk] + W[i-1jk] + W[ij-1k] + W[i-1j-1k])
		// 1/8 * (V[ijk] + V[i-1jk]) * (W[ijk+1] + W[i-1jk+1] + W[ij-1k+1] + W[i-1j-1k+1])

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VW_Bottom, VW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VW_Bottom, VW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					VW_Bottom[idx] = (T)0.125 * (V[idx] + V[idx - grid.nyz]) *
						(W[idx] + W[idx - grid.nyz] + W[idx - grid.nz] + W[idx - grid.nyz - grid.nz]);
					VW_Top[idx] = (T)0.125 * (V[idx] + V[idx - grid.nyz]) *
						(W[idx + 1] + W[idx - grid.nyz + 1] + W[idx - grid.nz + 1] + W[idx - grid.nyz - grid.nz + 1]);
				}
			}
		}
		return;
	}

	if (node == nodeVW) {
		// 1/8 * (V[ijk] + V[ijk-1]) * (W[ijk] + W[ij-1k] + W[ijk-1] + W[ij-1k-1])
		// 1/8 * (V[ijk] + V[ijk-1]) * (W[ijk] + W[ij-1k] + W[ijk+1] + W[ij-1k+1])
		//
		// V*W-[bottom] at (gcz + 1, nz - gcz) nodes
		// V*W-[top] at (gcz, nz - gcz - 1) nodes
		//

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VW_Bottom, VW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VW_Bottom, VW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				// note: inclusive upper bound -- one extra z-layer is written,
				// matching the staggered node ranges documented above
				for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

					VW_Bottom[idx] = (T)0.125 * (V[idx] + V[idx - 1]) *
						(W[idx] + W[idx - grid.nz] + W[idx - 1] + W[idx - grid.nz - 1]);
					VW_Top[idx] = (T)0.125 * (V[idx] + V[idx - 1]) *
						(W[idx] + W[idx - grid.nz] + W[idx + 1] + W[idx - grid.nz + 1]);
				}
			}
		}
		return;
	}
}
+
template< typename T >	// = C * W [node: between W and C]
// Splits the C*W product into bottom/top z-face contributions at the
// requested node. Supported nodes: C, U, V, W; any other node leaves
// CW_Bottom/CW_Top untouched.
void nse::cw_product_partition(T* _RESTRICT CW_Bottom, T* _RESTRICT CW_Top,
	const T* _RESTRICT const C, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the 1D OpenMP decomposition
#endif

	if (node == nodeC) {
		// C[ijk] * W[ijk]
		// C[ijk] * W[ijk+1]

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CW_Bottom, CW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW_Bottom, CW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					CW_Bottom[idx] = C[idx] * W[idx];
					CW_Top[idx] = C[idx] * W[idx + 1];
				}
			}
		}
		return;
	}

	if (node == nodeU) {
		// 1/4 * (C[ijk] + C[i-1jk]) * (W[ijk] + W[i-1jk])
		// 1/4 * (C[ijk] + C[i-1jk]) * (W[ijk+1] + W[i-1jk+1])

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CW_Bottom, CW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW_Bottom, CW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					CW_Bottom[idx] = (T)0.25 * (C[idx] + C[idx - grid.nyz]) * (W[idx] + W[idx - grid.nyz]);
					CW_Top[idx] = (T)0.25 * (C[idx] + C[idx - grid.nyz]) * (W[idx + 1] + W[idx - grid.nyz + 1]);
				}
			}
		}
		return;
	}

	if (node == nodeV) {
		// 1/4 * (C[ijk] + C[ij-1k]) * (W[ijk] + W[ij-1k])
		// 1/4 * (C[ijk] + C[ij-1k]) * (W[ijk+1] + W[ij-1k+1])

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CW_Bottom, CW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW_Bottom, CW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					CW_Bottom[idx] = (T)0.25 * (C[idx] + C[idx - grid.nz]) * (W[idx] + W[idx - grid.nz]);
					CW_Top[idx] = (T)0.25 * (C[idx] + C[idx - grid.nz]) * (W[idx + 1] + W[idx - grid.nz + 1]);
				}
			}
		}
		return;
	}

	if (node == nodeW) {
		// 1/4 * (C[ijk] + C[ijk-1]) * (W[ijk] + W[ijk-1])
		// 1/4 * (C[ijk] + C[ijk-1]) * (W[ijk] + W[ijk+1])
		//
		// C*W-[bottom] at (gcz + 1, nz - gcz) nodes
		// C*W-[top] at (gcz, nz - gcz - 1) nodes
		//

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CW_Bottom, CW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW_Bottom, CW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				// note: inclusive upper bound -- one extra z-layer is written,
				// matching the staggered node ranges documented above
				for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

					CW_Bottom[idx] = (T)0.25 * (C[idx] + C[idx - 1]) * (W[idx] + W[idx - 1]);
					CW_Top[idx] = (T)0.25 * (C[idx] + C[idx - 1]) * (W[idx] + W[idx + 1]);
				}
			}
		}
		return;
	}
}
+// ------------------------------------------------------------------------ //
+
+
+// ------------------------------------------------------------------------ //
+// ------------------------------------------------------------------------ //
+
+
+// * initialize: node-add(sub) routines * //
+template void nse::c_add(float* _RESTRICT X, const float* _RESTRICT const Q,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::c_add(double* _RESTRICT X, const double* _RESTRICT const Q,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::c_sub(float* _RESTRICT X, const float* const Q,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::c_sub(double* _RESTRICT X, const double* const Q,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * initialize: field products * //
+template void nse::u_square(float* _RESTRICT U2, const float* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::u_square(double* _RESTRICT U2, const double* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::v_square(float* _RESTRICT V2, const float* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::v_square(double* _RESTRICT V2, const double* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::w_square(float* _RESTRICT W2, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::w_square(double* _RESTRICT W2, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::c_square(float* _RESTRICT C2, const float* _RESTRICT const C,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::c_square(double* _RESTRICT C2, const double* _RESTRICT const C,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::uu_product(float* _RESTRICT UU,
+	const float* _RESTRICT const Ua, const float* _RESTRICT const Ub,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::uu_product(double* _RESTRICT UU,
+	const double* _RESTRICT const Ua, const double* _RESTRICT const Ub,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::vv_product(float* _RESTRICT VV,
+	const float* _RESTRICT const Va, const float* _RESTRICT const Vb,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::vv_product(double* _RESTRICT VV,
+	const double* _RESTRICT const Va, const double* _RESTRICT const Vb,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::ww_product(float* _RESTRICT WW,
+	const float* _RESTRICT const Wa, const float* _RESTRICT const Wb,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::ww_product(double* _RESTRICT WW,
+	const double* _RESTRICT const Wa, const double* _RESTRICT const Wb,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::cc_product(float* _RESTRICT CC,
+	const float* _RESTRICT const Ca, const float* _RESTRICT const Cb,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cc_product(double* _RESTRICT CC,
+	const double* _RESTRICT const Ca, const double* _RESTRICT const Cb,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::uv_product(float* _RESTRICT UV,
+	const float* _RESTRICT const U, const float* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::uv_product(double* _RESTRICT UV,
+	const double* _RESTRICT const U, const double* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::uw_product(float* _RESTRICT UW,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::uw_product(double* _RESTRICT UW,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::vw_product(float* _RESTRICT VW,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::vw_product(double* _RESTRICT VW,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::cu_product(float* _RESTRICT CU,
+	const float* _RESTRICT const C, const float* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cu_product(double* _RESTRICT CU,
+	const double* _RESTRICT const C, const double* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::cv_product(float* _RESTRICT CV,
+	const float* _RESTRICT const C, const float* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cv_product(double* _RESTRICT CV,
+	const double* _RESTRICT const C, const double* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::cw_product(float* _RESTRICT CW,
+	const float* _RESTRICT const C, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cw_product(double* _RESTRICT CW,
+	const double* _RESTRICT const C, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: square(field) - field products * //
+template void nse::u2w_product(float* _RESTRICT U2W,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::u2w_product(double* _RESTRICT U2W,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::u2v_product(float* _RESTRICT U2V,
+	const float* _RESTRICT const U, const float* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::u2v_product(double* _RESTRICT U2V,
+	const double* _RESTRICT const U, const double* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::u2u_product(float* _RESTRICT U2U,
+	const float* _RESTRICT const U, const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::u2u_product(double* _RESTRICT U2U,
+	const double* _RESTRICT const U, const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::v2u_product(float* _RESTRICT V2U,
+	const float* _RESTRICT const V, const float* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::v2u_product(double* _RESTRICT V2U,
+	const double* _RESTRICT const V, const double* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::v2w_product(float* _RESTRICT V2W,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::v2w_product(double* _RESTRICT V2W,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::v2v_product(float* _RESTRICT V2V,
+	const float* _RESTRICT const V, const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::v2v_product(double* _RESTRICT V2V,
+	const double* _RESTRICT const V, const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::w2u_product(float* _RESTRICT W2U,
+	const float* _RESTRICT const W, const float* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::w2u_product(double* _RESTRICT W2U,
+	const double* _RESTRICT const W, const double* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::w2v_product(float* _RESTRICT W2V,
+	const float* _RESTRICT const W, const float* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::w2v_product(double* _RESTRICT W2V,
+	const double* _RESTRICT const W, const double* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::w2w_product(float* _RESTRICT W2W,
+	const float* _RESTRICT const W, const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::w2w_product(double* _RESTRICT W2W,
+	const double* _RESTRICT const W, const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::uww_product(float* _RESTRICT UWW,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::uww_product(double* _RESTRICT UWW,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::vww_product(float* _RESTRICT VWW,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::vww_product(double* _RESTRICT VWW,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::uvw_product(float* _RESTRICT UVW,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::uvw_product(double* _RESTRICT UVW,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+
+template void nse::c2u_product(float* _RESTRICT C2U,
+	const float* _RESTRICT const C, const float* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::c2u_product(double* _RESTRICT C2U,
+	const double* _RESTRICT const C, const double* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::c2v_product(float* _RESTRICT C2V,
+	const float* _RESTRICT const C, const float* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::c2v_product(double* _RESTRICT C2V,
+	const double* _RESTRICT const C, const double* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::c2w_product(float* _RESTRICT C2W,
+	const float* _RESTRICT const C, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::c2w_product(double* _RESTRICT C2W,
+	const double* _RESTRICT const C, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::cuw_product(float* _RESTRICT CUW,
+	const float* _RESTRICT const C,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cuw_product(double* _RESTRICT CUW,
+	const double* _RESTRICT const C,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::cvw_product(float* _RESTRICT CVW,
+	const float* _RESTRICT const C,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cvw_product(double* _RESTRICT CVW,
+	const double* _RESTRICT const C,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::cww_product(float* _RESTRICT CWW,
+	const float* _RESTRICT const C, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cww_product(double* _RESTRICT CWW,
+	const double* _RESTRICT const C, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: product partitions * //
+template void nse::uw_product_partition(float* _RESTRICT UW_Bottom, float* _RESTRICT UW_Top,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::uw_product_partition(double* _RESTRICT UW_Bottom, double* _RESTRICT UW_Top,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::vw_product_partition(float* _RESTRICT VW_Bottom, float* _RESTRICT VW_Top,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::vw_product_partition(double* _RESTRICT VW_Bottom, double* _RESTRICT VW_Top,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::cw_product_partition(float* _RESTRICT CW_Bottom, float* _RESTRICT CW_Top,
+	const float* _RESTRICT const C, const float* _RESTRICT const W, 
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cw_product_partition(double* _RESTRICT CW_Bottom, double* _RESTRICT CW_Top,
+	const double* _RESTRICT const C, const double* _RESTRICT const W, 
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+// ------------------------------------------------------------------------ //
diff --git a/nse-fops3d-x2.h b/nse-fops3d-x2.h
new file mode 100644
index 0000000000000000000000000000000000000000..bfd6a0e8395061b93e680637f9a33dcb424e14b8
--- /dev/null
+++ b/nse-fops3d-x2.h
@@ -0,0 +1,175 @@
+#pragma once
+
+// [nse-fops3d-x2.h(cpp)]: 3D Navier-Stokes module (field operations) -X2
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "grid3d.h"
+
+
+namespace nse
+{
+	// Declaration-only interface: the template definitions are expected in
+	// the matching nse-fops3d-x2.cpp translation unit (explicitly
+	// instantiated there for the supported scalar types).
+	// The 'node' argument selects the staggered-grid location the result
+	// is evaluated at; each function comment lists the supported nodes.
+
+	// * node-add(sub) routines * //
+	template< typename T > // X = X + Q [Q = U, V, W, C]
+	void c_add(T* _RESTRICT X, const T* _RESTRICT const Q,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T > // X = X - Q [Q = U, V, W, C]
+	void c_sub(T* _RESTRICT X, const T* _RESTRICT const Q,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * field products * //
+	template< typename T >	// = U * U [node: C, U, UV, UW]
+	void u_square(T* _RESTRICT U2, const T* _RESTRICT const U,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = V * V [node: C, V, UV, VW]
+	void v_square(T* _RESTRICT V2, const T* _RESTRICT const V,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = W * W [node: C, U, V, W, UW, VW]
+	void w_square(T* _RESTRICT W2, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = C * C [node: C, U, V, W]
+	void c_square(T* _RESTRICT C2, const T* _RESTRICT const C,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T >	// = Ua * Ub [node: C, U, UV, UW]
+	void uu_product(T* _RESTRICT UU,
+		const T* _RESTRICT const Ua, const T* _RESTRICT const Ub,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = V1 * V2 [node: C, V, UV, VW]
+	void vv_product(T* _RESTRICT VV,
+		const T* _RESTRICT const Va, const T* _RESTRICT const Vb,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = W1 * W2 [node: C, W, UW, VW]
+	void ww_product(T* _RESTRICT WW,
+		const T* _RESTRICT const Wa, const T* _RESTRICT const Wb,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = C1 * C2 [node: C, U, V, W]
+	void cc_product(T* _RESTRICT CC,
+		const T* _RESTRICT const Ca, const T* _RESTRICT const Cb,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T >	// = U * V [node: C, UV, UVW]
+	void uv_product(T* _RESTRICT UV,
+		const T* _RESTRICT const U, const T* _RESTRICT const V,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = U * W [node: C, UW, UVW]
+	void uw_product(T* _RESTRICT UW,
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = V * W [node: C, VW, UVW]
+	void vw_product(T* _RESTRICT VW,
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T > // = C * U [node: C, U, UW]
+	void cu_product(T* _RESTRICT CU,
+		const T* _RESTRICT const C, const T* _RESTRICT const U,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T > // = C * V [node: C, V, VW]
+	void cv_product(T* _RESTRICT CV,
+		const T* _RESTRICT const C, const T* _RESTRICT const V,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T > // = C * W [node: C, W, UW, VW]
+	void cw_product(T* _RESTRICT CW,
+		const T* _RESTRICT const C, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+
+	// * square(field) - field products * //
+	template< typename T >	// = U * U * W [node: UW]
+	void u2w_product(T* _RESTRICT U2W,
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = U * U * V [node: UV]
+	void u2v_product(T* _RESTRICT U2V,
+		const T* _RESTRICT const U, const T* _RESTRICT const V,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = U * U * U [node: C]
+	void u2u_product(T* _RESTRICT U2U, const T* _RESTRICT const U,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T >	// = V * V * U [node: UV]
+	void v2u_product(T* _RESTRICT V2U,
+		const T* _RESTRICT const V, const T* _RESTRICT const U,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = V * V * W [node: VW]
+	void v2w_product(T* _RESTRICT V2W,
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = V * V * V [node: C]
+	void v2v_product(T* _RESTRICT V2V, const T* _RESTRICT const V,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T >	// = W * W * U [node: UW]
+	void w2u_product(T* _RESTRICT W2U,
+		const T* _RESTRICT const W, const T* _RESTRICT const U,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = W * W * V [node: VW]
+	void w2v_product(T* _RESTRICT W2V,
+		const T* _RESTRICT const W, const T* _RESTRICT const V,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = W * W * W [node: C]
+	void w2w_product(T* _RESTRICT W2W, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T >	// = U * W * W [node: U]
+	void uww_product(T* _RESTRICT UWW,
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = V * W * W [node: V]
+	void vww_product(T* _RESTRICT VWW,
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = U * V * W [node: UVW]
+	void uvw_product(T* _RESTRICT UVW,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+
+	template< typename T >	// = C * C * U [node: U]
+	void c2u_product(T* _RESTRICT C2U,
+		const T* _RESTRICT const C, const T* _RESTRICT const U,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = C * C * V [node: V]
+	void c2v_product(T* _RESTRICT C2V,
+		const T* _RESTRICT const C, const T* _RESTRICT const V,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = C * C * W [node: W]
+	void c2w_product(T* _RESTRICT C2W,
+		const T* _RESTRICT const C, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T >	// = C * U * W [node: UW]
+	void cuw_product(T* _RESTRICT CUW,
+		const T* _RESTRICT const C,
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = C * V * W [node: VW]
+	void cvw_product(T* _RESTRICT CVW,
+		const T* _RESTRICT const C,
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = C * W * W [node: C]
+	void cww_product(T* _RESTRICT CWW,
+		const T* _RESTRICT const C, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+
+	// Partitioned products: each routine writes two output fields
+	// (*_Bottom, *_Top). The (A -- B) annotations below appear to pair the
+	// source nodes used for the two halves at each target node -- confirm
+	// the exact Bottom/Top stencils against the implementation .cpp.
+
+	template< typename T >	// = U * W [node: U (UW -- U), UV (UVW -- UV), UW (U -- UW)]
+	void uw_product_partition(T* _RESTRICT UW_Bottom, T* _RESTRICT UW_Top,
+		const T* _RESTRICT const U, const T* _RESTRICT const W, 
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T >	// = V * W [node: V (VW -- V), UV (UVW -- UV), VW (V -- VW)]
+	void vw_product_partition(T* _RESTRICT VW_Bottom, T* _RESTRICT VW_Top,
+		const T* _RESTRICT const V, const T* _RESTRICT const W, 
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T >	// = C * W [node: C (W -- C), U (UW -- U), V (VW -- V), W (C -- W)]
+	void cw_product_partition(T* _RESTRICT CW_Bottom, T* _RESTRICT CW_Top,
+		const T* _RESTRICT const C, const T* _RESTRICT const W, 
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+}
diff --git a/nse-fops3d-x4.cpp b/nse-fops3d-x4.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3bf71b0c109776975a899e7a616cdededcc985f0
--- /dev/null
+++ b/nse-fops3d-x4.cpp
@@ -0,0 +1,1852 @@
+#include "nse-fops3d-x4.h"
+
+
+using namespace nse::nse_const3d;
+
+// * node-add(sub) routines * //
+// ------------------------------------------------------------------------ //
+template< typename T > // X = X + Q [Q = U, V, W, C]
+void nse::c_add_x4(T* _RESTRICT X, const T* _RESTRICT const Q,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	// Adds to X the 4th-order interpolation of Q taken from the staggered
+	// node 'node'. Stencil: Q[-1], Q[0], Q[+1], Q[+2] along the staggered
+	// direction with weights -1/16, 9/16, 9/16, -1/16.
+	// 'opidx' is the linear-index stride of that direction:
+	//   U -> x (nyz), V -> y (nz), W -> z (1);
+	//   C -> 0, which collapses the stencil to X += Q exactly.
+	const int opidx = (node == nodeU) ? grid.nyz :
+		(node == nodeV) ? grid.nz :
+		(node == nodeW) ? 1 :
+		0;	// default: [nodeC]
+
+	const T C1 = (T) 9.0 / (T) 16.0,
+		C2 = (T) 1.0 / (T) 16.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	// Two interchangeable loop layouts over the interior cells: either
+	// collapse(2) over (i, j), or a 1D parallel i-loop with manual row
+	// index bookkeeping (shidx); both visit the same (i, j, k) set.
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( X )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+				X[idx] += C1 * (Q[idx] + Q[idx + opidx]) -
+				C2 * (Q[idx + 2 * opidx] + Q[idx - opidx]);
+		}
+	}
+}
+
+template< typename T > // X = X - Q [Q = U, V, W, C]
+void nse::c_sub_x4(T* _RESTRICT X, const T* _RESTRICT const Q,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	// Subtracts from X the 4th-order interpolation of Q taken from the
+	// staggered node 'node' (mirror of c_add_x4: same stencil and weights
+	// -1/16, 9/16, 9/16, -1/16; only the sign of the update differs).
+	// 'opidx' is the linear-index stride of the staggered direction;
+	// nodeC gives opidx = 0, collapsing the update to X -= Q exactly.
+	const int opidx = (node == nodeU) ? grid.nyz :
+		(node == nodeV) ? grid.nz :
+		(node == nodeW) ? 1 :
+		0;	// default: [nodeC]
+
+	const T C1 = (T) 9.0 / (T) 16.0,
+		C2 = (T) 1.0 / (T) 16.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	// Same dual loop layout as c_add_x4: collapse(2) over (i, j) or a 1D
+	// parallel i-loop with manual row index bookkeeping (shidx).
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( X )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+				X[idx] -= C1 * (Q[idx] + Q[idx + opidx]) -
+				C2 * (Q[idx + 2 * opidx] + Q[idx - opidx]);
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// * field products * //
+// ------------------------------------------------------------------------ //
+template< typename T >	// = W * W [node: C, W, UW, VW]
+void nse::w_square_x4(T* _RESTRICT W2, const T* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	// Computes W2 = W * W evaluated at 'node' with 4th-order stencils.
+	// Convention (see the nodeC branch): at a node offset in z, the
+	// "square" is the product of adjacent values W[k] * W[k+1], not the
+	// square of their mean. The (T)0.25 prefactor combined with
+	// C1 = 9/8, _3C2 = 1/8 yields the standard 9/16, -1/16 interpolation
+	// weights per factor (0.25 * A * B == (A/2) * (B/2)).
+	// An unsupported node leaves W2 unmodified.
+	const T C1 = (T) 9.0 / (T) 8.0,
+		_3C2 = (T) 1.0 / (T) 8.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if ((node == nodeC) || (node == nodeW)) {
+
+		// nodeC: W2 = W[k] * W[k+1]; nodeW: W2 = W[k] * W[k] (opidx = 0)
+		const int opidx = (node == nodeC) ? 1 :
+			0;	// default: [nodeW]
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( W2 ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					W2[idx] = W[idx + opidx] * W[idx];
+			}
+		}
+
+		return;
+	}
+
+	if (node == nodeU) {
+
+		// product of the 4th-order x-interpolations of W at adjacent
+		// z-levels (k and k+1), matching the nodeC W[k] * W[k+1] convention
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( W2 ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					W2[idx] = (T)0.25 *
+					(C1 * (W[idx] + W[idx - grid.nyz]) - _3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz])) *
+					(C1 * (W[idx + 1] + W[idx - grid.nyz + 1]) - _3C2 * (W[idx + grid.nyz + 1] + W[idx - 2 * grid.nyz + 1]));
+			}
+		}
+
+		return;
+	}
+
+	if (node == nodeV) {
+
+		// same as nodeU but interpolated in y (strides of grid.nz)
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( W2 ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					W2[idx] = (T)0.25 *
+					(C1 * (W[idx] + W[idx - grid.nz]) - _3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz])) *
+					(C1 * (W[idx + 1] + W[idx - grid.nz + 1]) - _3C2 * (W[idx + grid.nz + 1] + W[idx - 2 * grid.nz + 1]));
+			}
+		}
+
+		return;
+	}
+
+	if (node == nodeUW) {
+
+		// square of the 4th-order x-interpolation of W (no z-offset needed:
+		// UW shares the W position in z, so both factors are identical)
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( W2 ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					W2[idx] = (T)0.25 *
+					(C1 * (W[idx] + W[idx - grid.nyz]) - _3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz])) *
+					(C1 * (W[idx] + W[idx - grid.nyz]) - _3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz]));
+			}
+		}
+
+		return;
+	}
+
+	if (node == nodeVW) {
+
+		// square of the 4th-order y-interpolation of W (both factors identical)
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( W2 ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					W2[idx] = (T)0.25 *
+					(C1 * (W[idx] + W[idx - grid.nz]) - _3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz])) *
+					(C1 * (W[idx] + W[idx - grid.nz]) - _3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz]));
+			}
+		}
+
+		return;
+	}
+}
+
+template< typename T >	// = U * V [node: C, UV, UVW]
+void nse::uv_product_x4(T* _RESTRICT UV,
+	const T* _RESTRICT const U, const T* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	// Computes UV = U * V evaluated at 'node'; each factor is moved to the
+	// target node with a 4th-order interpolation (the 0.25 prefactor plus
+	// C1 = 9/8, _3C2 = 1/8 give 9/16, -1/16 weights per factor).
+	// An unsupported node leaves UV unmodified.
+	const T C1 = (T) 9.0 / (T) 8.0,
+		_3C2 = (T) 1.0 / (T) 8.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeC) {	// 0.25 * (U[i(-1)jk] + U[i+1(+2)jk]) * (V[ij(-1)k] + V[ij+1(+2)k])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( UV ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UV )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					UV[idx] = (T) 0.25 *
+					(C1 * (U[idx] + U[idx + grid.nyz]) - _3C2 * (U[idx + 2 * grid.nyz] + U[idx - grid.nyz])) * 
+					(C1 * (V[idx] + V[idx + grid.nz]) - _3C2 * (V[idx + 2 * grid.nz] + V[idx - grid.nz]));
+			}
+		}
+		return;
+	}
+
+	if (node == nodeUV) {	// 0.25 * (U[ij(+1)k] + U[ij-1(-2)k]) * (V[i(+1)jk] + V[i-1(-2)jk])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( UV ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UV )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					UV[idx] = (T) 0.25 *
+					(C1 * (U[idx] + U[idx - grid.nz]) - _3C2 * (U[idx + grid.nz] + U[idx - 2 * grid.nz])) *
+					(C1 * (V[idx] + V[idx - grid.nyz]) - _3C2 * (V[idx + grid.nyz] + V[idx - 2 * grid.nyz]));
+			}
+		}
+		return;
+	}
+
+	if (node == nodeUVW) {	// 0.25 * (U[ij(+1)k] + U[ij-1(-2)k] + U[ij(+1)k-1] + U[ij-1(-2)k-1]) * 
+							// 0.25 * (V[i(+1)jk] + V[i-1(-2)jk] + V[i(+1)jk-1] + V[i-1(-2)jk-1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( UV ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UV )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					UV[idx] = (T) 0.25 * (T)0.25 *
+					(C1 * (U[idx] + U[idx - grid.nz] + U[idx - 1] + U[idx - grid.nz - 1]) -
+					_3C2 * (U[idx + grid.nz] + U[idx - 2 * grid.nz] + U[idx + grid.nz - 1] + U[idx - 2 * grid.nz - 1])) *
+					(C1 * (V[idx] + V[idx - grid.nyz] + V[idx - 1] + V[idx - grid.nyz - 1]) -
+					_3C2 * (V[idx + grid.nyz] + V[idx - 2 * grid.nyz] + V[idx + grid.nyz - 1] + V[idx - 2 * grid.nyz - 1]));
+			}
+		}
+		return;
+	}
+}
+
+template< typename T >	// = U * W [node: C, UW, UVW]
+void nse::uw_product_x4(T* _RESTRICT UW,
+	const T* _RESTRICT const U, const T* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	// Computes UW = U * W evaluated at 'node'. 4th-order (C1, _3C2)
+	// stencils are used for the in-plane interpolations; the z-direction
+	// averaging (W[k] + W[k+1] or U[k] + U[k-1]) remains a plain midpoint.
+	// An unsupported node leaves UW unmodified.
+	const T C1 = (T) 9.0 / (T) 8.0,
+		_3C2 = (T) 1.0 / (T) 8.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeC) {	// 0.25 * (U[i(-1)jk] + U[i+1(+2)jk]) * (W[ijk] + W[ijk+1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( UW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					UW[idx] = (T) 0.25 *
+					(C1 * (U[idx] + U[idx + grid.nyz]) -
+					_3C2 * (U[idx + 2 * grid.nyz] + U[idx - grid.nyz])) *
+					(W[idx] + W[idx + 1]);
+			}
+		}
+		return;
+	}
+
+	if (node == nodeUW) {	// 0.25 * (U[ijk] + U[ijk-1]) * (W[i(+1)jk] + W[i-1(-2)jk])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( UW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					UW[idx] = (T) 0.25 *
+					(U[idx] + U[idx - 1]) *
+					(C1 * (W[idx] + W[idx - grid.nyz]) -
+					_3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz]));
+			}
+		}
+		return;
+	}
+
+	if (node == nodeUVW) {	// 0.25 * (U[ij(+1)k] + U[ij(+1)k-1] + U[ij-1(-2)k] + U[ij-1(-2)k-1]) * 
+							// 0.25 * [(W[ijk] + W[i-1jk] + W[ij-1k] + W[i-1j-1k])] - 4th order approximation
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( UW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				// W factor: tensor-product of the x- and y-interpolation
+				// stencils, expanded into C1*C1, C1*_3C2 and _3C2*_3C2 terms
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					UW[idx] = (T)0.25 * (T)0.25 *
+					(C1 * (U[idx] + U[idx - grid.nz] + U[idx - 1] + U[idx - grid.nz - 1]) -
+					_3C2 * (U[idx + grid.nz] + U[idx - 2 * grid.nz] + U[idx + grid.nz - 1] + U[idx - 2 * grid.nz - 1])) *
+					(
+					C1 * C1 * 
+					(W[idx] + W[idx - grid.nyz] + W[idx - grid.nz] + W[idx - grid.nyz - grid.nz]) -
+					C1 * _3C2 * (
+					W[idx + grid.nyz] + W[idx - 2 * grid.nyz] + W[idx + grid.nyz - grid.nz] + W[idx - 2 * grid.nyz - grid.nz] +
+					W[idx + grid.nz] + W[idx - grid.nyz + grid.nz] + W[idx - 2 * grid.nz] + W[idx - grid.nyz - 2 * grid.nz]
+					) +
+					_3C2 * _3C2 * (
+					W[idx + grid.nyz + grid.nz] + W[idx - 2 * grid.nyz + grid.nz] + 
+					W[idx + grid.nyz - 2 * grid.nz] + W[idx - 2 * grid.nyz - 2 * grid.nz])
+					);
+			}
+		}
+		return;
+	}
+}
+
+template< typename T >	// = V * W [node: C, VW, UVW]
+void nse::vw_product_x4(T* _RESTRICT VW,
+	const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	// Computes VW = V * W evaluated at 'node' (y-direction analogue of
+	// uw_product_x4): 4th-order (C1, _3C2) in-plane interpolation, plain
+	// midpoint averaging in z. An unsupported node leaves VW unmodified.
+	const T C1 = (T) 9.0 / (T) 8.0,
+		_3C2 = (T) 1.0 / (T) 8.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeC) {	// 0.25 * (V[i(-1)jk] + V[ij+1(+2)k]) * (W[ijk] + W[ijk+1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( VW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					VW[idx] = (T) 0.25 *
+					(C1 * (V[idx] + V[idx + grid.nz]) -
+					_3C2 * (V[idx + 2 * grid.nz] + V[idx - grid.nz])) *
+					(W[idx] + W[idx + 1]);
+			}
+		}
+		return;
+	}
+
+	if (node == nodeVW) {	// 0.25 * (V[ijk] + V[ijk-1]) * (W[ij(+1)k] + W[ij-1(-2)k])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( VW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					VW[idx] = (T) 0.25 *
+					(V[idx] + V[idx - 1]) *
+					(C1 * (W[idx] + W[idx - grid.nz]) -
+					_3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz]));
+			}
+		}
+		return;
+	}
+
+	if (node == nodeUVW) {	// 0.25 * (V[i(+1)jk] + V[i(+1)jk-1] + V[i-1(-2)jk] + V[i-1(-2)jk-1]) * 
+							// 0.25 * [(W[ijk] + W[ij-1k] + W[i-1jk] + W[i-1j-1k])] - 4th order approximation
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( VW ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VW )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				// W factor: tensor-product of the x- and y-interpolation
+				// stencils, expanded into C1*C1, C1*_3C2 and _3C2*_3C2 terms
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					VW[idx] = (T)0.25 * (T)0.25 *
+					(C1 * (V[idx] + V[idx - 1] + V[idx - grid.nyz] + V[idx - grid.nyz - 1]) -
+					_3C2 * (V[idx + grid.nyz] + V[idx + grid.nyz - 1] + V[idx - 2 * grid.nyz] + V[idx - 2 * grid.nyz - 1])) *
+					(
+					C1 * C1 *
+					(W[idx] + W[idx - grid.nyz] + W[idx - grid.nz] + W[idx - grid.nyz - grid.nz]) -
+					C1 * _3C2 * (
+					W[idx + grid.nyz] + W[idx - 2 * grid.nyz] + W[idx + grid.nyz - grid.nz] + W[idx - 2 * grid.nyz - grid.nz] +
+					W[idx + grid.nz] + W[idx - grid.nyz + grid.nz] + W[idx - 2 * grid.nz] + W[idx - grid.nyz - 2 * grid.nz]
+					) +
+					_3C2 * _3C2 * (
+					W[idx + grid.nyz + grid.nz] + W[idx - 2 * grid.nyz + grid.nz] +
+					W[idx + grid.nyz - 2 * grid.nz] + W[idx - 2 * grid.nyz - 2 * grid.nz])
+					);
+			}
+		}
+		return;
+	}
+}
+
+
+template< typename T > // = C * U [node: C, U, UW]
+void nse::cu_product_x4(T* _RESTRICT CU,
+	const T* _RESTRICT const C, const T* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	// Computes CU = C * U evaluated at 'node'; the staggered factor is
+	// moved to the target node with a 4th-order (C1, _3C2) x-stencil.
+	// An unsupported node leaves CU unmodified.
+	const T C1 = (T) 9.0 / (T) 8.0,
+		_3C2 = (T) 1.0 / (T) 8.0;
+
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeC) {	// 0.5 * C[ijk] * (U[i(-1)jk] + U[i+1(+2)jk])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CU ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CU )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CU[idx] = (T) 0.5 * C[idx] *
+					(C1 * (U[idx] + U[idx + grid.nyz])
+					- _3C2 * (U[idx + 2 * grid.nyz] + U[idx - grid.nyz]));
+			}
+		}
+		return;
+	}
+
+	if (node == nodeU) {	// 0.5 * U[ijk] * (C[i(+1)jk] + C[i-1(-2)jk])
+		// NOTE(review): inclusive upper bound (i <= nx - gcx), unlike the
+		// other branches -- presumably because U is face-centered in x and
+		// has one extra plane; confirm against the Grid3d layout.
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CU ) collapse( 2 )
+		for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CU )
+		for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CU[idx] = (T) 0.5 * U[idx] *
+					(C1 * (C[idx] + C[idx - grid.nyz])
+					- _3C2 * (C[idx + grid.nyz] + C[idx - 2 * grid.nyz]));
+			}
+		}
+		return;
+	}
+
+	if (node == nodeUW) {	// 0.125 * (U[ijk] + U[ijk-1]) * (C[i(+1)jk] + C[i-1(-2)jk] + C[i(+1)jk-1] + C[i-1(-2)jk-1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CU ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CU )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CU[idx] = (T) 0.125 * (U[idx] + U[idx - 1]) *
+					(C1 * (C[idx] + C[idx - grid.nyz] + C[idx - 1] + C[idx - grid.nyz - 1]) -
+					_3C2 * (C[idx + grid.nyz] + C[idx - 2 * grid.nyz] + C[idx + grid.nyz - 1] + C[idx - 2 * grid.nyz - 1]));
+			}
+		}
+		return;
+	}
+}
+
+template< typename T > // = C * V [node: C, V, VW]
+void nse::cv_product_x4(T* _RESTRICT CV,
+	const T* _RESTRICT const C, const T* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	// Computes CV = C * V evaluated at 'node' (y-direction analogue of
+	// cu_product_x4); 4th-order (C1, _3C2) y-stencil for the staggered
+	// factor. An unsupported node leaves CV unmodified.
+	const T C1 = (T) 9.0 / (T) 8.0,
+		_3C2 = (T) 1.0 / (T) 8.0;
+
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeC) {	// 0.5 * C[ijk] * (V[ij(-1)k] + V[ij+1(+2)k])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CV ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CV )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CV[idx] = (T) 0.5 * C[idx] * (
+					C1 * (V[idx] + V[idx + grid.nz])
+					- _3C2 * (V[idx + 2 * grid.nz] + V[idx - grid.nz]));
+			}
+		}
+		return;
+	}
+
+	if (node == nodeV) {	// 0.5 * V[ijk] * (C[ij(+1)k] + C[ij-1(-2)k])
+		// NOTE(review): inclusive upper bound (j <= ny - gcy), unlike the
+		// other branches -- presumably because V is face-centered in y and
+		// has one extra plane; confirm against the Grid3d layout.
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CV ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j <= grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CV )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j <= grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CV[idx] = (T) 0.5 * V[idx] * (
+					C1 * (C[idx] + C[idx - grid.nz])
+					- _3C2 * (C[idx + grid.nz] + C[idx - 2 * grid.nz]));
+			}
+		}
+		return;
+	}
+
+	if (node == nodeVW) {	// 0.125 * (V[ijk] + V[ijk-1]) * (C[ij(+1)k] + C[ij-1(-2)k] + C[ij(+1)k-1] + C[ij-1(-2)k-1])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CV ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CV )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					CV[idx] = (T) 0.125 * (V[idx] + V[idx - 1]) *
+					(C1 * (C[idx] + C[idx - grid.nz] + C[idx - 1] + C[idx - grid.nz - 1]) -
+					_3C2 * (C[idx + grid.nz] + C[idx - 2 * grid.nz] + C[idx + grid.nz - 1] + C[idx - 2 * grid.nz - 1]));
+			}
+		}
+		return;
+	}
+}
+
// Interpolated product CW = C * W at the requested node. Horizontal (x/y)
// interpolation is 4th-order (weights 9/8 and 1/8); vertical (z) interpolation
// is a simple two-point average (stride 1).
// Layout: idx = i*grid.nyz + j*grid.nz + k (x-stride grid.nyz, y-stride grid.nz).
// Only interior cells are written; unsupported nodes are a no-op.
template< typename T > // = C * W [node: C, W, UW, VW]
void nse::cw_product_x4(T* _RESTRICT CW,
	const T* _RESTRICT const C, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// 4th-order interpolation weights: 9/8 and 1/8
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// per-row base index, advanced by the y-stride each j-step
#endif

	if (node == nodeC) {	// 0.5 * C[ijk] * (W[ijk] + W[ijk+1])
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					CW[idx] = (T) 0.5 * C[idx] * (W[idx] + W[idx + 1]);
			}
		}
		return;
	}

	if (node == nodeW) {	// 0.5 * W[ijk] * (C[ijk] + C[ijk-1])
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				// note '<=': includes the extra z-face where W is defined
				for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++)
					CW[idx] = (T) 0.5 * W[idx] * (C[idx] + C[idx - 1]);
			}
		}
		return;
	}

	if (node == nodeUW) {	// 0.125 * (W[i(+1)jk] + W[i-1(-2)jk]) * (C[i-1(-2)jk] + C[i-1(-2)jk-1] + C[i(+1)jk] + C[i(+1)jk-1])
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					CW[idx] = (T) 0.125 *
					(C1 * (W[idx] + W[idx - grid.nyz]) - _3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz])) *
					(C1 * (C[idx - grid.nyz] + C[idx - grid.nyz - 1] + C[idx] + C[idx - 1]) -
					_3C2 * (C[idx - 2 * grid.nyz] + C[idx - 2 * grid.nyz - 1] + C[idx + grid.nyz] + C[idx + grid.nyz - 1]));
			}
		}
		return;
	}

	if (node == nodeVW) {	// 0.125 * (W[ij(+1)k] + W[ij-1(-2)k]) * (C[ij-1(-2)k] + C[ij-1(-2)k-1] + C[ij(+1)k] + C[ij(+1)k-1])
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					CW[idx] = (T) 0.125 *
					(C1 * (W[idx] + W[idx - grid.nz]) - _3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz])) *
					(C1 * (C[idx - grid.nz] + C[idx - grid.nz - 1] + C[idx] + C[idx - 1]) - 
					_3C2 * (C[idx - 2 * grid.nz] + C[idx - 2 * grid.nz - 1] + C[idx + grid.nz] + C[idx + grid.nz - 1]));
			}
		}
		return;
	}
}
+// ------------------------------------------------------------------------ //
+
+// * square(field) - field products * //
+// ------------------------------------------------------------------------ //
+template< typename T >	// = U * U * W [node: UW]
+void nse::u2w_product_x4(T* _RESTRICT U2W,
+	const T* _RESTRICT const U, const T* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		_3C2 = (T) 1.0 / (T) 8.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeUW) {	// 0.5 * U[ijk-1] * U[ijk] * (W[i(+1)jk] + W[i-1(-2)jk])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( U2W ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( U2W )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					U2W[idx] = (T) 0.5 * U[idx - 1] * U[idx] *
+					(C1 * (W[idx] + W[idx - grid.nyz]) -
+					_3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz]));
+			}
+		}
+		return;
+	}
+}
+
+template< typename T >	// = V * V * W [node: VW]
+void nse::v2w_product_x4(T* _RESTRICT V2W,
+	const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		_3C2 = (T) 1.0 / (T) 8.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	if (node == nodeVW) {	// 0.5 * V[ijk-1] * V[ijk] * (W[ij(+1)k] + W[ij-1(-2)k])
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( V2W ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( V2W )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+					V2W[idx] = (T) 0.5 * V[idx - 1] * V[idx] *
+					(C1 * (W[idx] + W[idx - grid.nz]) -
+					_3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz]));
+			}
+		}
+		return;
+	}
+}
+
// Triple product UWW = U * W * W evaluated at the U node as the average of
// two interpolation orderings (see the ASCII diagrams below): first W*W is
// interpolated (squared sums in z, then 4th-order in x) and multiplied by a
// z-averaged U; then W*U is formed and multiplied by the x/z-interpolated W.
// FP note: the exact association of terms is part of the scheme — do not
// re-group. Layout: idx = i*grid.nyz + j*grid.nz + k. No-op for node != nodeU.
template< typename T >	// = U * W * W [node: U]
void nse::uww_product_x4(T* _RESTRICT UWW,
	const T* _RESTRICT const U, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// 4th-order interpolation weights: 9/8 and 1/8
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// per-row base index, advanced by the y-stride each j-step
#endif


	if (node == nodeU) {
		//         __________3x  ___1z
		//         ___1z  ___1z  ___1z
		// 1/2 * [  W   *  W   *  U    ] +
		//
		//         __________1z  ___1z
		//         ___3x  ___1z  ___3x
		// 1/2 * [  W   *  U   *  W    ]

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UWW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UWW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					// first half: 4th-order x-average of the squared
					// z-averages of W, times the z-averaged U
					UWW[idx] =
						(T)0.25 * (
						
						(T)0.25 * C1 * (
						(W[idx] + W[idx + 1]) * (W[idx] + W[idx + 1]) +
						(W[idx - grid.nyz] + W[idx - grid.nyz + 1]) * (W[idx - grid.nyz] + W[idx - grid.nyz + 1])) 
						-
						(T)0.25 * _3C2 * (
						(W[idx + grid.nyz] + W[idx + grid.nyz + 1]) * (W[idx + grid.nyz] + W[idx + grid.nyz + 1]) +
						(W[idx - 2 * grid.nyz] + W[idx - 2 * grid.nyz + 1]) * (W[idx - 2 * grid.nyz] + W[idx - 2 * grid.nyz + 1]))
						
						) * (T)0.25 * (U[idx + 1] + (T)2.0 * U[idx] + U[idx - 1])

						+

						// second half: z-average of (x-interpolated W times
						// x-averaged U), times the x/z-interpolated W
						(T)0.25 * (
						(T)0.25 * 
						(C1 * (W[idx + 1] + W[idx - grid.nyz + 1]) - _3C2 * (W[idx + grid.nyz + 1] + W[idx - 2 * grid.nyz + 1])) *
						(U[idx] + U[idx + 1]) 
						+
						(T)0.25 * 
						(C1 * (W[idx] + W[idx - grid.nyz]) - _3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz])) *
						(U[idx] + U[idx - 1])
						) * (T)0.25 * 
						(C1 * (W[idx] + W[idx + 1] + W[idx - grid.nyz] + W[idx - grid.nyz + 1]) -
						_3C2 * (W[idx + grid.nyz] + W[idx + grid.nyz + 1] + W[idx - 2 * grid.nyz] + W[idx - 2 * grid.nyz + 1]));
				}
			}
		}
		return;
	}
}
+
// Triple product VWW = V * W * W evaluated at the V node; mirror of
// uww_product_x4 with the 4th-order interpolation taken in y (stride grid.nz)
// instead of x (stride grid.nyz). Two interpolation orderings are averaged
// (see the diagrams). FP note: keep the exact term grouping — it is part of
// the scheme. No-op for node != nodeV.
template< typename T >	// = V * W * W [node: V]
void nse::vww_product_x4(T* _RESTRICT VWW,
	const T* _RESTRICT const V, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// 4th-order interpolation weights: 9/8 and 1/8
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// per-row base index, advanced by the y-stride each j-step
#endif

	if (node == nodeV) {
		//         __________3y  ___1z
		//         ___1z  ___1z  ___1z
		// 1/2 * [  W   *  W   *  V    ] +
		//
		//         __________1z  ___1z
		//         ___3y  ___1z  ___3y
		// 1/2 * [  W   *  V   *  W    ]

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VWW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VWW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					// first half: 4th-order y-average of the squared
					// z-averages of W, times the z-averaged V
					VWW[idx] =
						(T)0.25 * (

						(T)0.25 * C1 * (
						(W[idx] + W[idx + 1]) * (W[idx] + W[idx + 1]) +
						(W[idx - grid.nz] + W[idx - grid.nz + 1]) * (W[idx - grid.nz] + W[idx - grid.nz + 1])) 
						-
						(T)0.25 * _3C2 * (
						(W[idx + grid.nz] + W[idx + grid.nz + 1]) * (W[idx + grid.nz] + W[idx + grid.nz + 1]) +
						(W[idx - 2 * grid.nz] + W[idx - 2 * grid.nz + 1]) * (W[idx - 2 * grid.nz] + W[idx - 2 * grid.nz + 1]))
						
						) * (T)0.25 * (V[idx + 1] + (T)2.0 * V[idx] + V[idx - 1])

						+

						// second half: z-average of (y-interpolated W times
						// y-averaged V), times the y/z-interpolated W
						(T)0.25 * (
						(T)0.25 * 
						(C1 * (W[idx + 1] + W[idx - grid.nz + 1]) - _3C2 * (W[idx + grid.nz + 1] + W[idx - 2 * grid.nz + 1])) *
						(V[idx] + V[idx + 1]) 
						+
						(T)0.25 * 
						(C1 * (W[idx] + W[idx - grid.nz]) - _3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz])) *
						(V[idx] + V[idx - 1])
						) * (T)0.25 * 
						(C1 * (W[idx] + W[idx + 1] + W[idx - grid.nz] + W[idx - grid.nz + 1]) -
						_3C2 * (W[idx + grid.nz] + W[idx + grid.nz + 1] + W[idx - 2 * grid.nz] + W[idx - 2 * grid.nz + 1]));
				}
			}
		}
		return;
	}
}
+
// Triple product UVW = U * V * W evaluated at the UVW (corner) node as the
// average of two interpolation orderings (see the diagrams): (V~ * W~) * U~
// plus (U~ * W~) * V~, each with nested 4th-order interpolations in x
// (stride grid.nyz) and y (stride grid.nz) and two-point averages in z.
// FP note: the exact nesting/grouping of the terms is part of the scheme —
// do not re-associate. No-op for node != nodeUVW.
template< typename T >	// = U * V * W [node: UVW]
void nse::uvw_product_x4(T* _RESTRICT UVW,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// 4th-order interpolation weights: 9/8 and 1/8
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// per-row base index, advanced by the y-stride each j-step
#endif

	if (node == nodeUVW) {
		//         __________3x  ___1z
		//         ___1z  ___3y  ___3y
		// 1/2 * [  V   *  W   *  U    ] +
		//
		//         __________3y  ___1z
		//         ___1z  ___3x  ___3x
		// 1/2 * [  U   *  W   *  V    ]

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UVW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UVW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
					// first half: x-interpolation of (y-interpolated W times
					// z-averaged V), times the x/y-interpolated U
					UVW[idx] =
						(T)0.5 * (
						(T)0.125 * (
						C1 * (
						(C1 * (W[idx] + W[idx - grid.nz]) - _3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz])) * 
						(V[idx] + V[idx - 1]) +
						(C1 * (W[idx - grid.nyz] + W[idx - grid.nyz - grid.nz]) - _3C2 * (W[idx - grid.nyz + grid.nz] + W[idx - grid.nyz - 2 * grid.nz])) * 
						(V[idx - grid.nyz] + V[idx - grid.nyz - 1])
						) - 
						_3C2 *
						(
						(C1 * (W[idx + grid.nyz] + W[idx + grid.nyz - grid.nz]) - _3C2 * (W[idx + grid.nyz + grid.nz] + W[idx + grid.nyz - 2 * grid.nz])) * 
						(V[idx + grid.nyz] + V[idx + grid.nyz - 1]) +
						(C1 * (W[idx - 2 * grid.nyz] + W[idx - 2 * grid.nyz - grid.nz]) - _3C2 * (W[idx - 2 * grid.nyz + grid.nz] + W[idx - 2 * grid.nyz - 2 * grid.nz])) * 
						(V[idx - 2 * grid.nyz] + V[idx - 2 * grid.nyz - 1])
						)
						) *
						(T)0.25 * (
						C1 * (U[idx] + U[idx - grid.nz] + U[idx - 1] + U[idx - grid.nz - 1]) - 
						_3C2 * (U[idx + grid.nz] + U[idx - 2 * grid.nz] + U[idx + grid.nz - 1] + U[idx - 2 * grid.nz - 1]))
						)

						+

						// second half: y-interpolation of (x-interpolated W times
						// z-averaged U), times the x/y-interpolated V
						(T)0.5 * (
						(T)0.125 * (
						C1 * (
						(C1 * (W[idx] + W[idx - grid.nyz]) - _3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz])) * 
						(U[idx] + U[idx - 1]) +
						(C1 * (W[idx - grid.nz] + W[idx - grid.nyz - grid.nz]) - _3C2 * (W[idx + grid.nyz - grid.nz] + W[idx - 2 * grid.nyz - grid.nz])) * 
						(U[idx - grid.nz] + U[idx - grid.nz - 1])
						) -
						_3C2 * (
						(C1 * (W[idx + grid.nz] + W[idx - grid.nyz + grid.nz]) - _3C2 * (W[idx + grid.nyz + grid.nz] + W[idx - 2 * grid.nyz + grid.nz])) *
						(U[idx + grid.nz] + U[idx + grid.nz - 1]) +
						(C1 * (W[idx - 2 * grid.nz] + W[idx - grid.nyz - 2 * grid.nz]) - _3C2 * (W[idx + grid.nyz - 2 * grid.nz] + W[idx - 2 * grid.nyz - 2 * grid.nz])) *
						(U[idx - 2 * grid.nz] + U[idx - 2 * grid.nz - 1])
						)
						) *
						(T)0.25 * (
						C1 * (V[idx] + V[idx - grid.nyz] + V[idx - 1] + V[idx - grid.nyz - 1]) - 
						_3C2 * (V[idx + grid.nyz] + V[idx - 2 * grid.nyz] + V[idx + grid.nyz - 1] + V[idx - 2 * grid.nyz - 1]))
						);
				}
			}
		}
		return;
	}
}
+
+
// Triple product CUW = C * U * W evaluated at the UW node. The z-averaged U
// multiplies the average of two interpolation orderings of W*C (see diagram):
// x-interpolated W times x/z-interpolated C, plus the x-interpolation of the
// pointwise products W * (z-averaged C). Strides: grid.nyz in x, 1 in z.
// No-op for node != nodeUW.
template< typename T >	// = C * U * W [node: UW]
void nse::cuw_product_x4(T* _RESTRICT CUW,
	const T* _RESTRICT const C,
	const T* _RESTRICT const U, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// 4th-order interpolation weights: 9/8 and 1/8
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// per-row base index, advanced by the y-stride each j-step
#endif

	if (node == nodeUW) {
		//                       ___1z    _______3x
		//       ___1z    ___3x  ___3x        ___1z
		// 1/2 *  U   * [  W   *  C    +  W *  C    ]
		//

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CUW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CUW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					CUW[idx] = (T)0.25 * (U[idx] + U[idx - 1]) * (

					// ordering 1: x-interpolated W times x/z-interpolated C
					(T)0.125 * (C1 * (W[idx] + W[idx - grid.nyz]) - _3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz])) *
					(C1 * (C[idx] + C[idx - 1] + C[idx - grid.nyz] + C[idx - grid.nyz - 1]) -
					_3C2 * (C[idx + grid.nyz] + C[idx + grid.nyz - 1] + C[idx - 2 * grid.nyz] + C[idx - 2 * grid.nyz - 1])) +

					// ordering 2: x-interpolation of W * (z-averaged C)
					(T)0.25 * (
					C1 * (
					W[idx] * (C[idx] + C[idx - 1]) + 
					W[idx - grid.nyz] * (C[idx - grid.nyz] + C[idx - grid.nyz - 1])) 
					-
					_3C2 * (
					W[idx + grid.nyz] * (C[idx + grid.nyz] + C[idx + grid.nyz - 1]) + 
					W[idx - 2 * grid.nyz] * (C[idx - 2 * grid.nyz] + C[idx - 2 * grid.nyz - 1]))
					)
					
					);
			}
		}
		return;
	}
}
+
// Triple product CVW = C * V * W evaluated at the VW node; mirror of
// cuw_product_x4 with the 4th-order interpolation taken in y (stride grid.nz)
// instead of x. The z-averaged V multiplies the average of two interpolation
// orderings of W*C (see diagram). No-op for node != nodeVW.
template< typename T >	// = C * V * W [node: VW]
void nse::cvw_product_x4(T* _RESTRICT CVW,
	const T* _RESTRICT const C,
	const T* _RESTRICT const V, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// 4th-order interpolation weights: 9/8 and 1/8
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// per-row base index, advanced by the y-stride each j-step
#endif

	if (node == nodeVW) {
		//                       ___1z    _______3y
		//       ___1z    ___3y  ___3y        ___1z
		// 1/2 *  V   * [  W   *  C    +  W *  C    ]
		//

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CVW ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CVW )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
					CVW[idx] = (T)0.25 * (V[idx] + V[idx - 1]) * (

					// ordering 1: y-interpolated W times y/z-interpolated C
					(T)0.125 * (C1 * (W[idx] + W[idx - grid.nz]) - _3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz])) *
					(C1 * (C[idx] + C[idx - 1] + C[idx - grid.nz] + C[idx - grid.nz - 1]) - 
					_3C2 * (C[idx + grid.nz] + C[idx + grid.nz - 1] + C[idx - 2 * grid.nz] + C[idx - 2 * grid.nz - 1])) +

					// ordering 2: y-interpolation of W * (z-averaged C)
					(T)0.25 * (
					C1 * (
					W[idx] * (C[idx] + C[idx - 1]) + 
					W[idx - grid.nz] * (C[idx - grid.nz] + C[idx - grid.nz - 1])) 
					- 
					_3C2 * (
					W[idx + grid.nz] * (C[idx + grid.nz] + C[idx + grid.nz - 1]) + 
					W[idx - 2 * grid.nz] * (C[idx - 2 * grid.nz] + C[idx - 2 * grid.nz - 1]))
					)

					);
			}
		}
		return;
	}
}
+// ------------------------------------------------------------------------ //
+
+// * product partitions * //
+// ------------------------------------------------------------------------ //
// Partitioned product U*W: writes two output fields holding the product
// interpolated to the half-level below (UW_Bottom) and above (UW_Top) the
// requested node, using 4th-order x-interpolation (stride grid.nyz) and
// two-point averages in z. Supported nodes: nodeU, nodeUV, nodeUW; any other
// node is a no-op. Layout: idx = i*grid.nyz + j*grid.nz + k.
template< typename T >	// = U * W [node: between UW and U]
void nse::uw_product_partition_x4(T* _RESTRICT UW_Bottom, T* _RESTRICT UW_Top,
	const T* _RESTRICT const U, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// 4th-order interpolation weights: 9/8 and 1/8
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// per-row base index, advanced by the y-stride each j-step
#endif

	if (node == nodeU) {
		// 1/2 * U[ijk] * (W[i(+1)jk] + W[i-1(-2)jk])
		// 1/2 * U[ijk] * (W[i(+1)jk+1] + W[i-1(-2)jk+1])

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UW_Bottom, UW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UW_Bottom, UW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					UW_Bottom[idx] = (T) 0.5 * U[idx] *
						(C1 * (W[idx - grid.nyz] + W[idx]) -
						_3C2 * (W[idx - 2 * grid.nyz] + W[idx + grid.nyz]));
					UW_Top[idx] = (T) 0.5 * U[idx] *
						(C1 * (W[idx - grid.nyz + 1] + W[idx + 1]) -
						_3C2 * (W[idx - 2 * grid.nyz + 1] + W[idx + grid.nyz + 1]));
				}
			}
		}
		return;
	}

	if (node == nodeUV) {
		// 1/8 * (U[ij(+1)k] + U[ij-1(-2)k]) * 
		//		[(W[ijk] + W[i-1jk] + W[ij-1k] + W[i-1j-1k])] - 4th order approximation
		// 1/8 * (U[ij(+1)k] + U[ij-1(-2)k]) * 
		//		[(W[ijk+1] + W[i-1jk+1] + W[ij-1k+1] + W[i-1j-1k+1])] - 4th order approximation

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UW_Bottom, UW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UW_Bottom, UW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					// W interpolated 4th-order in both x and y: the weight of
					// each stencil point is the product of the 1D weights
					// (C1*C1, C1*_3C2, _3C2*_3C2)
					UW_Bottom[idx] = (T)0.125 * (C1 * (U[idx] + U[idx - grid.nz]) - _3C2 * (U[idx + grid.nz] + U[idx - 2 * grid.nz])) *
						(
						C1 * C1 *
						(W[idx] + W[idx - grid.nyz] + W[idx - grid.nz] + W[idx - grid.nyz - grid.nz]) -
						C1 * _3C2 * (
						W[idx + grid.nyz] + W[idx - 2 * grid.nyz] + W[idx + grid.nyz - grid.nz] + W[idx - 2 * grid.nyz - grid.nz] +
						W[idx + grid.nz] + W[idx - grid.nyz + grid.nz] + W[idx - 2 * grid.nz] + W[idx - grid.nyz - 2 * grid.nz]
						) +
						_3C2 * _3C2 * (
						W[idx + grid.nyz + grid.nz] + W[idx - 2 * grid.nyz + grid.nz] +
						W[idx + grid.nyz - 2 * grid.nz] + W[idx - 2 * grid.nyz - 2 * grid.nz])
						);

					// same stencil shifted one cell up in z (+1)
					UW_Top[idx] = (T)0.125 * (C1 * (U[idx] + U[idx - grid.nz]) - _3C2 * (U[idx + grid.nz] + U[idx - 2 * grid.nz])) *
						(
						C1 * C1 *
						(W[idx + 1] + W[idx - grid.nyz + 1] + W[idx - grid.nz + 1] + W[idx - grid.nyz - grid.nz + 1]) -
						C1 * _3C2 * (
						W[idx + grid.nyz + 1] + W[idx - 2 * grid.nyz + 1] + W[idx + grid.nyz - grid.nz + 1] + W[idx - 2 * grid.nyz - grid.nz + 1] +
						W[idx + grid.nz + 1] + W[idx - grid.nyz + grid.nz + 1] + W[idx - 2 * grid.nz + 1] + W[idx - grid.nyz - 2 * grid.nz + 1]
						) +
						_3C2 * _3C2 * (
						W[idx + grid.nyz + grid.nz + 1] + W[idx - 2 * grid.nyz + grid.nz + 1] +
						W[idx + grid.nyz - 2 * grid.nz + 1] + W[idx - 2 * grid.nyz - 2 * grid.nz + 1])
						);
				}
			}
		}
		return;
	}

	if (node == nodeUW) {
		// 1/8 * (U[ijk] + U[ijk-1]) * 
		//			[(W[ijk] + W[i-1jk] + W[ijk-1] + W[i-1jk-1])] - 4th order approximation
		// 1/8 * (U[ijk] + U[ijk-1]) * 
		//			[(W[ijk] + W[i-1jk] + W[ijk+1] + W[i-1jk+1])] - 4th order approximation
		//
		// U*W-[bottom] at (gcz + 1, nz - gcz) nodes
		// U*W-[top] at (gcz, nz - gcz - 1) nodes
		//

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UW_Bottom, UW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UW_Bottom, UW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				// note '<=': the output is face-located in z (see node
				// ranges in the comment above), so one extra level is written
				for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

					UW_Bottom[idx] = (T)0.125 * (U[idx] + U[idx - 1]) *
						(C1 * (W[idx] + W[idx - grid.nyz] + W[idx - 1] + W[idx - grid.nyz - 1]) - 
						_3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz] + W[idx + grid.nyz - 1] + W[idx - 2 * grid.nyz - 1]));
					UW_Top[idx] = (T)0.125 * (U[idx] + U[idx - 1]) *
						(C1 * (W[idx] + W[idx - grid.nyz] + W[idx + 1] + W[idx - grid.nyz + 1]) - 
						_3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz] + W[idx + grid.nyz + 1] + W[idx - 2 * grid.nyz + 1]));
				}
			}
		}
		return;
	}
}
+
// Partitioned product V*W: mirror of uw_product_partition_x4 with the
// 4th-order interpolation in y (stride grid.nz) for the nodeV/nodeVW
// branches. Writes the product at the half-level below (VW_Bottom) and above
// (VW_Top) the requested node. Supported nodes: nodeV, nodeUV, nodeVW; any
// other node is a no-op. Layout: idx = i*grid.nyz + j*grid.nz + k.
template< typename T >	// = V * W [node: between VW and V]
void nse::vw_product_partition_x4(T* _RESTRICT VW_Bottom, T* _RESTRICT VW_Top,
	const T* _RESTRICT const V, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const Grid3d< T >& grid)
{
	// 4th-order interpolation weights: 9/8 and 1/8
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// per-row base index, advanced by the y-stride each j-step
#endif

	if (node == nodeV) {
		// 1/2 * V[ijk] * (W[ij(+1)k] + W[ij-1(-2)k])
		// 1/2 * V[ijk] * (W[ij(+1)k+1] + W[ij-1(-2)k+1])

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VW_Bottom, VW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VW_Bottom, VW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					VW_Bottom[idx] = (T) 0.5 * V[idx] * 
						(C1 * (W[idx - grid.nz] + W[idx]) - _3C2 * (W[idx - 2 * grid.nz] + W[idx + grid.nz]));
					VW_Top[idx] = (T) 0.5 * V[idx] * 
						(C1 * (W[idx - grid.nz + 1] + W[idx + 1]) - _3C2 * (W[idx - 2 * grid.nz + 1] + W[idx + grid.nz + 1]));
				}
			}
		}
		return;
	}

	if (node == nodeUV) {
		// 1/8 * (V[i(+1)jk] + V[i-1(-2)jk]) * 
		//		[(W[ijk] + W[i-1jk] + W[ij-1k] + W[i-1j-1k])] - 4th order approximation
		// 1/8 * (V[i(+1)jk] + V[i-1(-2)jk]) * 
		//		[(W[ijk+1] + W[i-1jk+1] + W[ij-1k+1] + W[i-1j-1k+1])] - 4th order approximation

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VW_Bottom, VW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VW_Bottom, VW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					// W interpolated 4th-order in both x and y: the weight of
					// each stencil point is the product of the 1D weights
					// (C1*C1, C1*_3C2, _3C2*_3C2)
					VW_Bottom[idx] = (T)0.125 * (C1 * (V[idx] + V[idx - grid.nyz]) - _3C2 * (V[idx + grid.nyz] + V[idx - 2 * grid.nyz])) *
						(
						C1 * C1 *
						(W[idx] + W[idx - grid.nyz] + W[idx - grid.nz] + W[idx - grid.nyz - grid.nz]) -
						C1 * _3C2 * (
						W[idx + grid.nyz] + W[idx - 2 * grid.nyz] + W[idx + grid.nyz - grid.nz] + W[idx - 2 * grid.nyz - grid.nz] +
						W[idx + grid.nz] + W[idx - grid.nyz + grid.nz] + W[idx - 2 * grid.nz] + W[idx - grid.nyz - 2 * grid.nz]
						) +
						_3C2 * _3C2 * (
						W[idx + grid.nyz + grid.nz] + W[idx - 2 * grid.nyz + grid.nz] +
						W[idx + grid.nyz - 2 * grid.nz] + W[idx - 2 * grid.nyz - 2 * grid.nz])
						);

					// same stencil shifted one cell up in z (+1)
					VW_Top[idx] = (T)0.125 * (C1 * (V[idx] + V[idx - grid.nyz]) - _3C2 * (V[idx + grid.nyz] + V[idx - 2 * grid.nyz])) *
						(
						C1 * C1 *
						(W[idx + 1] + W[idx - grid.nyz + 1] + W[idx - grid.nz + 1] + W[idx - grid.nyz - grid.nz + 1]) -
						C1 * _3C2 * (
						W[idx + grid.nyz + 1] + W[idx - 2 * grid.nyz + 1] + W[idx + grid.nyz - grid.nz + 1] + W[idx - 2 * grid.nyz - grid.nz + 1] +
						W[idx + grid.nz + 1] + W[idx - grid.nyz + grid.nz + 1] + W[idx - 2 * grid.nz + 1] + W[idx - grid.nyz - 2 * grid.nz + 1]
						) +
						_3C2 * _3C2 * (
						W[idx + grid.nyz + grid.nz + 1] + W[idx - 2 * grid.nyz + grid.nz + 1] +
						W[idx + grid.nyz - 2 * grid.nz + 1] + W[idx - 2 * grid.nyz - 2 * grid.nz + 1])
						);
				}
			}
		}
		return;
	}

	if (node == nodeVW) {
		// 1/8 * (V[ijk] + V[ijk-1]) * 
		//			[(W[ijk] + W[ij-1k] + W[ijk-1] + W[ij-1k-1])] - 4th order approximation
		// 1/8 * (V[ijk] + V[ijk-1]) * 
		//			[(W[ijk] + W[ij-1k] + W[ijk+1] + W[ij-1k+1])] - 4th order approximation
		//
		// V*W-[bottom] at (gcz + 1, nz - gcz) nodes
		// V*W-[top] at (gcz, nz - gcz - 1) nodes
		//

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VW_Bottom, VW_Top ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VW_Bottom, VW_Top )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				// note '<=': the output is face-located in z (see node
				// ranges in the comment above), so one extra level is written
				for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

					VW_Bottom[idx] = (T)0.125 * (V[idx] + V[idx - 1]) *
						(C1 * (W[idx] + W[idx - grid.nz] + W[idx - 1] + W[idx - grid.nz - 1]) - 
						_3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz] + W[idx + grid.nz - 1] + W[idx - 2 * grid.nz - 1]));
					VW_Top[idx] = (T)0.125 * (V[idx] + V[idx - 1]) *
						(C1 * (W[idx] + W[idx - grid.nz] + W[idx + 1] + W[idx - grid.nz + 1]) - 
						_3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz] + W[idx + grid.nz + 1] + W[idx - 2 * grid.nz + 1]));
				}
			}
		}
		return;
	}
}
+
+// Computes the two one-sided vertical "partitions" of the C*W product at the
+// requested node type: CW_Bottom[idx] pairs the interpolated C with W on the
+// lower (k-1) side, CW_Top[idx] with W on the upper (k+1) side.
+// Horizontal interpolation uses the 4th-order two-point weights C1 = 9/8 and
+// _3C2 = 1/8; the vertical pairing is 2nd-order. Only interior cells are
+// written (ghost layers gcx/gcy/gcz excluded); an unrecognized node type
+// leaves both output arrays untouched (function falls through and returns).
+template< typename T >	// = C * W [node: between W and C]
+void nse::cw_product_partition_x4(T* _RESTRICT CW_Bottom, T* _RESTRICT CW_Top,
+	const T* _RESTRICT const C, const T* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< T >& grid)
+{
+	// 4th-order interpolation stencil weights (9/8 inner pair, 1/8 outer pair)
+	const T C1 = (T) 9.0 / (T) 8.0,
+		_3C2 = (T) 1.0 / (T) 8.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;	// running start index of the (i,j) column in the 1D layout
+#endif
+
+	if (node == nodeC) {
+		// C[ijk] * W[ijk]
+		// C[ijk] * W[ijk+1]
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CW_Bottom, CW_Top ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW_Bottom, CW_Top )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					CW_Bottom[idx] = C[idx] * W[idx];
+					CW_Top[idx] = C[idx] * W[idx + 1];
+				}
+			}
+		}
+		return;
+	}
+
+	if (node == nodeU) {
+		// 1/4 * (C[i(+1)jk] + C[i-1(-2)jk]) * (W[i(+1)jk] + W[i-1(-2)jk])
+		// 1/4 * (C[i(+1)jk] + C[i-1(-2)jk]) * (W[i(+1)jk+1] + W[i-1(-2)jk+1])
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CW_Bottom, CW_Top ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW_Bottom, CW_Top )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					// x-stride is grid.nyz: U-node interpolation acts along -x
+					CW_Bottom[idx] = (T)0.25 * 
+						(C1 * (C[idx] + C[idx - grid.nyz]) - _3C2 * (C[idx + grid.nyz] + C[idx - 2 * grid.nyz])) *
+						(C1 * (W[idx] + W[idx - grid.nyz]) - _3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz]));
+					CW_Top[idx] = (T)0.25 * 
+						(C1 * (C[idx] + C[idx - grid.nyz]) - _3C2 * (C[idx + grid.nyz] + C[idx - 2 * grid.nyz])) *
+						(C1 * (W[idx + 1] + W[idx - grid.nyz + 1]) - _3C2 * (W[idx + grid.nyz + 1] + W[idx - 2 * grid.nyz + 1]));
+				}
+			}
+		}
+		return;
+	}
+
+	if (node == nodeV) {
+		// 1/4 * (C[ij(+1)k] + C[ij-1(-2)k]) * (W[ij(+1)k] + W[ij-1(-2)k])
+		// 1/4 * (C[ij(+1)k] + C[ij-1(-2)k]) * (W[ij(+1)k+1] + W[ij-1(-2)k+1])
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CW_Bottom, CW_Top ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW_Bottom, CW_Top )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					// y-stride is grid.nz: V-node interpolation acts along -y
+					CW_Bottom[idx] = (T)0.25 * 
+						(C1 * (C[idx] + C[idx - grid.nz]) - _3C2 * (C[idx + grid.nz] + C[idx - 2 * grid.nz])) *
+						(C1 * (W[idx] + W[idx - grid.nz]) - _3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz]));
+					CW_Top[idx] = (T)0.25 * 
+						(C1 * (C[idx] + C[idx - grid.nz]) - _3C2 * (C[idx + grid.nz] + C[idx - 2 * grid.nz])) *
+						(C1 * (W[idx + 1] + W[idx - grid.nz + 1]) - _3C2 * (W[idx + grid.nz + 1] + W[idx - 2 * grid.nz + 1]));
+				}
+			}
+		}
+		return;
+	}
+
+	if (node == nodeW) {
+		// 1/4 * (C[ijk] + C[ijk-1]) * (W[ijk] + W[ijk-1])
+		// 1/4 * (C[ijk] + C[ijk-1]) * (W[ijk] + W[ijk+1])
+		//
+		// C*W-[bottom] at (gcz + 1, nz - gcz) nodes
+		// C*W-[top] at (gcz, nz - gcz - 1) nodes
+		//
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CW_Bottom, CW_Top ) collapse( 2 )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+			{
+				idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW_Bottom, CW_Top )
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+#endif
+				// note '<=': W-node branch covers one extra vertical level,
+				// matching the node ranges documented above
+				for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {
+
+					CW_Bottom[idx] = (T)0.25 * (C[idx] + C[idx - 1]) * (W[idx] + W[idx - 1]);
+					CW_Top[idx] = (T)0.25 * (C[idx] + C[idx - 1]) * (W[idx] + W[idx + 1]);
+				}
+			}
+		}
+		return;
+	}
+}
+// ------------------------------------------------------------------------ //
+
+
+// ------------------------------------------------------------------------ //
+// ------------------------------------------------------------------------ //
+
+
+// Explicit template instantiations for T = float and T = double, so the
+// template definitions in this translation unit can be linked from other
+// modules without being visible in the header.
+// NOTE(review): some instantiations omit _RESTRICT on const input pointers
+// where the declarations carry it; presumably harmless since top-level
+// qualifiers on parameters do not participate in the function type — confirm
+// against the declarations in nse-fops3d-x4.h.
+//
+// * initialize: node-add(sub) routines * //
+template void nse::c_add_x4(float* _RESTRICT X, const float* const Q,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::c_add_x4(double* _RESTRICT X, const double* const Q,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::c_sub_x4(float* _RESTRICT X, const float* const Q,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::c_sub_x4(double* _RESTRICT X, const double* const Q,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * initialize: field products * //
+template void nse::w_square_x4(float* _RESTRICT W2, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::w_square_x4(double* _RESTRICT W2, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::uv_product_x4(float* _RESTRICT UV,
+	const float* _RESTRICT const U, const float* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::uv_product_x4(double* _RESTRICT UV,
+	const double* _RESTRICT const U, const double* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::uw_product_x4(float* _RESTRICT UW,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::uw_product_x4(double* _RESTRICT UW,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::vw_product_x4(float* _RESTRICT VW,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::vw_product_x4(double* _RESTRICT VW,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::cu_product_x4(float* _RESTRICT CU,
+	const float* _RESTRICT const C, const float* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cu_product_x4(double* _RESTRICT CU,
+	const double* _RESTRICT const C, const double* _RESTRICT const U,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::cv_product_x4(float* _RESTRICT CV,
+	const float* _RESTRICT const C, const float* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cv_product_x4(double* _RESTRICT CV,
+	const double* _RESTRICT const C, const double* _RESTRICT const V,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::cw_product_x4(float* _RESTRICT CW,
+	const float* _RESTRICT const C, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cw_product_x4(double* _RESTRICT CW,
+	const double* _RESTRICT const C, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: square(field) - field products * //
+template void nse::u2w_product_x4(float* _RESTRICT U2W,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::u2w_product_x4(double* _RESTRICT U2W,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::v2w_product_x4(float* _RESTRICT V2W,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::v2w_product_x4(double* _RESTRICT V2W,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::uww_product_x4(float* _RESTRICT UWW,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::uww_product_x4(double* _RESTRICT UWW,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::vww_product_x4(float* _RESTRICT VWW,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::vww_product_x4(double* _RESTRICT VWW,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::uvw_product_x4(float* _RESTRICT UVW,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::uvw_product_x4(double* _RESTRICT UVW,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+
+template void nse::cuw_product_x4(float* _RESTRICT CUW,
+	const float* _RESTRICT const C,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cuw_product_x4(double* _RESTRICT CUW,
+	const double* _RESTRICT const C,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::cvw_product_x4(float* _RESTRICT CVW,
+	const float* _RESTRICT const C,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cvw_product_x4(double* _RESTRICT CVW,
+	const double* _RESTRICT const C,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: product partitions * //
+template void nse::uw_product_partition_x4(float* _RESTRICT UW_Bottom, float* _RESTRICT UW_Top,
+	const float* _RESTRICT const U, const float* _RESTRICT const W, 
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::uw_product_partition_x4(double* _RESTRICT UW_Bottom, double* _RESTRICT UW_Top,
+	const double* _RESTRICT const U, const double* _RESTRICT const W, 
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::vw_product_partition_x4(float* _RESTRICT VW_Bottom, float* _RESTRICT VW_Top,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::vw_product_partition_x4(double* _RESTRICT VW_Bottom, double* _RESTRICT VW_Top,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template void nse::cw_product_partition_x4(float* _RESTRICT CW_Bottom, float* _RESTRICT CW_Top,
+	const float* _RESTRICT const C, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template void nse::cw_product_partition_x4(double* _RESTRICT CW_Bottom, double* _RESTRICT CW_Top,
+	const double* _RESTRICT const C, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const Grid3d< double >& grid);
+// ------------------------------------------------------------------------ //
diff --git a/nse-fops3d-x4.h b/nse-fops3d-x4.h
new file mode 100644
index 0000000000000000000000000000000000000000..de3f40ee006541696ba351b318e1a4271edadaaf
--- /dev/null
+++ b/nse-fops3d-x4.h
@@ -0,0 +1,107 @@
+#pragma once
+
+// [nse-fops3d-x4.h(cpp)]: 3D Navier-Stokes module (field operations) -X4
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "grid3d.h"
+
+
+namespace nse
+{
+	// Declarations of 3D field operations with 4th-order horizontal and
+	// 2nd-order vertical accuracy (-x4 suffix). All routines operate on
+	// flattened 3D arrays laid out per Grid3d (index = i*nyz + j*nz + k)
+	// and interpolate their inputs to the node type given by 'node'.
+
+	// * node-add(sub) routines (full -x4) * //
+	template< typename T > // X = X + Q [Q = U, V, W, C]
+	void c_add_x4(T* _RESTRICT X, const T* _RESTRICT const Q,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T > // X = X - Q [Q = U, V, W, C]
+	void c_sub_x4(T* _RESTRICT X, const T* _RESTRICT const Q,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * field products (horizontal -x4, vertical -x2) * //
+	template< typename T >	// = W * W [node: C, U, V, W, UW, VW]
+	void w_square_x4(T* _RESTRICT W2, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T >	// = U * V [node: C, UV, UVW]
+	void uv_product_x4(T* _RESTRICT UV,
+		const T* _RESTRICT const U, const T* _RESTRICT const V,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = U * W [node: C, UW, UVW]
+	void uw_product_x4(T* _RESTRICT UW,
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = V * W [node: C, VW, UVW]
+	void vw_product_x4(T* _RESTRICT VW,
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T > // = C * U [node: C, U, UW]
+	void cu_product_x4(T* _RESTRICT CU,
+		const T* _RESTRICT const C, const T* _RESTRICT const U,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T > // = C * V [node: C, V, VW]
+	void cv_product_x4(T* _RESTRICT CV,
+		const T* _RESTRICT const C, const T* _RESTRICT const V,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T > // = C * W [node: C, W, UW, VW]
+	void cw_product_x4(T* _RESTRICT CW,
+		const T* _RESTRICT const C, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+
+	// * square(field) - field products (horizontal -x4, vertical -x2) * //
+	template< typename T >	// = U * U * W [node: UW]
+	void u2w_product_x4(T* _RESTRICT U2W,
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T >	// = V * V * W [node: VW]
+	void v2w_product_x4(T* _RESTRICT V2W,
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T >	// = U * W * W [node: U]
+	void uww_product_x4(T* _RESTRICT UWW,
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = V * W * W [node: V]
+	void vww_product_x4(T* _RESTRICT VWW,
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = U * V * W [node: UVW]
+	void uvw_product_x4(T* _RESTRICT UVW,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+
+	template< typename T >	// = C * U * W [node: UW]
+	void cuw_product_x4(T* _RESTRICT CUW,
+		const T* _RESTRICT const C,
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	template< typename T >	// = C * V * W [node: VW]
+	void cvw_product_x4(T* _RESTRICT CVW,
+		const T* _RESTRICT const C,
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+
+	// * product partitions (horizontal -x4, vertical -x2) * //
+	// Each writes two arrays: the bottom-side and top-side halves of the
+	// vertical product, at the node pairs listed per overload below.
+	template< typename T >	// = U * W [node: U (UW -- U), UV (UVW -- UV), UW (U -- UW)]
+	void uw_product_partition_x4(T* _RESTRICT UW_Bottom, T* _RESTRICT UW_Top,
+		const T* _RESTRICT const U, const T* _RESTRICT const W, 
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T >	// = V * W [node: V (VW -- V), UV (UVW -- UV), VW (V -- VW)]
+	void vw_product_partition_x4(T* _RESTRICT VW_Bottom, T* _RESTRICT VW_Top,
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+
+	template< typename T >	// = C * W [node: C (W -- C), U (UW -- U), V (VW -- V), W (C -- W)]
+	void cw_product_partition_x4(T* _RESTRICT CW_Bottom, T* _RESTRICT CW_Top,
+		const T* _RESTRICT const C, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const Grid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+}
diff --git a/nse-fourier-vec.h b/nse-fourier-vec.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d5f69efc3d9d37413c3155ec2f11b5e3998aefe
--- /dev/null
+++ b/nse-fourier-vec.h
@@ -0,0 +1,68 @@
+#pragma once
+
+// [nse-fourier-vec.h]: FFT transforms container
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-defines.h"
+#include "grid3d.h"
+
+// Container for 1D Fourier-spectrum diagnostics of U^2 along -x.
+// Buffers are allocated by init() and released by clear(); pointer members
+// are valid only while status == true.
+template< typename T >
+struct nseFourierVec
+{
+	T *U2_xz;		// U^2(x,z) averaged over -y: [U]	[local array]
+	T *U2_x;		// U^2(x) at specific -z: [U]		[local array]
+
+	T *DFT_U2_x[4];			// Fourier transform of U^2(x) at specific z; 4 component slots, original note listed [real,"implicit",modulus] — "implicit" presumably meant "imaginary", TODO confirm	[global array]
+	T *DFT_wavenumber_x;	// Wave-number for x component											[global array]
+
+	bool status;		// allocation status, default: [false]
+
+
+	void init(const nse::Grid3d< T >& grid);	// allocate all buffers (zero-filled)
+	void clear();								// release buffers; no-op if not allocated
+
+	nseFourierVec();
+	~nseFourierVec();
+};
+// -------------------------------------------------------------------------------------------- //
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+// Default constructor: marks the container as unallocated (status == false).
+// NOTE(review): pointer members are left uninitialized until init(); clear()
+// guards on status, so this is safe — but null-initializing them would be
+// more defensive. TODO confirm no code reads the pointers before init().
+template< typename T >
+nseFourierVec< T >::nseFourierVec() : status(false) {}
+// Destructor releases any allocated buffers (clear() is a no-op otherwise).
+template< typename T >
+nseFourierVec< T >::~nseFourierVec() { clear(); }
+// -------------------------------------------------------------------------------------------- //
+
+// Allocates (zero-initialized) all spectrum buffers for the given grid:
+//  - U2_xz, U2_x: local arrays sized from the local grid extents;
+//  - DFT_U2_x[0..3], DFT_wavenumber_x: global arrays sized by grid.mpi_nx.
+// Sets status = true on completion.
+// Fix: a repeated call to init() previously leaked every buffer allocated by
+// the prior call; release them first (clear() is a no-op when status == false).
+template< typename T >
+void nseFourierVec< T >::init(const nse::Grid3d< T >& grid)
+{
+	clear();	// avoid leaking buffers on re-initialization
+
+	nse::allocate_vnull(&U2_xz, grid.nx * grid.nz);
+	nse::allocate_vnull(&U2_x, grid.mpi_nx);
+	nse::allocate_vnull(&DFT_U2_x[0], grid.mpi_nx);
+	nse::allocate_vnull(&DFT_U2_x[1], grid.mpi_nx);
+	nse::allocate_vnull(&DFT_U2_x[2], grid.mpi_nx);
+	nse::allocate_vnull(&DFT_U2_x[3], grid.mpi_nx);
+	nse::allocate_vnull(&DFT_wavenumber_x, grid.mpi_nx);
+
+	status = true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+// Releases every buffer allocated by init() and resets the allocation flag.
+// Safe to call repeatedly: does nothing when no buffers are allocated.
+template< typename T >
+void nseFourierVec< T >::clear()
+{
+	if (!status) return;	// nothing was allocated
+
+	nse::deallocate(U2_xz);
+	nse::deallocate(U2_x);
+	for (int m = 0; m < 4; m++)
+		nse::deallocate(DFT_U2_x[m]);
+	nse::deallocate(DFT_wavenumber_x);
+
+	status = false;
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/nse-generic3d.cpp b/nse-generic3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bd369024f9d3397be9d4976e1b396aba8d4ed325
--- /dev/null
+++ b/nse-generic3d.cpp
@@ -0,0 +1,1831 @@
+#define _CRT_SECURE_NO_DEPRECATE
+#include "nse-generic3d.h"
+
+#include "mtrand.h"
+#include "vecmath.h"
+#include "io-base1d.h"
+#include <math.h>
+
+using namespace nse::nse_const3d;
+
+// * Adams-Bashforth time advancement * //
+// ------------------------------------------------------------------------ //
+// 2nd-order Adams-Bashforth extrapolation with a constant time step,
+// applied in place over interior cells only (ghost layers excluded):
+//   X <- 1.5*X - 0.5*Xp,  Xp <- old X.
+// Note: the (grid, dt, p_dt) overload below sweeps the whole array instead.
+template< typename T >
+void nse::adams_bashforth_x2(
+	T*  _RESTRICT X, T*  _RESTRICT Xp,
+	const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;	// running start index of the (i,j) column
+#endif
+	T C;	// extrapolated value
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx, C ) shared( X, Xp ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx, C ) shared( X, Xp )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+				C = (T) 1.5 * X[idx] - (T) 0.5 * Xp[idx];
+				Xp[idx] = X[idx];
+				X[idx] = C;
+			}
+		}
+	}
+}
+
+// 2nd-order Adams-Bashforth extrapolation with a variable time step,
+// applied over the entire array (ghost cells included):
+//   X <- X + 0.5*(dt/p_dt)*(X - Xp),  Xp <- old X.
+// dt is the current step, p_dt the previous one; dt == p_dt recovers the
+// classic 1.5*X - 0.5*Xp form.
+template< typename T >
+void nse::adams_bashforth_x2(
+	T*  _RESTRICT X, T*  _RESTRICT Xp,
+	const Grid3d< T >& grid, const T dt, const T p_dt)
+{
+	const T c_dt = (T) 0.5 * dt / p_dt;	// extrapolation weight
+
+	int idx;
+	T extrap;
+
+#pragma omp parallel for private( idx, extrap ) shared( Xp, X )
+	for (idx = 0; idx < grid.size; idx++) {
+		extrap = X[idx] + c_dt * (X[idx] - Xp[idx]);
+		Xp[idx] = X[idx];
+		X[idx] = extrap;
+	}
+}
+
+// 2nd-order Adams-Bashforth extrapolation with a stabilizing shift eps,
+// applied over the entire array (ghost cells included):
+//   X <- (1.5 + eps)*X - (0.5 + eps)*Xp,  Xp <- old X.
+template< typename T >
+void nse::adams_bashforth_x2(
+	T*  _RESTRICT X, T*  _RESTRICT Xp,
+	const T eps, const Grid3d< T >& grid)
+{
+	const T w_cur = (T) 1.5 + eps;	// weight of the current level
+	const T w_prev = (T) 0.5 + eps;	// weight of the previous level
+
+	int idx;
+	T extrap;
+
+#pragma omp parallel for private( idx, extrap ) shared( Xp, X )
+	for (idx = 0; idx < grid.size; idx++) {
+		extrap = w_cur * X[idx] - w_prev * Xp[idx];
+		Xp[idx] = X[idx];
+		X[idx] = extrap;
+	}
+}
+
+// 3rd-order Adams-Bashforth extrapolation with a constant time step,
+// applied over the entire array (ghost cells included):
+//   X <- (23/12)*X - (4/3)*Xp + (5/12)*Xpp, history shifted down one level.
+template< typename T >
+void nse::adams_bashforth_x3(
+	T*  _RESTRICT X, T*  _RESTRICT Xp, T*  _RESTRICT Xpp,
+	const Grid3d< T >& grid)
+{
+	// classic AB3 weights
+	const T w0 = (T) 23.0 / (T) 12.0;
+	const T w1 = (T) 4.0 / (T) 3.0;
+	const T w2 = (T) 5.0 / (T) 12.0;
+
+	int idx;
+	T extrap;
+
+#pragma omp parallel for private( idx, extrap ) shared( Xpp, Xp, X )
+	for (idx = 0; idx < grid.size; idx++) {
+		extrap = w0 * X[idx] - w1 * Xp[idx] + w2 * Xpp[idx];
+
+		Xpp[idx] = Xp[idx];
+		Xp[idx] = X[idx];
+		X[idx] = extrap;
+	}
+}
+
+// 3rd-order Adams-Bashforth extrapolation with non-uniform time steps
+// (dt current, p_dt previous, pp_dt one before that), applied over the
+// entire array (ghost cells included):
+//   X <- alpha*X - beta*Xp + gamma*Xpp, history shifted down one level.
+// For dt == p_dt == pp_dt the coefficients reduce to the classic AB3
+// weights 23/12, 4/3, 5/12.
+template< typename T >
+void nse::adams_bashforth_x3(
+	T*  _RESTRICT X, T*  _RESTRICT Xp, T*  _RESTRICT Xpp,
+	const Grid3d< T >& grid, const T dt, const T p_dt, const T pp_dt)
+{
+	// variable-step AB3 coefficients
+	// NOTE(review): formulas assumed consistent with the standard
+	// variable-step Adams-Bashforth-3 derivation — verify against reference
+	const T alpha = (T) 1.0 +
+		((T) 1.0 / (T) 6.0) *
+		(dt * ((T) 2.0 * dt + (T) 3.0 * pp_dt + (T) 6.0 * p_dt)) / (p_dt * (p_dt + pp_dt));
+	const T beta = ((T) 1.0 / (T) 6.0) *
+		(dt * ((T) 2.0 * dt + (T) 3.0 * pp_dt + (T) 3.0 * p_dt)) / (pp_dt * p_dt);
+	const T gamma = ((T) 1.0 / (T) 6.0) *
+		(dt * ((T) 2.0 * dt + (T) 3.0 * p_dt)) / (pp_dt * (p_dt + pp_dt));
+
+	T C;	// extrapolated value
+	int i;
+
+#pragma omp parallel for private( i, C ) shared( Xpp, Xp, X )
+	for (i = 0; i < grid.size; i++) {
+		C = alpha * X[i] - beta * Xp[i] + gamma * Xpp[i];
+
+		Xpp[i] = Xp[i];
+		Xp[i] = X[i];
+		X[i] = C;
+	}
+}
+
+// 2nd-order Adams-Bashforth extrapolation (constant step) over interior
+// cells, skipping cells flagged as solidCell in 'mask'. For skipped cells
+// neither X nor the history Xp is advanced.
+template< typename T >
+void nse::adams_bashforth_x2(
+	T*  _RESTRICT X, T*  _RESTRICT Xp,
+	const int* _RESTRICT const mask,
+	const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;	// running start index of the (i,j) column
+#endif
+	T C;	// extrapolated value
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx, C ) shared( X, Xp ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx, C ) shared( X, Xp )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) 
+			{
+				if (mask[idx] == solidCell) continue;	// leave solid cells untouched
+
+				C = (T) 1.5 * X[idx] - (T) 0.5 * Xp[idx];
+				Xp[idx] = X[idx];
+				X[idx] = C;
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * Scalar min-max * //
+// ------------------------------------------------------------------------ //
+// Global (MPI-wide) min/max of scalar field F over interior cells.
+// Seeds both extrema with the first interior cell, scans locally using one
+// of three macro-selected strategies, then reduces across ranks:
+//  - USE_OPENMP20_IN_MINMAX (without USE_AS_OPENMP31): manual per-thread
+//    locals merged in a critical section (OpenMP 2.0 compatible);
+//  - USE_OPENMP_2D_CYCLE: collapse(2) loop with min/max reductions;
+//  - otherwise: min/max reduction pragma only under USE_AS_OPENMP31
+//    (serial scan if that macro is absent).
+template< typename T >
+void nse::scalar_min_max(T*  _RESTRICT fmin, T*  _RESTRICT fmax,
+	const T*  _RESTRICT const F, const Grid3d< T >& grid)
+{
+	int i, j, k, shidx, idx;
+	// initial extrema = value at the first interior cell
+	T _min = F[grid.gcx * grid.nyz + grid.gcy * grid.nz + grid.gcz], 
+		_max = F[grid.gcx * grid.nyz + grid.gcy * grid.nz + grid.gcz];
+
+#if defined(USE_OPENMP20_IN_MINMAX) && !defined(USE_AS_OPENMP31)
+
+	T _min_local, _max_local;
+
+#pragma omp parallel private(i, j, k, shidx, idx, _min_local, _max_local) shared(_min, _max)
+	{
+		_min_local = _min;
+		_max_local = _max;
+
+#pragma omp for nowait
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					if (F[idx] > _max_local) _max_local = F[idx];
+					if (F[idx] < _min_local) _min_local = F[idx];
+				}
+			}
+		}
+
+		// merge per-thread extrema
+#pragma omp critical
+		{
+			if (_max_local > _max) _max = _max_local;
+			if (_min_local < _min) _min = _min_local;
+		}
+	}
+
+#else
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private(i, j, k, idx) reduction(max: _max) reduction(min: _min) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#ifdef USE_AS_OPENMP31
+#pragma omp parallel for private(i, j, k, shidx, idx) reduction(max: _max) reduction(min: _min)
+#endif
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				if (F[idx] > _max) _max = F[idx];
+				if (F[idx] < _min) _min = F[idx];
+			}
+		}
+	}
+
+#endif
+
+	// combine local extrema across all MPI ranks of the grid communicator
+	mpi_allreduce(&_max, MPI_MAX, grid.mpi_com.comm);
+	mpi_allreduce(&_min, MPI_MIN, grid.mpi_com.comm);
+
+	(*fmax) = _max;
+	(*fmin) = _min;
+}
+// ------------------------------------------------------------------------ //
+
+// * Velocity abs max * //
+// ------------------------------------------------------------------------ //
+// Global (MPI-wide) maxima of |U|, |V|, |W| over interior cells.
+// Uses the same three macro-selected scan strategies as scalar_min_max,
+// then a single MPI max-reduction over all three values.
+template< typename T >
+void nse::velocity_abs_max(T*  _RESTRICT umax, T*  _RESTRICT vmax, T*  _RESTRICT wmax,
+	const T*  _RESTRICT const U, const T*  _RESTRICT const V, const T*  _RESTRICT const W,
+	const Grid3d< T >& grid)
+{
+	int i, j, k, shidx, idx;
+	T um = (T)0, vm = (T)0, wm = (T)0;	// |.| maxima start at zero
+
+#if defined(USE_OPENMP20_IN_MINMAX) && !defined(USE_AS_OPENMP31)
+
+	T um_local, vm_local, wm_local;
+
+#pragma omp parallel private(i, j, k, shidx, idx, um_local, vm_local, wm_local) shared(um, vm, wm)
+	{
+		um_local = (T)0;
+		vm_local = (T)0;
+		wm_local = (T)0;
+
+#pragma omp for nowait
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					if (fabs(U[idx]) > um_local) um_local = fabs(U[idx]);
+					if (fabs(V[idx]) > vm_local) vm_local = fabs(V[idx]);
+					if (fabs(W[idx]) > wm_local) wm_local = fabs(W[idx]);
+				}
+			}
+		}
+
+		// merge per-thread maxima
+#pragma omp critical
+		{
+			if (um_local > um) um = um_local;
+			if (vm_local > vm) vm = vm_local;
+			if (wm_local > wm) wm = wm_local;
+		}
+	}
+
+#else
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private(i, j, k, idx) reduction(max: um, vm, wm) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#ifdef USE_AS_OPENMP31
+#pragma omp parallel for private(i, j, k, shidx, idx) reduction(max: um, vm, wm)
+#endif
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				if (fabs(U[idx]) > um) um = fabs(U[idx]);
+				if (fabs(V[idx]) > vm) vm = fabs(V[idx]);
+				if (fabs(W[idx]) > wm) wm = fabs(W[idx]);
+			}
+		}
+	}
+
+#endif
+
+	// single MPI max-reduction over all three components
+	mpi_allreduce(&um, &vm, &wm, MPI_MAX, grid.mpi_com.comm);
+
+	(*umax) = um;
+	(*vmax) = vm;
+	(*wmax) = wm;
+}
+
+// Global (MPI-wide) maxima of |U|, |V|, |W| over interior cells, excluding
+// velocities adjacent to solid cells. Each component checks the cell at idx
+// and its neighbor across the face (idx - nyz for U, idx - nz for V,
+// idx - 1 for W) — consistent with a staggered, face-located velocity layout.
+template< typename T >
+void nse::velocity_abs_max(T*  _RESTRICT umax, T*  _RESTRICT vmax, T*  _RESTRICT wmax,
+	const T*  _RESTRICT const U, const T*  _RESTRICT const V, const T*  _RESTRICT const W,
+	const int* _RESTRICT const mask,
+	const Grid3d< T >& grid)
+{
+	int i, j, k, shidx, idx;
+	T um = (T)0, vm = (T)0, wm = (T)0;	// |.| maxima start at zero
+
+#if defined(USE_OPENMP20_IN_MINMAX) && !defined(USE_AS_OPENMP31)
+
+	T um_local, vm_local, wm_local;
+
+#pragma omp parallel private(i, j, k, shidx, idx, um_local, vm_local, wm_local) shared(um, vm, wm)
+	{
+		um_local = (T)0;
+		vm_local = (T)0;
+		wm_local = (T)0;
+
+#pragma omp for nowait
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+		{
+			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+			{
+				idx = shidx;
+				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+					if ((mask[idx] != solidCell) && (mask[idx - grid.nyz] != solidCell))
+						if (fabs(U[idx]) > um_local) um_local = fabs(U[idx]);
+					if ((mask[idx] != solidCell) && (mask[idx - grid.nz] != solidCell))
+						if (fabs(V[idx]) > vm_local) vm_local = fabs(V[idx]);
+					if ((mask[idx] != solidCell) && (mask[idx - 1] != solidCell))
+						if (fabs(W[idx]) > wm_local) wm_local = fabs(W[idx]);
+				}
+			}
+		}
+
+		// merge per-thread maxima
+#pragma omp critical
+		{
+			if (um_local > um) um = um_local;
+			if (vm_local > vm) vm = vm_local;
+			if (wm_local > wm) wm = wm_local;
+		}
+	}
+
+#else
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private(i, j, k, idx) reduction(max: um, vm, wm) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#ifdef USE_AS_OPENMP31
+#pragma omp parallel for private(i, j, k, shidx, idx) reduction(max: um, vm, wm)
+#endif
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				if ((mask[idx] != solidCell) && (mask[idx - grid.nyz] != solidCell))
+					if (fabs(U[idx]) > um) um = fabs(U[idx]);
+				if ((mask[idx] != solidCell) && (mask[idx - grid.nz] != solidCell))
+					if (fabs(V[idx]) > vm) vm = fabs(V[idx]);
+				if ((mask[idx] != solidCell) && (mask[idx - 1] != solidCell))
+					if (fabs(W[idx]) > wm) wm = fabs(W[idx]);
+			}
+		}
+	}
+
+#endif
+
+	// single MPI max-reduction over all three components
+	mpi_allreduce(&um, &vm, &wm, MPI_MAX, grid.mpi_com.comm);
+
+	(*umax) = um;
+	(*vmax) = vm;
+	(*wmax) = wm;
+}
+// ------------------------------------------------------------------------ //
+
+// * Sub slice[x||y||z] from field * //
+// ------------------------------------------------------------------------ //
+// Subtracts a 1-D profile Xavg from the 3-D field X over interior cells:
+//   Xdev[i,j,k] = X[i,j,k] - Xavg[a], where a = i, j or k depending on
+// 'axis'. Xavg is indexed by the local grid index along the chosen axis.
+template< typename T >
+void nse::sub_slice(T* _RESTRICT Xdev,
+	const T* _RESTRICT const X, const T* _RESTRICT const Xavg,
+	const nse_const3d::axisType axis, const Grid3d< T >& grid)
+{
+	int i, j, k, idx, avg_idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;	// running start index of the (i,j) column
+#endif
+
+	// exactly one of these selectors is 1, the others 0
+	const int xmode = (axis == axisX) ? 1 : 0;
+	const int ymode = (axis == axisY) ? 1 : 0;
+	const int zmode = (axis == axisZ) ? 1 : 0;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx, avg_idx ) shared( Xdev ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx, avg_idx ) shared( Xdev )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				// pick the profile index along the selected axis
+				avg_idx = i * xmode + j * ymode + k * zmode;
+
+				Xdev[idx] = X[idx] - Xavg[avg_idx];
+			}
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// * Add random noise to field * //
+// -------------------------------------------------------------------- //
+template< typename T >
+void nse::add_disturbance(T* _RESTRICT X,
+	const T variance, const long int seed, const Grid3d< T >& grid)
+{
+	int i, j, k, shidx, idx;
+	GaussRand gen;
+
+	gen.set((double)0, fabs((double)variance), seed);
+
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+				X[idx] += (T)gen.mt_rand();	// gauss noise //
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::add_disturbance(T* _RESTRICT X, const T variance, const long int seed,
+	const T zmin, const T zmax, const Grid3d< T >& grid)
+{
+	int i, j, k, shidx, idx;
+	GaussRand gen;
+
+	gen.set((double)0, fabs((double)variance), seed);
+
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+				if ((grid.pz[k] >= zmin) && (grid.pz[k] <= zmax)) {
+					X[idx] += (T)gen.mt_rand();	// gauss noise //
+				}
+			}
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// * init layered field * //
+// -------------------------------------------------------------------- //
+template< typename T >
+void nse::c_init_layers(T* _RESTRICT X, const T val_down, const T val_up,
+	const T z_layer, const T layer_eps, const Grid3d< T >& grid)
+{
+	int i, j, k, shidx, idx;
+	T value;
+
+#pragma omp parallel for private( i, j, k, shidx, idx, value ) shared( X )
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++)
+	{
+		value = val_down +
+			(val_up - val_down) * linear_step(grid.pz[k] - z_layer, layer_eps);
+
+		shidx = grid.gcx * grid.nyz + grid.gcy * grid.nz + k;
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++, shidx += grid.nyz)
+		{
+			idx = shidx;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, idx += grid.nz) {
+				X[idx] = value;
+			}
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// * init linear field * //
+// -------------------------------------------------------------------- //
+template< typename T >
+void nse::c_init_linear(T* _RESTRICT X, const T val_down, const T val_up,
+	const Grid3d< T >& grid)
+{
+	int i, j, k, shidx, idx;
+	T value;
+
+#pragma omp parallel for private( i, j, k, shidx, idx, value ) shared( X )
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++)
+	{
+		value = val_down +
+			(val_up - val_down) * (grid.pz[k] / grid.mpi_width);
+
+		shidx = grid.gcx * grid.nyz + grid.gcy * grid.nz + k;
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++, shidx += grid.nyz)
+		{
+			idx = shidx;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, idx += grid.nz) {
+				X[idx] = value;
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::c_init_linear(T* _RESTRICT X, const T val_down, const T val_up,
+	const int* _RESTRICT const mask,
+	const Grid3d< T >& grid)
+{
+	int i, j, k, shidx, idx;
+	T value;
+
+#pragma omp parallel for private( i, j, k, shidx, idx, value ) shared( X )
+	for (k = grid.gcz; k < grid.nz - grid.gcz; k++)
+	{
+		value = val_down +
+			(val_up - val_down) * (grid.pz[k] / grid.mpi_width);
+
+		shidx = grid.gcx * grid.nyz + grid.gcy * grid.nz + k;
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++, shidx += grid.nyz)
+		{
+			idx = shidx;
+			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, idx += grid.nz) 
+			{
+				if (mask[idx] == solidCell) continue;
+				X[idx] = value;
+			}
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// * set linear profile for 3D field * //
+// -------------------------------------------------------------------- //
+//		: X = valueH - (z0 + H - z) * Xgrad
template< typename T >
void nse::c_set_linear_profile_z_down(T* _RESTRICT X,
	const T valueH, const T Xgrad, const Grid3d< T >& grid)
{
	// X = valueH - Xgrad * d, with d measured downward from the local domain
	// top (d = mpi_z + mpi_height - pz[k]); i.e. valueH holds at the top.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( X )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				T zpos = (grid.mpi_z + grid.mpi_height - grid.pz[k]);
				X[idx] = valueH - Xgrad * zpos;
			}
		}
	}
}
+
+//		: X = value0 + z * Xgrad
template< typename T >
void nse::c_set_linear_profile_z_up(T* _RESTRICT X,
	const T value0, const T Xgrad, const Grid3d< T >& grid)
{
	// X = value0 + Xgrad * pz[k]: linear profile measured upward from z = 0.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( X )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				X[idx] = value0 + Xgrad * grid.pz[k];
			}
		}
	}
}
+// -------------------------------------------------------------------- //
+
+// * set profile for 3D field * //
+//		: X = profile(d), d = (z0 + H - z)
template< typename T >
void nse::c_set_profile_z_down(T* _RESTRICT X,
	const T* _RESTRICT const profile, const T* _RESTRICT const coord, const int np, 
	const T Xshift, const Grid3d< T >& grid)
{
	// Set X at every interior cell by interpolating the 1D [profile](coord)
	// (np points) at distance d measured downward from the local domain top
	// (d = mpi_z + mpi_height - pz[k]); Xshift is added to each value.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( X )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				T d = grid.mpi_z + grid.mpi_height - grid.pz[k];
				X[idx] = interp(d, profile, coord, np) + Xshift;
			}
		}
	}
}
+
+template< typename T >
+bool nse::c_set_profile_z_down(T* _RESTRICT X,
+	const std::string& filename, 
+	const T Xshift, const Grid3d< T >& grid)
+{
+	T *profile, *coord;
+	int np;
+
+	if (!mpi_read_plain_1d(filename,
+		&profile, &coord, &np, grid.mpi_com.comm)) return false;
+	if (np == 0) return false;
+
+	c_set_profile_z_down(X, profile, coord, np, Xshift, grid);
+	deallocate(coord, profile);
+	return true;
+}
+
+//		: X = profile(z)
+template< typename T >
+void nse::c_set_profile_z_up(T* _RESTRICT X,
+	const T* _RESTRICT const profile, const T* _RESTRICT const coord, const int np, 
+	const T Xshift, const Grid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( X )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
+			{
+				X[idx] = interp(grid.pz[k], profile, coord, np);
+			}
+		}
+	}
+}
+
+template< typename T >
+bool nse::c_set_profile_z_up(T* _RESTRICT X,
+	const std::string& filename, 
+	const T Xshift, const Grid3d< T >& grid)
+{
+	T *profile, *coord;
+	int np;
+
+	if (!mpi_read_plain_1d(filename,
+		&profile, &coord, &np, grid.mpi_com.comm)) return false;
+	if (np == 0) return false;
+
+	c_set_profile_z_up(X, profile, coord, np, Xshift, grid);
+	deallocate(coord, profile);
+	return true;
+}
+// -------------------------------------------------------------------- //
+
+// * plane distance [axis = XY,XZ,YZ] * //
+// -------------------------------------------------------------------- //
+template< typename T >
+void nse::c_plane_dist(T* _RESTRICT dist, const T pconst,
+	const nse_const3d::axisType axis, const  Grid3d< T >& grid)
+{
+	if (axis == axisXY) {
+
+		int k;
+		for (k = grid.gcz; k < grid.nz - grid.gcz; k++) {
+			dist[k] = fabs(grid.pz[k] - pconst);
+		}
+		return;
+	}
+	if (axis == axisXZ) {
+
+		int j;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++) {
+			dist[j] = fabs(grid.py[j] - pconst);
+		}
+		return;
+	}
+	if (axis == axisYZ) {
+
+		int i;
+		for (i = grid.gcx; i < grid.nx - grid.gcx; i++) {
+			dist[i] = fabs(grid.px[i] - pconst);
+		}
+		return;
+	}
+}
+// -------------------------------------------------------------------- //
+
+// * apply mask * //
template< typename T >
void nse::apply_mask(T* _RESTRICT U, T* _RESTRICT V, T* _RESTRICT W,
	const int* _RESTRICT const mask, const T Usolid, const T Vsolid, const T Wsolid,
	const Grid3d< T >& grid)
{
	// Force velocity components to the given solid values on any face
	// adjacent to a solid cell: U at faces between cells (idx, idx - nyz),
	// V between (idx, idx - nz), W between (idx, idx - 1) — i.e. each
	// component's -x/-y/-z neighbor respectively (staggered layout).
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( U, V, W ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( U, V, W )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) 
			{
				if ((mask[idx] == solidCell) || (mask[idx - grid.nyz] == solidCell)) {
					U[idx] = Usolid;
				}
				if ((mask[idx] == solidCell) || (mask[idx - grid.nz] == solidCell)) {
					V[idx] = Vsolid;
				}
				if ((mask[idx] == solidCell) || (mask[idx - 1] == solidCell)) {
					W[idx] = Wsolid;
				}

			}
		}
	}
}
+
template< typename T >
void nse::apply_mask(T* _RESTRICT X,
	const int* _RESTRICT const mask, const T Xsolid,
	const Grid3d< T >& grid)
{
	// Scalar variant: overwrite X with Xsolid in every solid interior cell.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( X )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) 
			{
				if (mask[idx] == solidCell) X[idx] = Xsolid;
			}
		}
	}
}
+// -------------------------------------------------------------------- //
+
+// * add 2D field * //
template< typename T >
void nse::add_2d(T* _RESTRICT X,
	const T* _RESTRICT const Xadd, const Grid3d< T >& grid)
{
	// Add a 2D (x,y) field to every z-level of X over the grid interior;
	// Xadd is addressed as pidx = i * ny + j (row stride ny, not nyz).
	int i, j, k, idx, pidx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx, pidx ) shared( X ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx, pidx ) shared( X )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			pidx = i * grid.ny + j;
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				X[idx] += Xadd[pidx];
			}
		}
	}
}
+// -------------------------------------------------------------------- //
+
+// * -z damping layer * //
template< typename T >
void nse::u_damping_z(T* _RESTRICT U_rhs, const T* _RESTRICT const U,
	const T U_steady, const T f, 
	const T z1, const T z2, const Grid3d< T >& grid)
{
	// Sponge (Rayleigh-type) damping of U toward U_steady for cells with
	// z1 <= pz[k] <= z2: U_rhs -= f * sin^2(ramp) * (U - U_steady), where the
	// sin^2 weight ramps smoothly from 0 at z1 up to 1 at z2.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( U_rhs ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( U_rhs )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				if ((grid.pz[k] >= z1) && (grid.pz[k] <= z2))
				{
					T sin_damping = sin((T)0.5 * (T)M_PI * ((T)1.0 - (z2 - grid.pz[k]) / (z2 - z1)));
					U_rhs[idx] -= f * sin_damping * sin_damping * (U[idx] - U_steady);
				}
			}
		}
	}
}
+
template< typename T >
void nse::v_damping_z(T* _RESTRICT V_rhs, const T* _RESTRICT const V,
	const T V_steady, const T f, 
	const T z1, const T z2, const Grid3d< T >& grid)
{
	// Sponge damping of V toward V_steady for cells with z1 <= pz[k] <= z2:
	// V_rhs -= f * sin^2(ramp) * (V - V_steady); weight ramps 0 -> 1
	// from z1 to z2 (same form as u_damping_z).
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( V_rhs ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( V_rhs )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				if ((grid.pz[k] >= z1) && (grid.pz[k] <= z2))
				{
					T sin_damping = sin((T)0.5 * (T)M_PI * ((T)1.0 - (z2 - grid.pz[k]) / (z2 - z1)));
					V_rhs[idx] -= f * sin_damping * sin_damping * (V[idx] - V_steady);
				}
			}
		}
	}
}
+
template< typename T >
void nse::w_damping_z(T* _RESTRICT W_rhs, const T* _RESTRICT const W,
	const T W_steady, const T f, 
	const T z1, const T z2, const Grid3d< T >& grid)
{
	// Sponge damping of W toward W_steady for z1 <= ez[k] <= z2;
	// NOTE: uses grid.ez (not pz) — presumably the staggered w-node
	// z-coordinate; confirm against the Grid3d definition.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( W_rhs ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W_rhs )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				if ((grid.ez[k] >= z1) && (grid.ez[k] <= z2))
				{
					T sin_damping = sin((T)0.5 * (T)M_PI * ((T)1.0 - (z2 - grid.ez[k]) / (z2 - z1)));
					W_rhs[idx] -= f * sin_damping * sin_damping * (W[idx] - W_steady);
				}
			}
		}
	}
}
+
template< typename T >
void nse::c_damping_z(T* _RESTRICT C_rhs, const T* _RESTRICT const C,
	const T C_steady, const T f,
	const T z1, const T z2, const Grid3d< T >& grid)
{
	// Sponge damping of scalar C toward the constant C_steady for cells with
	// z1 <= pz[k] <= z2: C_rhs -= f * sin^2(ramp) * (C - C_steady);
	// weight ramps 0 -> 1 from z1 to z2.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( C_rhs ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( C_rhs )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				if ((grid.pz[k] >= z1) && (grid.pz[k] <= z2))
				{
					T sin_damping = sin((T)0.5 * (T)M_PI * ((T)1.0 - (z2 - grid.pz[k]) / (z2 - z1)));
					C_rhs[idx] -= f * sin_damping * sin_damping * (C[idx] - C_steady);
				}
			}
		}
	}
}
+
template< typename T >
void nse::c_damping_z(T* _RESTRICT C_rhs, const T* _RESTRICT const C,
	const T* _RESTRICT const C_steady, const T f,
	const T z1, const T z2, const Grid3d< T >& grid)
{
	// Profile variant: damp scalar C toward the 1D steady profile C_steady[k]
	// (indexed by z-level) for cells with z1 <= pz[k] <= z2; same sin^2 ramp
	// (0 at z1 -> 1 at z2) as the constant-value overload.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( C_rhs ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( C_rhs )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				if ((grid.pz[k] >= z1) && (grid.pz[k] <= z2))
				{
					T sin_damping = sin((T)0.5 * (T)M_PI * ((T)1.0 - (z2 - grid.pz[k]) / (z2 - z1)));
					C_rhs[idx] -= f * sin_damping * sin_damping * (C[idx] - C_steady[k]);
				}
			}
		}
	}
}
+// -------------------------------------------------------------------- //
+
+// * count * //
template< typename T >
int nse::get_num(const int* _RESTRICT const value, const int a, const Grid3d< T >& grid)
{
	// Count interior cells with value[idx] == a; the thread-local counts are
	// combined with an OpenMP sum reduction, then summed across MPI ranks,
	// so the return value is the global count.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	int num = 0;

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) collapse( 2 ) reduction( + : num)
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) reduction( + : num)
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				if (value[idx] == a) num++;
			}
		}
	}

	mpi_allreduce(&num, MPI_SUM, grid.mpi_com.comm);
	return num;
}
+
template< typename T >
void nse::get_num(int* _RESTRICT a_num, int* _RESTRICT b_num,
	const int* _RESTRICT const value, const int a, const int b, const Grid3d< T >& grid)
{
	// Count interior cells equal to a and to b in a single sweep (a cell
	// matching both, when a == b, increments both counters); results are
	// summed across MPI ranks into *a_num / *b_num.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	int na = 0, nb = 0;

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) collapse( 2 ) reduction( + : na, nb)
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) reduction( + : na, nb)
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				if (value[idx] == a) na++;
				if (value[idx] == b) nb++;
			}
		}
	}
	(*a_num) = na;
	(*b_num) = nb;

	mpi_allreduce(a_num, b_num, MPI_SUM, grid.mpi_com.comm);
}
+
+template< typename T >
+T nse::get_fraction(const int* _RESTRICT const value, const int a, const Grid3d< T >& grid)
+{
+	int num = get_num(value, a, grid);
+	int grid_size = (grid.mpi_nx - 2 * grid.gcx) * (grid.mpi_ny - 2 * grid.gcy) * (grid.mpi_nz - 2 * grid.gcz);
+
+	return ((T)num) / ((T)grid_size);
+}
+
+template< typename T >
+void nse::get_fraction(T* _RESTRICT a_frac, T* _RESTRICT b_frac,
+	const int* _RESTRICT const value, const int a, const int b, const Grid3d< T >& grid)
+{
+	int a_num, b_num;
+	get_num(&a_num, &b_num, value, a, b, grid);
+	int grid_size = (grid.mpi_nx - 2 * grid.gcx) * (grid.mpi_ny - 2 * grid.gcy) * (grid.mpi_nz - 2 * grid.gcz);
+
+	(*a_frac) = ((T)a_num) / ((T)grid_size);
+	(*b_frac) = ((T)b_num) / ((T)grid_size);
+}
+// -------------------------------------------------------------------- //
+
+// * variance * //
template< typename T >
void nse::u_variance(T* _RESTRICT u_variance,
	const T* _RESTRICT const U2, const T* _RESTRICT const U,
	const Grid3d< T >& grid)
{
	// Pointwise variance from accumulated fields: u_variance = U2 - U * U.
	// The i-range extends one extra layer (nx - gcx + 1) — presumably the
	// staggered u-face grid in x; confirm against the grid layout.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( u_variance ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx + 1; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( u_variance )
	for (i = grid.gcx; i < grid.nx - grid.gcx + 1; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				u_variance[idx] = U2[idx] - U[idx] * U[idx];
			}
		}
	}
}
+
template< typename T >
void nse::v_variance(T* _RESTRICT v_variance,
	const T* _RESTRICT const V2, const T* _RESTRICT const V,
	const Grid3d< T >& grid)
{
	// Pointwise variance from accumulated fields: v_variance = V2 - V * V.
	// The j-range extends one extra layer (ny - gcy + 1) — presumably the
	// staggered v-face grid in y; confirm against the grid layout.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( v_variance ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy + 1; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( v_variance )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy + 1; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				v_variance[idx] = V2[idx] - V[idx] * V[idx];
			}
		}
	}
}
+
template< typename T >
void nse::w_variance(T* _RESTRICT w_variance,
	const T* _RESTRICT const W2, const T* _RESTRICT const W,
	const Grid3d< T >& grid)
{
	// Pointwise variance from accumulated fields: w_variance = W2 - W * W.
	// The k-range extends one extra layer (nz - gcz + 1) — presumably the
	// staggered w-face grid in z; confirm against the grid layout.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( w_variance ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( w_variance )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz + 1; k++, idx++)
			{
				w_variance[idx] = W2[idx] - W[idx] * W[idx];
			}
		}
	}
}
+
template< typename T >
void nse::c_variance(T* _RESTRICT c_variance,
	const T* _RESTRICT const C2, const T* _RESTRICT const C,
	const Grid3d< T >& grid)
{
	// Pointwise variance of the cell-centered scalar over the plain interior:
	// c_variance = C2 - C * C (no staggered extension in any direction).
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( c_variance ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( c_variance )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				c_variance[idx] = C2[idx] - C[idx] * C[idx];
			}
		}
	}
}
+// -------------------------------------------------------------------- //
+
+// * variance + deviation * //
template< typename T >
void nse::u_variance(T* _RESTRICT u_variance, T* _RESTRICT u_deviation,
	const T* _RESTRICT const U2, const T* _RESTRICT const U,
	const Grid3d< T >& grid)
{
	// As u_variance(variance-only), but also writes the standard deviation
	// sqrt(u_variance). NOTE(review): a slightly negative variance from
	// round-off would yield NaN here — no clamping is applied.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( u_variance, u_deviation ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx + 1; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( u_variance, u_deviation )
	for (i = grid.gcx; i < grid.nx - grid.gcx + 1; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				u_variance[idx] = U2[idx] - U[idx] * U[idx];
				u_deviation[idx] = sqrt(u_variance[idx]);
			}
		}
	}
}
+
template< typename T >
void nse::v_variance(T* _RESTRICT v_variance, T* _RESTRICT v_deviation,
	const T* _RESTRICT const V2, const T* _RESTRICT const V,
	const Grid3d< T >& grid)
{
	// As v_variance(variance-only), but also writes the standard deviation
	// sqrt(v_variance). NOTE(review): a slightly negative variance from
	// round-off would yield NaN here — no clamping is applied.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( v_variance, v_deviation ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy + 1; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( v_variance, v_deviation )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy + 1; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				v_variance[idx] = V2[idx] - V[idx] * V[idx];
				v_deviation[idx] = sqrt(v_variance[idx]);
			}
		}
	}
}
+
template< typename T >
void nse::w_variance(T* _RESTRICT w_variance, T* _RESTRICT w_deviation,
	const T* _RESTRICT const W2, const T* _RESTRICT const W,
	const Grid3d< T >& grid)
{
	// As w_variance(variance-only), but also writes the standard deviation
	// sqrt(w_variance). NOTE(review): a slightly negative variance from
	// round-off would yield NaN here — no clamping is applied.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( w_variance, w_deviation ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( w_variance, w_deviation )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz + 1; k++, idx++)
			{
				w_variance[idx] = W2[idx] - W[idx] * W[idx];
				w_deviation[idx] = sqrt(w_variance[idx]);
			}
		}
	}
}
+
template< typename T >
void nse::c_variance(T* _RESTRICT c_variance, T* _RESTRICT c_deviation,
	const T* _RESTRICT const C2, const T* _RESTRICT const C,
	const Grid3d< T >& grid)
{
	// As c_variance(variance-only), but also writes the standard deviation
	// sqrt(c_variance). NOTE(review): a slightly negative variance from
	// round-off would yield NaN here — no clamping is applied.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( c_variance, c_deviation ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( c_variance, c_deviation )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				c_variance[idx] = C2[idx] - C[idx] * C[idx];
				c_deviation[idx] = sqrt(c_variance[idx]);
			}
		}
	}
}
+// -------------------------------------------------------------------- //
+
+// * update * //
+// -------------------------------------------------------------------- //
template< typename T >
void nse::c_update(T* _RESTRICT X,
	const T alpha, const T* _RESTRICT const Y, const Grid3d< T >& grid)
{
	// AXPY-style update over the grid interior: X += alpha * Y.
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index for the incremental-index loop variant
#endif

	// compile-time loop layout: OpenMP collapse(2) over (i,j) with idx
	// recomputed per row, or incremental idx carried in shidx per row
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( X ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( X )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++)
			{
				X[idx] += alpha * Y[idx];
			}
		}
	}
}
+// -------------------------------------------------------------------- //
+
+// ------------------------------------------------------------------------ //
+// ------------------------------------------------------------------------ //
+
+// NOTE(review): everything below is explicit template instantiation
+// definitions — one float and one double variant per routine declared in
+// nse-generic3d.h. No executable logic lives here; this only forces the
+// compiler to emit both precision variants into this translation unit.
+
+// * initialize: time advancement * //
+template void nse::adams_bashforth_x2(float*  _RESTRICT X, float*  _RESTRICT Xp,
+	const Grid3d< float >& grid);
+template void nse::adams_bashforth_x2(double*  _RESTRICT X, double*  _RESTRICT Xp,
+	const Grid3d< double >& grid);
+
+template void nse::adams_bashforth_x2(float*  _RESTRICT X, float*  _RESTRICT Xp,
+	const Grid3d< float >& grid, const float dt, const float p_dt);
+template void nse::adams_bashforth_x2(double*  _RESTRICT X, double*  _RESTRICT Xp,
+	const Grid3d< double >& grid, const double dt, const double p_dt);
+
+template void nse::adams_bashforth_x2(float*  _RESTRICT X, float*  _RESTRICT Xp,
+	const float eps, const Grid3d< float >& grid);
+template void nse::adams_bashforth_x2(double*  _RESTRICT X, double*  _RESTRICT Xp,
+	const double eps, const Grid3d< double >& grid);
+
+template void nse::adams_bashforth_x3(float*  _RESTRICT X, float*  _RESTRICT Xp, float*  _RESTRICT Xpp,
+	const Grid3d< float >& grid);
+template void nse::adams_bashforth_x3(double*  _RESTRICT X, double*  _RESTRICT Xp, double*  _RESTRICT Xpp,
+	const Grid3d< double >& grid);
+
+template void nse::adams_bashforth_x3(float*  _RESTRICT X, float*  _RESTRICT Xp, float*  _RESTRICT Xpp,
+	const Grid3d< float >& grid,
+	const float dt, const float p_dt, const float pp_dt);
+template void nse::adams_bashforth_x3(double*  _RESTRICT X, double*  _RESTRICT Xp, double*  _RESTRICT Xpp,
+	const Grid3d< double >& grid,
+	const double dt, const double p_dt, const double pp_dt);
+
+template void nse::adams_bashforth_x2(float*  _RESTRICT X, float*  _RESTRICT Xp,
+	const int* _RESTRICT const mask, const Grid3d< float >& grid);
+template void nse::adams_bashforth_x2(double*  _RESTRICT X, double*  _RESTRICT Xp,
+	const int* _RESTRICT const mask, const Grid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: scalar min-max * //
+template void nse::scalar_min_max(float*  _RESTRICT fmin, float*  _RESTRICT fmax,
+	const float*  _RESTRICT const F, const Grid3d< float >& grid);
+template void nse::scalar_min_max(double*  _RESTRICT fmin, double*  _RESTRICT fmax,
+	const double*  _RESTRICT const F, const Grid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: velocity abs max * //
+template void nse::velocity_abs_max(float*  _RESTRICT umax, float*  _RESTRICT vmax, float*  _RESTRICT wmax,
+	const float*  _RESTRICT const U, const float*  _RESTRICT const V, const float*  _RESTRICT const W,
+	const Grid3d< float >& grid);
+template void nse::velocity_abs_max(double*  _RESTRICT umax, double*  _RESTRICT vmax, double*  _RESTRICT wmax,
+	const double*  _RESTRICT const U, const double*  _RESTRICT const V, const double*  _RESTRICT const W,
+	const Grid3d< double >& grid);
+
+template void nse::velocity_abs_max(float*  _RESTRICT umax, float*  _RESTRICT vmax, float*  _RESTRICT wmax,
+	const float*  _RESTRICT const U, const float*  _RESTRICT const V, const float*  _RESTRICT const W,
+	const int* _RESTRICT const mask, const Grid3d< float >& grid);
+template void nse::velocity_abs_max(double*  _RESTRICT umax, double*  _RESTRICT vmax, double*  _RESTRICT wmax,
+	const double*  _RESTRICT const U, const double*  _RESTRICT const V, const double*  _RESTRICT const W,
+	const int* _RESTRICT const mask, const Grid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: sub slice[x||y||z] from field * //
+template void nse::sub_slice(float* _RESTRICT Xdev,
+	const float* _RESTRICT const X, const float* _RESTRICT const Xavg,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template void nse::sub_slice(double* _RESTRICT Xdev,
+	const double* _RESTRICT const X, const double* _RESTRICT const Xavg,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * initialize: add random noise to field * //
+template void nse::add_disturbance(float* _RESTRICT X,
+	const float variance, const long int seed, const Grid3d< float >& grid);
+template void nse::add_disturbance(double* _RESTRICT X,
+	const double variance, const long int seed, const Grid3d< double >& grid);
+
+template void nse::add_disturbance(float* _RESTRICT X,
+	const float variance, const long int seed, 
+	const float zmin, const float zmax, const Grid3d< float >& grid);
+template void nse::add_disturbance(double* _RESTRICT X,
+	const double variance, const long int seed, 
+	const double zmin, const double zmax, const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * init layered field * //
+template void nse::c_init_layers(float* _RESTRICT X,
+	const float val_down, const float val_up,
+	const float z_layer, const float layer_eps, const Grid3d< float >& grid);
+template void nse::c_init_layers(double* _RESTRICT X,
+	const double val_down, const double val_up,
+	const double z_layer, const double layer_eps, const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * initialize: init linear field * //
+template void nse::c_init_linear(float* _RESTRICT X,
+	const float val_down, const float val_up,
+	const Grid3d< float >& grid);
+template void nse::c_init_linear(double* _RESTRICT X,
+	const double val_down, const double val_up,
+	const Grid3d< double >& grid);
+
+template void nse::c_init_linear(float* _RESTRICT X,
+	const float val_down, const float val_up,
+	const int* _RESTRICT const mask,
+	const Grid3d< float >& grid);
+template void nse::c_init_linear(double* _RESTRICT X,
+	const double val_down, const double val_up,
+	const int* _RESTRICT const mask,
+	const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * set linear profile for 3D field * //
+// -------------------------------------------------------------------- //
+template void nse::c_set_linear_profile_z_down(float* _RESTRICT X,
+	const float valueH, const float Xgrad, const Grid3d< float >& grid);
+template void nse::c_set_linear_profile_z_down(double* _RESTRICT X,
+	const double valueH, const double Xgrad, const Grid3d< double >& grid);
+
+template void nse::c_set_linear_profile_z_up(float* _RESTRICT X,
+	const float value0, const float Xgrad, const Grid3d< float >& grid);
+template void nse::c_set_linear_profile_z_up(double* _RESTRICT X,
+	const double value0, const double Xgrad, const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * set profile for 3D field * //
+template void nse::c_set_profile_z_down(float* _RESTRICT X,
+	const float* _RESTRICT const profile, const float* _RESTRICT const coord, const int np,
+	const float Xshift, const Grid3d< float >& grid);
+template void nse::c_set_profile_z_down(double* _RESTRICT X,
+	const double* _RESTRICT const profile, const double* _RESTRICT const coord, const int np,
+	const double Xshift, const Grid3d< double >& grid);
+
+template bool nse::c_set_profile_z_down(float* _RESTRICT X,
+	const std::string& filename,
+	const float Xshift, const Grid3d< float >& grid);
+template bool nse::c_set_profile_z_down(double* _RESTRICT X,
+	const std::string& filename,
+	const double Xshift, const Grid3d< double >& grid);
+
+template void nse::c_set_profile_z_up(float* _RESTRICT X,
+	const float* _RESTRICT const profile, const float* _RESTRICT const coord, const int np,
+	const float Xshift, const Grid3d< float >& grid);
+template void nse::c_set_profile_z_up(double* _RESTRICT X,
+	const double* _RESTRICT const profile, const double* _RESTRICT const coord, const int np,
+	const double Xshift, const Grid3d< double >& grid);
+
+template bool nse::c_set_profile_z_up(float* _RESTRICT X,
+	const std::string& filename,
+	const float Xshift, const Grid3d< float >& grid);
+template bool nse::c_set_profile_z_up(double* _RESTRICT X,
+	const std::string& filename,
+	const double Xshift, const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * plane distance * //
+template void nse::c_plane_dist(float* _RESTRICT dist,
+	const float pconst, const nse_const3d::axisType axis, const  Grid3d< float >& grid);
+template void nse::c_plane_dist(double* _RESTRICT dist,
+	const double pconst, const nse_const3d::axisType axis, const  Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * apply mask * //
+template void nse::apply_mask(float* _RESTRICT U, float* _RESTRICT V, float* _RESTRICT W,
+	const int* _RESTRICT const mask, const float Usolid, const float Vsolid, const float Wsolid,
+	const Grid3d< float >& grid);
+template void nse::apply_mask(double* _RESTRICT U, double* _RESTRICT V, double* _RESTRICT W,
+	const int* _RESTRICT const mask, const double Usolid, const double Vsolid, const double Wsolid,
+	const Grid3d< double >& grid);
+
+template void nse::apply_mask(float* _RESTRICT X,
+	const int* _RESTRICT const mask, const float Xsolid,
+	const Grid3d< float >& grid);
+template void nse::apply_mask(double* _RESTRICT X,
+	const int* _RESTRICT const mask, const double Xsolid,
+	const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * add 2D field * //
+template void nse::add_2d(float* _RESTRICT X,
+	const float* _RESTRICT const Xadd, const Grid3d< float >& grid);
+template void nse::add_2d(double* _RESTRICT X,
+	const double* _RESTRICT const Xadd, const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * -z damping layer * //
+template void nse::u_damping_z(float* _RESTRICT U_rhs, const float* _RESTRICT const U,
+	const float U_steady, const float f, 
+	const float z1, const float z2, const Grid3d< float >& grid);
+template void nse::u_damping_z(double* _RESTRICT U_rhs, const double* _RESTRICT const U,
+	const double U_steady, const double f, 
+	const double z1, const double z2, const Grid3d< double >& grid);
+
+template void nse::v_damping_z(float* _RESTRICT V_rhs, const float* _RESTRICT const V,
+	const float V_steady, const float f, 
+	const float z1, const float z2, const Grid3d< float >& grid);
+template void nse::v_damping_z(double* _RESTRICT V_rhs, const double* _RESTRICT const V,
+	const double V_steady, const double f, 
+	const double z1, const double z2, const Grid3d< double >& grid);
+
+template void nse::w_damping_z(float* _RESTRICT W_rhs, const float* _RESTRICT const W,
+	const float W_steady, const float f, 
+	const float z1, const float z2, const Grid3d< float >& grid);
+template void nse::w_damping_z(double* _RESTRICT W_rhs, const double* _RESTRICT const W,
+	const double W_steady, const double f, 
+	const double z1, const double z2, const Grid3d< double >& grid);
+
+template void nse::c_damping_z(float* _RESTRICT C_rhs, const float* _RESTRICT const C,
+	const float C_steady, const float f,
+	const float z1, const float z2, const Grid3d< float >& grid);
+template void nse::c_damping_z(double* _RESTRICT C_rhs, const double* _RESTRICT const C,
+	const double C_steady, const double f,
+	const double z1, const double z2, const Grid3d< double >& grid);
+
+template void nse::c_damping_z(float* _RESTRICT C_rhs, const float* _RESTRICT const C,
+	const float* _RESTRICT const C_steady, const float f,
+	const float z1, const float z2, const Grid3d< float >& grid);
+template void nse::c_damping_z(double* _RESTRICT C_rhs, const double* _RESTRICT const C,
+	const double* _RESTRICT const C_steady, const double f,
+	const double z1, const double z2, const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * initialize: count * //
+template int nse::get_num(const int* _RESTRICT const value, 
+	const int a, const Grid3d< float >& grid);
+template int nse::get_num(const int* _RESTRICT const value,
+	const int a, const Grid3d< double >& grid);
+
+template void nse::get_num(int* _RESTRICT a_num, int* _RESTRICT b_num,
+	const int* _RESTRICT const value, const int a, const int b, const Grid3d< float >& grid);
+template void nse::get_num(int* _RESTRICT a_num, int* _RESTRICT b_num,
+	const int* _RESTRICT const value, const int a, const int b, const Grid3d< double >& grid);
+
+template float nse::get_fraction(const int* _RESTRICT const value, const int a, const Grid3d< float >& grid);
+template double nse::get_fraction(const int* _RESTRICT const value, const int a, const Grid3d< double >& grid);
+
+template void nse::get_fraction(float* _RESTRICT a_frac, float* _RESTRICT b_frac,
+	const int* _RESTRICT const value, const int a, const int b, const Grid3d< float >& grid);
+template void nse::get_fraction(double* _RESTRICT a_frac, double* _RESTRICT b_frac,
+	const int* _RESTRICT const value, const int a, const int b, const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * initialize: variance * //
+template void nse::u_variance(float* _RESTRICT u_variance,
+	const float* _RESTRICT const U2, const float* _RESTRICT const U,
+	const Grid3d< float >& grid);
+template void nse::u_variance(double* _RESTRICT u_variance,
+	const double* _RESTRICT const U2, const double* _RESTRICT const U,
+	const Grid3d< double >& grid);
+
+template void nse::v_variance(float* _RESTRICT v_variance,
+	const float* _RESTRICT const V2, const float* _RESTRICT const V,
+	const Grid3d< float >& grid);
+template void nse::v_variance(double* _RESTRICT v_variance,
+	const double* _RESTRICT const V2, const double* _RESTRICT const V,
+	const Grid3d< double >& grid);
+
+template void nse::w_variance(float* _RESTRICT w_variance,
+	const float* _RESTRICT const W2, const float* _RESTRICT const W,
+	const Grid3d< float >& grid);
+template void nse::w_variance(double* _RESTRICT w_variance,
+	const double* _RESTRICT const W2, const double* _RESTRICT const W,
+	const Grid3d< double >& grid);
+
+template void nse::c_variance(float* _RESTRICT c_variance,
+	const float* _RESTRICT const C2, const float* _RESTRICT const C,
+	const Grid3d< float >& grid);
+template void nse::c_variance(double* _RESTRICT c_variance,
+	const double* _RESTRICT const C2, const double* _RESTRICT const C,
+	const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * initialize: variance + deviation * //
+template void nse::u_variance(float* _RESTRICT u_variance, float* _RESTRICT u_deviation,
+	const float* _RESTRICT const U2, const float* _RESTRICT const U,
+	const Grid3d< float >& grid);
+template void nse::u_variance(double* _RESTRICT u_variance, double* _RESTRICT u_deviation,
+	const double* _RESTRICT const U2, const double* _RESTRICT const U,
+	const Grid3d< double >& grid);
+
+template void nse::v_variance(float* _RESTRICT v_variance, float* _RESTRICT v_deviation,
+	const float* _RESTRICT const V2, const float* _RESTRICT const V,
+	const Grid3d< float >& grid);
+template void nse::v_variance(double* _RESTRICT v_variance, double* _RESTRICT v_deviation,
+	const double* _RESTRICT const V2, const double* _RESTRICT const V,
+	const Grid3d< double >& grid);
+
+template void nse::w_variance(float* _RESTRICT w_variance, float* _RESTRICT w_deviation,
+	const float* _RESTRICT const W2, const float* _RESTRICT const W,
+	const Grid3d< float >& grid);
+template void nse::w_variance(double* _RESTRICT w_variance, double* _RESTRICT w_deviation,
+	const double* _RESTRICT const W2, const double* _RESTRICT const W,
+	const Grid3d< double >& grid);
+
+template void nse::c_variance(float* _RESTRICT c_variance, float* _RESTRICT c_deviation,
+	const float* _RESTRICT const C2, const float* _RESTRICT const C,
+	const Grid3d< float >& grid);
+template void nse::c_variance(double* _RESTRICT c_variance, double* _RESTRICT c_deviation,
+	const double* _RESTRICT const C2, const double* _RESTRICT const C,
+	const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
+
+// * initialize: update * //
+template void nse::c_update(float* _RESTRICT X,
+	const float alpha, const float* _RESTRICT const Y, const Grid3d< float >& grid);
+template void nse::c_update(double* _RESTRICT X,
+	const double alpha, const double* _RESTRICT const Y, const Grid3d< double >& grid);
+// -------------------------------------------------------------------- //
diff --git a/nse-generic3d.h b/nse-generic3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c20e9c161f9648bc26ddeeb43b7fef8bcb46e35
--- /dev/null
+++ b/nse-generic3d.h
@@ -0,0 +1,234 @@
+#pragma once
+
+// [nse-generic3d.h(cpp)]: 3D Navier-Stokes module (generic)
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "grid3d.h"
+
+#include <string>
+
+
+namespace nse
+{
+	// NOTE(review): declarations only; definitions and the float/double
+	// explicit instantiations live in the corresponding .cpp of this module.
+
+	// * adams-bashforth time advancement * //
+	// Xp/Xpp carry previous-step data; overloads taking dt/p_dt/pp_dt
+	// presumably support a variable time step — confirm against definitions.
+	template< typename T >
+	void adams_bashforth_x2(T* _RESTRICT X, T* _RESTRICT Xp,
+		const Grid3d< T >& grid);
+	template< typename T >
+	void adams_bashforth_x2(T* _RESTRICT X, T* _RESTRICT Xp,
+		const Grid3d< T >& grid, const T dt, const T p_dt);
+	template< typename T >
+	void adams_bashforth_x2(T* _RESTRICT X, T* _RESTRICT Xp,
+		const T eps, const Grid3d< T >& grid);
+	template< typename T >
+	void adams_bashforth_x3(T* _RESTRICT X, T* _RESTRICT Xp, T* _RESTRICT Xpp,
+		const Grid3d< T >& grid);
+	template< typename T >
+	void adams_bashforth_x3(T* _RESTRICT X, T* _RESTRICT Xp, T* _RESTRICT Xpp,
+		const Grid3d< T >& grid, const T dt, const T p_dt, const T pp_dt);
+
+	// masked variant: cells are selected by the int mask array
+	template< typename T >
+	void adams_bashforth_x2(T* _RESTRICT X, T* _RESTRICT Xp,
+		const int* _RESTRICT const mask,
+		const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * scalar min-max * //
+	template< typename T >
+	void scalar_min_max(T* _RESTRICT fmin, T* _RESTRICT fmax,
+		const T* _RESTRICT const F, const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * velocity abs max * //
+	template< typename T >
+	void velocity_abs_max(T* _RESTRICT umax, T* _RESTRICT vmax, T* _RESTRICT wmax,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const Grid3d< T >& grid);
+
+	template< typename T >
+	void velocity_abs_max(T* _RESTRICT umax, T* _RESTRICT vmax, T* _RESTRICT wmax,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const int* _RESTRICT const mask,
+		const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * sub slice[x||y||z] from field * //
+	template< typename T >
+	void sub_slice(T* _RESTRICT Xdev,
+		const T* _RESTRICT const X, const T* _RESTRICT const Xavg,
+		const nse_const3d::axisType axis, const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * add random disturbance * //
+	// deterministic for a fixed seed; zmin/zmax overload restricts the
+	// disturbed region in -z (presumably inclusive bounds — confirm).
+	template< typename T >
+	void add_disturbance(T* _RESTRICT X, const T variance, const long int seed,
+		const Grid3d< T >& grid);
+
+	template< typename T >
+	void add_disturbance(T* _RESTRICT X, const T variance, const long int seed,
+		const T zmin, const T zmax, const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * init layered field * //
+	template< typename T >
+	void c_init_layers(T* _RESTRICT X, const T val_down, const T val_up,
+		const T z_layer, const T layer_eps, const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * init linear field * //
+	template< typename T >
+	void c_init_linear(T* _RESTRICT X, const T val_down, const T val_up,
+		const Grid3d< T >& grid);
+
+	template< typename T >
+	void c_init_linear(T* _RESTRICT X, const T val_down, const T val_up,
+		const int* _RESTRICT const mask,
+		const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * set linear profile for 3D field * //
+	//		: X = valueH - d * Xgrad, d = (z0 + H - z)
+	template< typename T >
+	void c_set_linear_profile_z_down(T* _RESTRICT X,
+		const T valueH, const T Xgrad, const Grid3d< T >& grid);
+
+	//		: X = value0 + z * Xgrad
+	template< typename T >
+	void c_set_linear_profile_z_up(T* _RESTRICT X,
+		const T value0, const T Xgrad, const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * set profile for 3D field * //
+	// array overloads take the profile samples and coordinates directly;
+	// std::string overloads read them from a file and return false on failure
+	// (presumably — confirm against the definitions).
+	//		: X = profile(d), d = (z0 + H - z)
+	template< typename T >
+	void c_set_profile_z_down(T* _RESTRICT X,
+		const T* _RESTRICT const profile, const T* _RESTRICT const coord, const int np, 
+		const T Xshift, const Grid3d< T >& grid);
+	template< typename T >
+	bool c_set_profile_z_down(T* _RESTRICT X,
+		const std::string& filename, 
+		const T Xshift, const Grid3d< T >& grid);
+
+	//		: X = profile(z)
+	template< typename T >
+	void c_set_profile_z_up(T* _RESTRICT X,
+		const T* _RESTRICT const profile, const T* _RESTRICT const coord, const int np, 
+		const T Xshift, const Grid3d< T >& grid);
+	template< typename T >
+	bool c_set_profile_z_up(T* _RESTRICT X,
+		const std::string& filename, 
+		const T Xshift, const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * plane distance [axis = XY,XZ,YZ] * //
+	// -------------------------------------------------------------------- //
+	template< typename T >
+	void c_plane_dist(T* _RESTRICT dist, const T pconst,
+		const nse_const3d::axisType axis, const  Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * apply mask * //
+	// writes the *solid values into cells flagged by mask (flag convention
+	// not visible here — confirm against definitions)
+	template< typename T >
+	void apply_mask(T* _RESTRICT U, T* _RESTRICT V, T* _RESTRICT W,
+		const int* _RESTRICT const mask, const T Usolid, const T Vsolid, const T Wsolid,
+		const Grid3d< T >& grid);
+	template< typename T >
+	void apply_mask(T* _RESTRICT X,
+		const int* _RESTRICT const mask, const T Xsolid,
+		const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+
+	// * add 2D field * //
+	template< typename T >
+	void add_2d(T* _RESTRICT X, const T* _RESTRICT const Xadd, const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * -z damping layer * //
+	template< typename T >
+	void u_damping_z(T* _RESTRICT U_rhs, const T* _RESTRICT const U,
+		const T U_steady, const T f, 
+		const T z1, const T z2, const Grid3d< T >& grid);
+	template< typename T >
+	void v_damping_z(T* _RESTRICT V_rhs, const T* _RESTRICT const V,
+		const T V_steady, const T f, 
+		const T z1, const T z2, const Grid3d< T >& grid);
+	template< typename T >
+	void w_damping_z(T* _RESTRICT W_rhs, const T* _RESTRICT const W,
+		const T W_steady, const T f, 
+		const T z1, const T z2, const Grid3d< T >& grid);
+
+	// second c_damping_z overload takes a steady-state *profile* (array)
+	// instead of a single constant
+	template< typename T >
+	void c_damping_z(T* _RESTRICT C_rhs, const T* _RESTRICT const C,
+		const T C_steady, const T f,
+		const T z1, const T z2, const Grid3d< T >& grid);
+	template< typename T >
+	void c_damping_z(T* _RESTRICT C_rhs, const T* _RESTRICT const C,
+		const T* _RESTRICT const C_steady, const T f,
+		const T z1, const T z2, const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * count * //
+	template< typename T >
+	int get_num(const int* _RESTRICT const value, const int a, const Grid3d< T >& grid);
+	template< typename T >
+	void get_num(int* _RESTRICT a_num, int* _RESTRICT b_num,
+		const int* _RESTRICT const value, const int a, const int b, const Grid3d< T >& grid);
+
+	template< typename T >
+	T get_fraction(const int* _RESTRICT const value, const int a, const Grid3d< T >& grid);
+	template< typename T >
+	void get_fraction(T* _RESTRICT a_frac, T* _RESTRICT b_frac,
+		const int* _RESTRICT const value, const int a, const int b, const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * variance * //
+	template< typename T >
+	void u_variance(T* _RESTRICT u_variance,
+		const T* _RESTRICT const U2, const T* _RESTRICT const U,
+		const Grid3d< T >& grid);
+	template< typename T >
+	void v_variance(T* _RESTRICT v_variance,
+		const T* _RESTRICT const V2, const T* _RESTRICT const V,
+		const Grid3d< T >& grid);
+	template< typename T >
+	void w_variance(T* _RESTRICT w_variance,
+		const T* _RESTRICT const W2, const T* _RESTRICT const W,
+		const Grid3d< T >& grid);
+
+	template< typename T >
+	void c_variance(T* _RESTRICT c_variance,
+		const T* _RESTRICT const C2, const T* _RESTRICT const C,
+		const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * variance + deviation * //
+	// for the scalar variant: variance = C2 - C*C, deviation = sqrt(variance)
+	template< typename T >
+	void u_variance(T* _RESTRICT u_variance, T* _RESTRICT u_deviation,
+		const T* _RESTRICT const U2, const T* _RESTRICT const U,
+		const Grid3d< T >& grid);
+	template< typename T >
+	void v_variance(T* _RESTRICT v_variance, T* _RESTRICT v_deviation,
+		const T* _RESTRICT const V2, const T* _RESTRICT const V,
+		const Grid3d< T >& grid);
+	template< typename T >
+	void w_variance(T* _RESTRICT w_variance, T* _RESTRICT w_deviation,
+		const T* _RESTRICT const W2, const T* _RESTRICT const W,
+		const Grid3d< T >& grid);
+
+	template< typename T >
+	void c_variance(T* _RESTRICT c_variance, T* _RESTRICT c_deviation,
+		const T* _RESTRICT const C2, const T* _RESTRICT const C,
+		const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * update * //
+	// c_update(): X += alpha * Y over the grid interior
+	template< typename T >
+	void c_update(T* _RESTRICT X, 
+		const T alpha, const T* _RESTRICT const Y, const Grid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+}
diff --git a/nse-io3d.cpp b/nse-io3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..812d6530aa5212df1e997d36697f87690045d476
--- /dev/null
+++ b/nse-io3d.cpp
@@ -0,0 +1,3225 @@
+#define _CRT_SECURE_NO_DEPRECATE
+#include "nse-io3d.h"
+#include "str-com.h"
+
+#include "io-base1d.h"
+#include "io-base2d.h"
+#include "io-base3d.h"
+
+#include "vecmath.h"
+
+using namespace nse::nse_const3d;
+
+// * write tecplot-1d|2d|3d output, axis = n-dims * //
+template< nse::memType mem, typename T >
+bool nse::write_tecplot(const std::string& filename,
+	const T* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Dispatch on the dimensionality of the requested axis: single axes go
+	// to the 1d writer, plane axes to the 2d writer, and everything else
+	// (the full 3D case) to the 3d writer.
+	switch (axis) {
+	case nse_const3d::axisX:
+	case nse_const3d::axisY:
+	case nse_const3d::axisZ:
+		return write_tecplot_1d(filename, xin, name, axis, node, grid, time);
+	case nse_const3d::axisXY:
+	case nse_const3d::axisXZ:
+	case nse_const3d::axisYZ:
+		return write_tecplot_2d(filename, xin, name, axis, node, grid, time);
+	default:
+		return write_tecplot_3d(filename, xin, name, node, grid, time);
+	}
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot(const std::string& filename, const int idx,
+	const T* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Indexed overload: stamp idx into the file name, then forward.
+	const std::string fname = append_index(filename, idx);
+	return write_tecplot(fname, xin, name, axis, node, grid, time);
+}
+// -------------------------------------------------------------------- //
+
+// * write tecplot-3d output, F(x,y,z) * //
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_3d(const std::string& filename,
+	const T* xin, const char* name,
+	const nse_const3d::nodeType node, const Grid3d< T, mem >& grid, const T time)
+{
+	// Gather the full 3D scalar field on rank 0 and write it as a tecplot
+	// file. Collective call: every rank participates in the gathers and in
+	// the final status broadcast; only rank 0 allocates buffers and writes.
+	T *out = NULL, *cx = NULL, *cy = NULL, *cz = NULL;
+	if (grid.mpi_com.rank == 0) {
+		allocate_vnull(&out, grid.mpi_size);
+		allocate_vnull(&cx, &cy, &cz, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
+	}
+
+	// gather field data and per-axis coordinates on rank 0; for each axis:
+	// cell-center coordinates unless the node is staggered along that axis,
+	// in which case cell-edge coordinates are used
+	grid.template mpi_gather<memCPU, mem>(out, xin, 0, axisXYZ);
+	if ((node == nodeC) || (node == nodeV) || (node == nodeW) || (node == nodeVW))
+		grid.template mpi_gather_center_coord<memCPU>(cx, 0, axisX);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(cx, 0, axisX);
+	if ((node == nodeC) || (node == nodeU) || (node == nodeW) || (node == nodeUW))
+		grid.template mpi_gather_center_coord<memCPU>(cy, 0, axisY);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(cy, 0, axisY);
+	if ((node == nodeC) || (node == nodeU) || (node == nodeV) || (node == nodeUV))
+		grid.template mpi_gather_center_coord<memCPU>(cz, 0, axisZ);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(cz, 0, axisZ);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		// staggered axes with a ghost layer include one extra output plane
+		const int shx = (((node == nodeU) || (node == nodeUV) ||
+			(node == nodeUW) || (node == nodeUVW)) && (grid.gcx > 0)) ? 1 : 0;
+		const int shy = (((node == nodeV) || (node == nodeUV) ||
+			(node == nodeVW) || (node == nodeUVW)) && (grid.gcy > 0)) ? 1 : 0;
+		const int shz = (((node == nodeW) || (node == nodeUW) ||
+			(node == nodeVW) || (node == nodeUVW)) && (grid.gcz > 0)) ? 1 : 0;
+
+		status = write_tecplot(filename, out, cx, cy, cz,
+			grid.mpi_nx, grid.mpi_ny, grid.mpi_nz,
+			grid.gcx, grid.mpi_nx - grid.gcx + shx - 1,
+			grid.gcy, grid.mpi_ny - grid.gcy + shy - 1,
+			grid.gcz, grid.mpi_nz - grid.gcz + shz - 1,
+
+			"nse-3D", name, axis_name(axisX), axis_name(axisY), axis_name(axisZ), time);
+
+		deallocate(out);
+		deallocate(cx, cy, cz);
+	}
+	// broadcast write status so every rank returns the same result
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_3d(const std::string& filename,
+	const T* uin, const T* vin, const T* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Gather a full 3D vector field (U,V,W) on rank 0 and write it into a
+	// single tecplot file using cell-center coordinates. Collective call:
+	// every rank joins the gathers and the status broadcast; only rank 0
+	// allocates buffers and writes.
+	T *uout = NULL, *vout = NULL, *wout = NULL,
+		*cx = NULL, *cy = NULL, *cz = NULL;
+	if (grid.mpi_com.rank == 0) {
+		allocate_vnull(&uout, &vout, &wout, grid.mpi_size);
+		allocate_vnull(&cx, &cy, &cz, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
+	}
+
+	// gather the three components and cell-center coordinates on rank 0
+	grid.template mpi_gather<memCPU, mem>(uout, uin, 0, axisXYZ);
+	grid.template mpi_gather<memCPU, mem>(vout, vin, 0, axisXYZ);
+	grid.template mpi_gather<memCPU, mem>(wout, win, 0, axisXYZ);
+	grid.template mpi_gather_center_coord<memCPU>(cx, cy, cz, 0);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		status = write_tecplot(filename, uout, vout, wout, cx, cy, cz,
+			grid.mpi_nx, grid.mpi_ny, grid.mpi_nz,
+			grid.gcx, grid.mpi_nx - grid.gcx - 1,
+			grid.gcy, grid.mpi_ny - grid.gcy - 1,
+			grid.gcz, grid.mpi_nz - grid.gcz - 1,
+
+			"nse-3D", uname, vname, wname,
+			axis_name(axisX), axis_name(axisY), axis_name(axisZ), time);
+
+		deallocate(uout, vout, wout);
+		deallocate(cx, cy, cz);
+	}
+	// broadcast write status so every rank returns the same result
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const T* xin, const char* name,
+	const nse_const3d::nodeType node,
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Indexed overload: stamp idx into the file name, then forward.
+	const std::string fname = append_index(filename, idx);
+	return write_tecplot_3d(fname, xin, name, node, grid, time);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const T* uin, const T* vin, const T* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Indexed overload: stamp idx into the file name, then forward.
+	const std::string fname = append_index(filename, idx);
+	return write_tecplot_3d(fname, uin, vin, win,
+		uname, vname, wname, grid, time);
+}
+// -------------------------------------------------------------------- //
+
+// * write tecplot-3d output [sub-domain], F(x,y,z) * //
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_3d(const std::string& filename,
+	const T* xin, const char* name,
+	const T xmin, const T xmax,
+	const T ymin, const T ymax,
+	const T zmin, const T zmax,
+	const nse_const3d::nodeType node,
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Write a scalar field restricted to the sub-domain
+	// [xmin,xmax]x[ymin,ymax]x[zmin,zmax] as tecplot-3d output.
+	// Collective call: all ranks join the gathers and the status broadcast;
+	// only rank 0 writes. Returns false if the requested sub-domain cannot
+	// be located on the grid.
+
+	// locate global index bounds of the sub-domain (-1 = not found)
+	int imin = grid.mpi_locate_x(xmin), imax = grid.mpi_locate_x(xmax),
+		jmin = grid.mpi_locate_y(ymin), jmax = grid.mpi_locate_y(ymax),
+		kmin = grid.mpi_locate_z(zmin), kmax = grid.mpi_locate_z(zmax);
+
+	// clamp bounds that lie outside the physical domain to the domain edge
+	if ((imin == -1) && (xmin <= grid.mpi_x)) imin = grid.gcx;
+	if ((jmin == -1) && (ymin <= grid.mpi_y)) jmin = grid.gcy;
+	if ((kmin == -1) && (zmin <= grid.mpi_z)) kmin = grid.gcz;
+	if ((imax == -1) && (xmax >= grid.mpi_x + grid.mpi_length))
+		imax = grid.mpi_nx - grid.gcx - 1;
+	if ((jmax == -1) && (ymax >= grid.mpi_y + grid.mpi_width))
+		jmax = grid.mpi_ny - grid.gcy - 1;
+	if ((kmax == -1) && (zmax >= grid.mpi_z + grid.mpi_height))
+		kmax = grid.mpi_nz - grid.gcz - 1;
+
+	if ((imin == -1) || (imax == -1) ||
+		(jmin == -1) || (jmax == -1) ||
+		(kmin == -1) || (kmax == -1) ||
+		(imin > imax) || (jmin > jmax) || (kmin > kmax))
+	{
+		return false;
+	}
+
+	T *out = NULL, *cx = NULL, *cy = NULL, *cz = NULL;
+	if (grid.mpi_com.rank == 0) {
+		allocate_vnull(&out, grid.mpi_size);
+		allocate_vnull(&cx, &cy, &cz, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
+	}
+
+	// gather field and per-axis coordinates on rank 0; cell centers unless
+	// the node is staggered along the axis, then cell edges
+	grid.template mpi_gather<memCPU, mem>(out, xin, 0, axisXYZ);
+	if ((node == nodeC) || (node == nodeV) || (node == nodeW) || (node == nodeVW))
+		grid.template mpi_gather_center_coord<memCPU>(cx, 0, axisX);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(cx, 0, axisX);
+	if ((node == nodeC) || (node == nodeU) || (node == nodeW) || (node == nodeUW))
+		grid.template mpi_gather_center_coord<memCPU>(cy, 0, axisY);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(cy, 0, axisY);
+	if ((node == nodeC) || (node == nodeU) || (node == nodeV) || (node == nodeUV))
+		grid.template mpi_gather_center_coord<memCPU>(cz, 0, axisZ);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(cz, 0, axisZ);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		// staggered axes with a ghost layer include one extra output plane
+		const int shx = (((node == nodeU) || (node == nodeUV) ||
+			(node == nodeUW) || (node == nodeUVW)) && (grid.gcx > 0)) ? 1 : 0;
+		const int shy = (((node == nodeV) || (node == nodeUV) ||
+			(node == nodeVW) || (node == nodeUVW)) && (grid.gcy > 0)) ? 1 : 0;
+		const int shz = (((node == nodeW) || (node == nodeUW) ||
+			(node == nodeVW) || (node == nodeUVW)) && (grid.gcz > 0)) ? 1 : 0;
+
+		status = write_tecplot(filename, out, cx, cy, cz,
+			grid.mpi_nx, grid.mpi_ny, grid.mpi_nz,
+			imin, imax + shx,
+			jmin, jmax + shy,
+			kmin, kmax + shz,
+
+			"nse-3D", name, axis_name(axisX), axis_name(axisY), axis_name(axisZ), time);
+
+		deallocate(out);
+		deallocate(cx, cy, cz);
+	}
+	// broadcast write status so every rank returns the same result
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_3d(const std::string& filename,
+	const T* uin, const T* vin, const T* win,
+	const char* uname, const char* vname, const char* wname,
+	const T xmin, const T xmax,
+	const T ymin, const T ymax,
+	const T zmin, const T zmax,
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Write a vector field (U,V,W) restricted to the sub-domain
+	// [xmin,xmax]x[ymin,ymax]x[zmin,zmax] as tecplot-3d output.
+	// Collective call: all ranks join the gathers and the status broadcast;
+	// only rank 0 writes. Returns false if the requested sub-domain cannot
+	// be located on the grid.
+
+	// locate global index bounds of the sub-domain (-1 = not found)
+	int imin = grid.mpi_locate_x(xmin), imax = grid.mpi_locate_x(xmax),
+		jmin = grid.mpi_locate_y(ymin), jmax = grid.mpi_locate_y(ymax),
+		kmin = grid.mpi_locate_z(zmin), kmax = grid.mpi_locate_z(zmax);
+
+	// clamp bounds that lie outside the physical domain to the domain edge
+	if ((imin == -1) && (xmin <= grid.mpi_x)) imin = grid.gcx;
+	if ((jmin == -1) && (ymin <= grid.mpi_y)) jmin = grid.gcy;
+	// fix: compare the z coordinate (zmin), not the index (kmin), against
+	// the domain origin -- mirrors the scalar sub-domain overload
+	if ((kmin == -1) && (zmin <= grid.mpi_z)) kmin = grid.gcz;
+	if ((imax == -1) && (xmax >= grid.mpi_x + grid.mpi_length))
+		imax = grid.mpi_nx - grid.gcx - 1;
+	if ((jmax == -1) && (ymax >= grid.mpi_y + grid.mpi_width))
+		jmax = grid.mpi_ny - grid.gcy - 1;
+	if ((kmax == -1) && (zmax >= grid.mpi_z + grid.mpi_height))
+		kmax = grid.mpi_nz - grid.gcz - 1;
+
+	if ((imin == -1) || (imax == -1) ||
+		(jmin == -1) || (jmax == -1) ||
+		(kmin == -1) || (kmax == -1) ||
+		(imin > imax) || (jmin > jmax) || (kmin > kmax))
+	{
+		return false;
+	}
+
+	T *uout = NULL, *vout = NULL, *wout = NULL,
+		*cx = NULL, *cy = NULL, *cz = NULL;
+	if (grid.mpi_com.rank == 0) 
+	{
+		allocate_vnull(&uout, &vout, &wout, grid.mpi_size);
+		allocate_vnull(&cx, &cy, &cz, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
+	}
+
+	// gather the three components and cell-center coordinates on rank 0
+	grid.template mpi_gather<memCPU, mem>(uout, uin, 0, axisXYZ);
+	grid.template mpi_gather<memCPU, mem>(vout, vin, 0, axisXYZ);
+	grid.template mpi_gather<memCPU, mem>(wout, win, 0, axisXYZ);
+	grid.template mpi_gather_center_coord<memCPU>(cx, cy, cz, 0);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		status = write_tecplot(filename, uout, vout, wout, cx, cy, cz,
+			grid.mpi_nx, grid.mpi_ny, grid.mpi_nz,
+			imin, imax,
+			jmin, jmax,
+			kmin, kmax,
+
+			"nse-3D", uname, vname, wname,
+			axis_name(axisX), axis_name(axisY), axis_name(axisZ), time);
+
+		deallocate(uout, vout, wout);
+		deallocate(cx, cy, cz);
+	}
+
+	// broadcast write status so every rank returns the same result
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const T* xin, const char* name,
+	const T xmin, const T xmax,
+	const T ymin, const T ymax,
+	const T zmin, const T zmax,
+	const nse_const3d::nodeType node, const Grid3d< T, mem >& grid, const T time)
+{
+	// Indexed sub-domain overload: stamp idx into the file name and forward.
+	const std::string fname = append_index(filename, idx);
+	return write_tecplot_3d(fname, xin, name,
+		xmin, xmax, ymin, ymax, zmin, zmax, node, grid, time);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const T* uin, const T* vin, const T* win,
+	const char* uname, const char* vname, const char* wname,
+	const T xmin, const T xmax,
+	const T ymin, const T ymax,
+	const T zmin, const T zmax,
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Indexed sub-domain overload: stamp idx into the file name and forward.
+	const std::string fname = append_index(filename, idx);
+	return write_tecplot_3d(fname, uin, vin, win, uname, vname, wname,
+		xmin, xmax, ymin, ymax, zmin, zmax, grid, time);
+}
+// -------------------------------------------------------------------- //
+
+// * write tecplot-2d output, F(x,y) * //
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_2d(const std::string& filename,
+	const T* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[XY,XZ,YZ], node=[C,(U|V,U|W,V|W)]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Gather a 2D field on rank 0 and write it as tecplot output.
+	// Collective call: all ranks join the gathers and the status broadcast;
+	// only rank 0 writes. Returns false for an invalid axis/node pair.
+	nse_const3d::axisType axisA, axisB;
+	nse_const3d::nodeType nodeStagA, nodeStagB, nodeEdge;
+	if (axis == axisXY) {
+		axisA = axisX; axisB = axisY;
+		nodeStagA = nodeU; nodeStagB = nodeV; nodeEdge = nodeUV;
+	}
+	else if (axis == axisXZ) {
+		axisA = axisX; axisB = axisZ;
+		nodeStagA = nodeU; nodeStagB = nodeW; nodeEdge = nodeUW;
+	}
+	else if (axis == axisYZ) {
+		axisA = axisY; axisB = axisZ;
+		nodeStagA = nodeV; nodeStagB = nodeW; nodeEdge = nodeVW;
+	}
+	else
+		return false;	// fix: a non-plane axis previously left the locals above uninitialized (UB)
+
+	if ((node != nodeC) &&
+		(node != nodeStagA) && (node != nodeStagB) && (node != nodeEdge))
+	{
+		return false;
+	}
+
+	int ndimx = grid.mpi_dim_size(axisA),
+		ndimy = grid.mpi_dim_size(axisB);
+	int ngx = grid.ghost_region_size(axisA),
+		ngy = grid.ghost_region_size(axisB);
+
+	T *out = NULL, *cx = NULL, *cy = NULL;
+	if (grid.mpi_com.rank == 0) {
+		allocate_vnull(&out, ndimx * ndimy);
+		allocate_vnull(&cx, &cy, ndimx, ndimy);
+	}
+
+	// gather field and per-axis coordinates on rank 0; cell centers unless
+	// the node is staggered along the axis, then cell edges
+	grid.template mpi_gather<memCPU, mem>(out, xin, 0, axis);
+	if ((node == nodeC) || (node == nodeStagB))
+		grid.template mpi_gather_center_coord<memCPU>(cx, 0, axisA);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(cx, 0, axisA);
+	if ((node == nodeC) || (node == nodeStagA))
+		grid.template mpi_gather_center_coord<memCPU>(cy, 0, axisB);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(cy, 0, axisB);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		// staggered axes with a ghost layer include one extra output line
+		const int shx = (((node == nodeStagA) || (node == nodeEdge)) && (ngx > 0)) ? 1 : 0;
+		const int shy = (((node == nodeStagB) || (node == nodeEdge)) && (ngy > 0)) ? 1 : 0;
+
+		status = write_tecplot(filename, out, cx, cy,
+			ndimx, ndimy,
+			ngx, ndimx - ngx + shx - 1,
+			ngy, ndimy - ngy + shy - 1,
+
+			"nse-3D", name, axis_name(axisA), axis_name(axisB), time);
+
+		deallocate(out); deallocate(cx, cy);
+	}
+	// broadcast write status so every rank returns the same result
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_2d(const std::string& filename,
+	T** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[XY,XZ,YZ], node=[C,(U|V,U|W,V|W)]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Gather nvar 2D fields on rank 0 and write them into one tecplot
+	// file. Collective call: all ranks join the gathers and the status
+	// broadcast; only rank 0 writes. Returns false for an invalid
+	// axis/node pair or non-positive nvar.
+	nse_const3d::axisType axisA, axisB;
+	nse_const3d::nodeType nodeStagA, nodeStagB, nodeEdge;
+	if (axis == axisXY) {
+		axisA = axisX; axisB = axisY;
+		nodeStagA = nodeU; nodeStagB = nodeV; nodeEdge = nodeUV;
+	}
+	else if (axis == axisXZ) {
+		axisA = axisX; axisB = axisZ;
+		nodeStagA = nodeU; nodeStagB = nodeW; nodeEdge = nodeUW;
+	}
+	else if (axis == axisYZ) {
+		axisA = axisY; axisB = axisZ;
+		nodeStagA = nodeV; nodeStagB = nodeW; nodeEdge = nodeVW;
+	}
+	else
+		return false;	// fix: a non-plane axis previously left the locals above uninitialized (UB)
+
+	if ((nvar <= 0) ||	// fix: guard nvar, consistent with the 1d multi-variable overloads
+		((node != nodeC) &&
+		(node != nodeStagA) && (node != nodeStagB) && (node != nodeEdge)))
+	{
+		return false;
+	}
+
+	int ndimx = grid.mpi_dim_size(axisA),
+		ndimy = grid.mpi_dim_size(axisB);
+	int ngx = grid.ghost_region_size(axisA),
+		ngy = grid.ghost_region_size(axisB);
+
+	T **out, *cx = NULL, *cy = NULL;
+	out = new T*[nvar]();	// fix: value-init so non-root ranks pass null, not indeterminate pointers
+	if (grid.mpi_com.rank == 0) {
+		for (int k = 0; k < nvar; k++)
+			allocate_vnull(&out[k], ndimx * ndimy);
+		allocate_vnull(&cx, &cy, ndimx, ndimy);
+	}
+
+	for (int k = 0; k < nvar; k++)
+		grid.template mpi_gather<memCPU, mem>(out[k], xin[k], 0, axis);
+
+	// per-axis coordinates: cell centers unless the node is staggered
+	// along the axis, then cell edges
+	if ((node == nodeC) || (node == nodeStagB))
+		grid.template mpi_gather_center_coord<memCPU>(cx, 0, axisA);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(cx, 0, axisA);
+	if ((node == nodeC) || (node == nodeStagA))
+		grid.template mpi_gather_center_coord<memCPU>(cy, 0, axisB);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(cy, 0, axisB);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		// staggered axes with a ghost layer include one extra output line
+		const int shx = (((node == nodeStagA) || (node == nodeEdge)) && (ngx > 0)) ? 1 : 0;
+		const int shy = (((node == nodeStagB) || (node == nodeEdge)) && (ngy > 0)) ? 1 : 0;
+
+		status = write_tecplot(filename, out, cx, cy, nvar,
+			ndimx, ndimy,
+			ngx, ndimx - ngx + shx - 1,
+			ngy, ndimy - ngy + shy - 1,
+
+			"nse-3D", name, axis_name(axisA), axis_name(axisB), time);
+
+		for (int k = 0; k < nvar; k++)
+			deallocate(out[k]); 
+		deallocate(cx, cy);
+	}
+	delete[] out;
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_2d(const std::string& filename,
+	const T* uin, const T* vin, const char* uname, const char* vname,
+	const nse_const3d::axisType axis,	// axis=[XY,XZ,YZ], node=[C]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Gather a cell-centered 2D vector pair on rank 0 and write it as
+	// tecplot output. Collective call: all ranks join the gathers and the
+	// status broadcast; only rank 0 writes. Returns false for a non-plane
+	// axis.
+	nse_const3d::axisType axisA, axisB;
+	if (axis == axisXY) {
+		axisA = axisX; axisB = axisY;
+	}
+	else if (axis == axisXZ) {
+		axisA = axisX; axisB = axisZ;
+	}
+	else if (axis == axisYZ) {
+		axisA = axisY; axisB = axisZ;
+	}
+	else
+		return false;	// fix: a non-plane axis previously left axisA/axisB uninitialized (UB)
+
+	int ndimx = grid.mpi_dim_size(axisA),
+		ndimy = grid.mpi_dim_size(axisB);
+	int ngx = grid.ghost_region_size(axisA),
+		ngy = grid.ghost_region_size(axisB);
+
+	T *uout = NULL, *vout = NULL, *cx = NULL, *cy = NULL;
+	if (grid.mpi_com.rank == 0) {
+		allocate_vnull(&uout, &vout, ndimx * ndimy);
+		allocate_vnull(&cx, &cy, ndimx, ndimy);
+	}
+
+	grid.template mpi_gather<memCPU, mem>(uout, uin, 0, axis);
+	grid.template mpi_gather<memCPU, mem>(vout, vin, 0, axis);
+	grid.template mpi_gather_center_coord<memCPU>(cx, 0, axisA);
+	grid.template mpi_gather_center_coord<memCPU>(cy, 0, axisB);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		status = write_tecplot(filename, uout, vout, cx, cy,
+			ndimx, ndimy,
+			ngx, ndimx - ngx - 1,
+			ngy, ndimy - ngy - 1,
+
+			"nse-3D", uname, vname, axis_name(axisA), axis_name(axisB), time);
+
+		deallocate(uout, vout); deallocate(cx, cy);
+	}
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	const T* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[XY,XZ,YZ], node=[C,(U|V,U|W,V|W)]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Indexed overload: stamp idx into the file name, then forward.
+	const std::string fname = append_index(filename, idx);
+	return write_tecplot_2d(fname, xin, name, axis, node, grid, time);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	T** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[XY,XZ,YZ], node=[C,(U|V,U|W,V|W)]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Indexed overload: stamp idx into the file name, then forward.
+	const std::string fname = append_index(filename, idx);
+	return write_tecplot_2d(fname, xin, name, nvar, axis, node, grid, time);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	const T* uin, const T* vin, const char* uname, const char* vname,
+	const nse_const3d::axisType axis,	// axis=[XY,XZ,YZ], node=[C]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Indexed overload: stamp idx into the file name, then forward.
+	const std::string fname = append_index(filename, idx);
+	return write_tecplot_2d(fname, uin, vin, uname, vname, axis, grid, time);
+}
+// ------------------------------------------------------------------------ //
+
+// * write tecplot-1d output, F(x) * //
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_1d(const std::string& filename,
+	const T* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z] node=[C,(U,V,W)]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Gather a 1D profile on rank 0 and write it as tecplot output.
+	// Collective call: all ranks join the gathers and the status broadcast;
+	// only rank 0 writes. Returns false for an invalid axis/node pair.
+	nse_const3d::nodeType nodeStag;
+	if (axis == axisX) nodeStag = nodeU;
+	else if (axis == axisY) nodeStag = nodeV;
+	else if (axis == axisZ) nodeStag = nodeW;
+	else return false;	// fix: a non-1D axis previously left nodeStag uninitialized (UB)
+
+	if ((node != nodeC) && (node != nodeStag)) return false;
+
+	int ndim = grid.mpi_dim_size(axis);
+	int ng = grid.ghost_region_size(axis);
+
+	T *out = NULL, *coord = NULL;
+	if (grid.mpi_com.rank == 0) {
+		allocate_vnull(&out, ndim);
+		allocate_vnull(&coord, ndim);
+	}
+	
+	grid.template mpi_gather<memCPU, mem>(out, xin, 0, axis);
+	// coordinates: cell centers for nodeC, cell edges for the staggered node
+	if (node == nodeC) grid.template mpi_gather_center_coord<memCPU>(coord, 0, axis);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(coord, 0, axis);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+		// staggered node with a ghost layer includes one extra output point
+		const int sh = ((node == nodeStag) && (ng > 0)) ? 1 : 0;
+
+		status = write_tecplot(filename, out, coord,
+			ndim, ng, ndim - ng + sh - 1,
+
+			"nse-3D", name, axis_name(axis), time);
+
+		deallocate(out); deallocate(coord);
+	}
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	const T* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z] node=[C,(U,V,W)]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Indexed overload: stamp idx into the file name, then forward.
+	const std::string fname = append_index(filename, idx);
+	return write_tecplot_1d(fname, xin, name, axis, node, grid, time);
+}
+// -------------------------------------------------------------------- //
+
+// * write tecplot-1d scaled output, F(x) * //
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_1d(const std::string& filename,
+	const T* xin, const char* name,
+	const T vscale, const T cscale,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Gather a 1D profile on rank 0 and write it as tecplot output with
+	// value scale (vscale) and coordinate scale (cscale) applied by the
+	// writer. Collective call; only rank 0 writes. Returns false for an
+	// invalid axis/node pair.
+	nse_const3d::nodeType nodeStag;
+	if (axis == axisX) nodeStag = nodeU;
+	else if (axis == axisY) nodeStag = nodeV;
+	else if (axis == axisZ) nodeStag = nodeW;
+	else return false;	// fix: a non-1D axis previously left nodeStag uninitialized (UB)
+
+	if ((node != nodeC) && (node != nodeStag)) return false;
+
+	int ndim = grid.mpi_dim_size(axis);
+	int ng = grid.ghost_region_size(axis);
+
+	T *out = NULL, *coord = NULL;
+	if (grid.mpi_com.rank == 0) {
+		allocate_vnull(&out, ndim);
+		allocate_vnull(&coord, ndim);
+	}
+
+	grid.template mpi_gather<memCPU, mem>(out, xin, 0, axis);
+	// coordinates: cell centers for nodeC, cell edges for the staggered node
+	if (node == nodeC) grid.template mpi_gather_center_coord<memCPU>(coord, 0, axis);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(coord, 0, axis);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+		// staggered node with a ghost layer includes one extra output point
+		const int sh = ((node == nodeStag) && (ng > 0)) ? 1 : 0;
+
+		status = write_tecplot(filename, out, coord, vscale, cscale,
+			ndim, ng, ndim - ng + sh - 1,
+
+			"nse-3D", name, axis_name(axis), time);
+
+		deallocate(out); deallocate(coord);
+	}
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	const T* xin, const char* name,
+	const T vscale, const T cscale,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Indexed overload: stamp idx into the file name, then forward.
+	const std::string fname = append_index(filename, idx);
+	return write_tecplot_1d(fname, xin, name, vscale, cscale, axis, node, grid, time);
+}
+// -------------------------------------------------------------------- //
+
+// * write tecplot-1d output, F{i}(x), {i=1,nvar} * //
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_1d(const std::string& filename,
+	T** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Gather nvar 1D profiles on rank 0 and write them into one tecplot
+	// file. Collective call; only rank 0 writes. Returns false for an
+	// invalid axis/node pair or non-positive nvar.
+	nse_const3d::nodeType nodeStag;
+	if (axis == axisX) nodeStag = nodeU;
+	else if (axis == axisY) nodeStag = nodeV;
+	else if (axis == axisZ) nodeStag = nodeW;
+	else return false;	// fix: a non-1D axis previously left nodeStag uninitialized (UB)
+
+	if ((nvar <= 0) ||
+		((node != nodeC) && (node != nodeStag))) return false;
+
+	int ndim = grid.mpi_dim_size(axis);
+	int ng = grid.ghost_region_size(axis);
+
+	T **out, *coord = NULL;
+	out = new T*[nvar]();	// fix: value-init so non-root ranks pass null, not indeterminate pointers
+	if (grid.mpi_com.rank == 0) {
+		for (int k = 0; k < nvar; k++)
+			allocate_vnull(&out[k], ndim);
+		allocate_vnull(&coord, ndim);
+	}
+
+	for (int k = 0; k < nvar; k++)
+		grid.template mpi_gather<memCPU, mem>(out[k], xin[k], 0, axis);
+	// coordinates: cell centers for nodeC, cell edges for the staggered node
+	if (node == nodeC) grid.template mpi_gather_center_coord<memCPU>(coord, 0, axis);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(coord, 0, axis);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+		// staggered node with a ghost layer includes one extra output point
+		const int sh = ((node == nodeStag) && (ng > 0)) ? 1 : 0;
+
+		status = write_tecplot(filename, out, coord, nvar,
+			ndim, ng, ndim - ng + sh - 1,
+
+			"nse-3D", name, axis_name(axis), time);
+
+		for (int k = 0; k < nvar; k++)
+			deallocate(out[k]);
+		deallocate(coord);
+	}
+	delete[] out;
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	T** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Indexed overload: stamp idx into the file name, then forward.
+	const std::string fname = append_index(filename, idx);
+	return write_tecplot_1d(fname, xin, name, nvar, axis, node, grid, time);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_1d(const std::string& filename,
+	T** xin, const std::string* name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Gather nvar 1D profiles on rank 0 and write them into one tecplot
+	// file (std::string variable names). Collective call; only rank 0
+	// writes. Returns false for an invalid axis/node pair or non-positive
+	// nvar.
+	nse_const3d::nodeType nodeStag;
+	if (axis == axisX) nodeStag = nodeU;
+	else if (axis == axisY) nodeStag = nodeV;
+	else if (axis == axisZ) nodeStag = nodeW;
+	else return false;	// fix: a non-1D axis previously left nodeStag uninitialized (UB)
+
+	if ((nvar <= 0) ||
+		((node != nodeC) && (node != nodeStag))) return false;
+
+	int ndim = grid.mpi_dim_size(axis);
+	int ng = grid.ghost_region_size(axis);
+
+	T **out, *coord = NULL;
+	out = new T*[nvar]();	// fix: value-init so non-root ranks pass null, not indeterminate pointers
+	if (grid.mpi_com.rank == 0) {
+		for (int k = 0; k < nvar; k++)
+			allocate_vnull(&out[k], ndim);
+		allocate_vnull(&coord, ndim);
+	}
+
+	for (int k = 0; k < nvar; k++)
+		grid.template mpi_gather<memCPU, mem>(out[k], xin[k], 0, axis);
+	// coordinates: cell centers for nodeC, cell edges for the staggered node
+	if (node == nodeC) grid.template mpi_gather_center_coord<memCPU>(coord, 0, axis);
+	else
+		grid.template mpi_gather_edge_coord<memCPU>(coord, 0, axis);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+		// staggered node with a ghost layer includes one extra output point
+		const int sh = ((node == nodeStag) && (ng > 0)) ? 1 : 0;
+
+		status = write_tecplot(filename, out, coord, nvar,
+			ndim, ng, ndim - ng + sh - 1,
+
+			"nse-3D", name, axis_name(axis), time);
+
+		for (int k = 0; k < nvar; k++)
+			deallocate(out[k]);
+		deallocate(coord);
+	}
+	delete[] out;
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	T** xin, const std::string* name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Indexed overload: stamp idx into the file name, then forward.
+	const std::string fname = append_index(filename, idx);
+	return write_tecplot_1d(fname, xin, name, nvar, axis, node, grid, time);
+}
+// -------------------------------------------------------------------- //
+
+// * write binary-1d|2d|3d output, axis = n-dims, 1D - NOT supported, returns false * //
+template< nse::memType mem, typename T >
+bool nse::write_binary(const std::string& filename,
+	const T* xin, const char* name,
+	const nse_const3d::axisType axis,
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Dispatch on the dimensionality of the requested axis: 1D binary
+	// output is not supported (returns false), plane axes go to the 2d
+	// writer, everything else to the 3d writer.
+	switch (axis) {
+	case nse_const3d::axisX:
+	case nse_const3d::axisY:
+	case nse_const3d::axisZ:
+		//return write_binary_1d(filename, xin, name, axis, grid, time);
+		return false;
+	case nse_const3d::axisXY:
+	case nse_const3d::axisXZ:
+	case nse_const3d::axisYZ:
+		return write_binary_2d(filename, xin, name, axis, grid, time);
+	default:
+		return write_binary_3d(filename, xin, name, grid, time);
+	}
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_binary(const std::string& filename, const int idx,
+	const T* xin, const char* name,
+	const nse_const3d::axisType axis,
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Indexed overload: stamp idx into the file name, then forward.
+	const std::string fname = append_index(filename, idx);
+	return write_binary(fname, xin, name, axis, grid, time);
+}
+// -------------------------------------------------------------------- //
+
+// * write binary-3D output * //
+template< nse::memType mem, typename T >
+bool nse::write_binary_stamp(const std::string& filename,
+	const binStamp< int >& index_stamp,
+	const binStamp< double >& cpu_stamp,
+
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Write a binary stamp file: index and cpu-time stamps together with
+	// the gathered grid coordinates and the grid id. Collective call: all
+	// ranks join the coordinate gathers and the status broadcast; only
+	// rank 0 writes. Returns false for an empty cpu stamp.
+	if (cpu_stamp.size <= 0) return false;
+
+	T *cx = NULL, *cy = NULL, *cz = NULL,
+		*ex = NULL, *ey = NULL, *ez = NULL;
+	if (grid.mpi_com.rank == 0) {
+		allocate_vnull(&cx, &cy, &cz, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
+		allocate_vnull(&ex, &ey, &ez, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
+	}
+
+	// gather cell-center and cell-edge coordinates on rank 0
+	grid.template mpi_gather_center_coord<memCPU>(cx, cy, cz, 0);
+	grid.template mpi_gather_edge_coord<memCPU>(ex, ey, ez, 0);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		GridId< T > id;
+		grid.set_id(id);
+
+		status = write_binary_stamp(filename,
+			index_stamp, cpu_stamp,
+
+			cx, cy, cz, ex, ey, ez,
+			id, time);
+
+		deallocate(cx, cy, cz); deallocate(ex, ey, ez);
+	}
+
+	// broadcast write status so every rank returns the same result
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_binary_stamp(const std::string& filename,
+	const binNamedStamp< int >& index_stamp,
+	const binNamedStamp< double >& cpu_stamp,
+
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Named-stamp variant of write_binary_stamp: same gather-and-write
+	// flow, but stamps carry per-entry names (binNamedStamp). Collective
+	// call; only rank 0 writes. Returns false for an empty cpu stamp.
+	if (cpu_stamp.get_size() <= 0) return false;
+
+	T *cx = NULL, *cy = NULL, *cz = NULL,
+		*ex = NULL, *ey = NULL, *ez = NULL;
+	if (grid.mpi_com.rank == 0) {
+		allocate_vnull(&cx, &cy, &cz, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
+		allocate_vnull(&ex, &ey, &ez, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
+	}
+
+	// gather cell-center and cell-edge coordinates on rank 0
+	grid.template mpi_gather_center_coord<memCPU>(cx, cy, cz, 0);
+	grid.template mpi_gather_edge_coord<memCPU>(ex, ey, ez, 0);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		GridId< T > id;
+		grid.set_id(id);
+
+		status = write_binary_stamp(filename,
+			index_stamp, cpu_stamp,
+
+			cx, cy, cz, ex, ey, ez,
+			id, time);
+
+		deallocate(cx, cy, cz); deallocate(ex, ey, ez);
+	}
+
+	// broadcast write status so every rank returns the same result
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_binary_3d(const std::string& filename,
+	const T* xin, const char* name,
+
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Write a 3D scalar field in binary form. When _NSE_MPI_IO is defined,
+	// MPI-IO is tried first; with _NSE_MPI_IO_RETRY_SEQ also defined, a
+	// failed MPI-IO write falls back to the sequential gather-on-root path
+	// below, otherwise the MPI-IO status is returned directly.
+#ifdef _NSE_MPI_IO	// forcing MPI-IO...
+	bool mpi_io_status = mpi_write_binary_3d(filename, _NSE_MPI_IO_DATAREP_DEFAULT,
+		xin, name, grid, time);
+#ifndef _NSE_MPI_IO_RETRY_SEQ
+	return mpi_io_status;
+#else
+	if (mpi_io_status) return true;
+#endif
+#endif
+
+#if (!defined(_NSE_MPI_IO) || \
+	(defined(_NSE_MPI_IO) && defined(_NSE_MPI_IO_RETRY_SEQ)))
+	// sequential path: gather field and coordinates on rank 0, write there
+	T *out = NULL,
+		*cx = NULL, *cy = NULL, *cz = NULL,
+		*ex = NULL, *ey = NULL, *ez = NULL;
+	if (grid.mpi_com.rank == 0) {
+		allocate_vnull(&out, grid.mpi_size);
+		allocate_vnull(&cx, &cy, &cz, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
+		allocate_vnull(&ex, &ey, &ez, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
+	}
+
+	grid.template mpi_gather<memCPU, mem>(out, xin, 0, axisXYZ);
+	grid.template mpi_gather_center_coord<memCPU>(cx, cy, cz, 0);
+	grid.template mpi_gather_edge_coord<memCPU>(ex, ey, ez, 0);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		GridId< T > id;
+		grid.set_id(id);
+
+		status = write_binary(filename,
+			out, name,
+
+			cx, cy, cz, ex, ey, ez,
+			id, time);
+
+		deallocate(out);
+		deallocate(cx, cy, cz); deallocate(ex, ey, ez);
+	}
+
+	// broadcast write status so every rank returns the same result
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+#endif
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_binary_3d(const std::string& filename,
+	const T* uin, const T* vin, const T* win,
+	const char* uname, const char* vname, const char* wname,
+
+	const Grid3d< T, mem >& grid, const T time)
+{
+	// Write a 3D vector field (U,V,W) in binary form. Same MPI-IO /
+	// sequential-fallback structure as the scalar overload: MPI-IO first
+	// when _NSE_MPI_IO is defined, gather-on-root fallback when
+	// _NSE_MPI_IO_RETRY_SEQ is also defined.
+#ifdef _NSE_MPI_IO	// forcing MPI-IO...
+	bool mpi_io_status = mpi_write_binary_3d(filename, _NSE_MPI_IO_DATAREP_DEFAULT,
+		uin, vin, win, uname, vname, wname, grid, time);
+#ifndef _NSE_MPI_IO_RETRY_SEQ
+	return mpi_io_status;
+#else
+	if (mpi_io_status) return true;
+#endif
+#endif
+
+#if (!defined(_NSE_MPI_IO) || \
+	(defined(_NSE_MPI_IO) && defined(_NSE_MPI_IO_RETRY_SEQ)))
+	// sequential path: gather components and coordinates on rank 0
+	T *uout = NULL, *vout = NULL, *wout = NULL,
+		*cx = NULL, *cy = NULL, *cz = NULL,
+		*ex = NULL, *ey = NULL, *ez = NULL;
+	if (grid.mpi_com.rank == 0) {
+		allocate_vnull(&uout, &vout, &wout, grid.mpi_size);
+		allocate_vnull(&cx, &cy, &cz, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
+		allocate_vnull(&ex, &ey, &ez, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
+	}
+
+	grid.template mpi_gather<memCPU, mem>(uout, uin, 0, axisXYZ);
+	grid.template mpi_gather<memCPU, mem>(vout, vin, 0, axisXYZ);
+	grid.template mpi_gather<memCPU, mem>(wout, win, 0, axisXYZ);
+	grid.template mpi_gather_center_coord<memCPU>(cx, cy, cz, 0);
+	grid.template mpi_gather_edge_coord<memCPU>(ex, ey, ez, 0);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		GridId< T > id;
+		grid.set_id(id);
+
+		status = write_binary(filename,
+			uout, vout, wout, uname, vname, wname,
+
+			cx, cy, cz, ex, ey, ez,
+			id, time);
+
+		deallocate(uout, vout, wout);
+		deallocate(cx, cy, cz); deallocate(ex, ey, ez);
+	}
+
+	// broadcast write status so every rank returns the same result
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+#endif
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_binary_stamp(const std::string& filename, const int idx,
+	const binStamp< int >& index_stamp,
+	const binStamp< double >& cpu_stamp,
+
+	const Grid3d< T, mem >& grid, const T time)
+{
+	return write_binary_stamp(append_index(filename, idx),
+		index_stamp, cpu_stamp, grid, time);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_binary_stamp(const std::string& filename, const int idx,
+	const binNamedStamp< int >& index_stamp,
+	const binNamedStamp< double >& cpu_stamp,
+
+	const Grid3d< T, mem >& grid, const T time)
+{
+	return write_binary_stamp(append_index(filename, idx),
+		index_stamp, cpu_stamp, grid, time);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_binary_3d(const std::string& filename, const int idx,
+	const T* xin, const char* name,
+
+	const Grid3d< T, mem >& grid, const T time)
+{
+	return write_binary_3d(append_index(filename, idx),
+		xin, name, grid, time);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_binary_3d(const std::string& filename, const int idx,
+	const T* uin, const T* vin, const T* win,
+	const char* uname, const char* vname, const char* wname,
+
+	const Grid3d< T, mem >& grid, const T time)
+{
+	return write_binary_3d(append_index(filename, idx),
+		uin, vin, win, uname, vname, wname, grid, time);
+}
+// ------------------------------------------------------------------------ //
+
+
+// * write binary-2D output * //
// Writes a single scalar 2D field (a plane selected by 'axis') to a binary file.
// With _NSE_MPI_IO defined the MPI-IO path is tried first; when
// _NSE_MPI_IO_RETRY_SEQ is also defined, a failed MPI-IO write falls
// through to the sequential gather-to-root path below.
// Collective: must be called on all ranks of grid.mpi_com.comm.
template< nse::memType mem, typename T >
bool nse::write_binary_2d(const std::string& filename,
	const T* xin, const char* name,

	const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time)
{
#ifdef _NSE_MPI_IO	// forcing MPI-IO...
	bool mpi_io_status = mpi_write_binary_2d(filename, _NSE_MPI_IO_DATAREP_DEFAULT,
		xin, name, axis, grid, time);
#ifndef _NSE_MPI_IO_RETRY_SEQ
	return mpi_io_status;
#else
	if (mpi_io_status) return true;	// MPI-IO succeeded, no sequential retry
#endif
#endif

#if (!defined(_NSE_MPI_IO) || \
	(defined(_NSE_MPI_IO) && defined(_NSE_MPI_IO_RETRY_SEQ)))

	// map the output plane to its two in-plane axes
	// NOTE(review): axisA/axisB remain uninitialized for any axis value other
	// than axisXY/axisXZ/axisYZ -- callers appear expected to pass a plane
	// axis only; confirm at call sites
	nse_const3d::axisType axisA, axisB;
	if (axis == axisXY) {
		axisA = axisX; axisB = axisY;
	}
	if (axis == axisXZ) {
		axisA = axisX; axisB = axisZ;
	}
	if (axis == axisYZ) {
		axisA = axisY; axisB = axisZ;
	}

	// global (MPI-wide) plane dimensions
	int ndimx = grid.mpi_dim_size(axisA),
		ndimy = grid.mpi_dim_size(axisB);

	// gather buffers live on the root rank only; others keep NULL
	T *out = NULL,
		*cx = NULL, *cy = NULL,
		*ex = NULL, *ey = NULL;
	if (grid.mpi_com.rank == 0) {
		allocate_vnull(&out, ndimx * ndimy);
		allocate_vnull(&cx, &cy, ndimx, ndimy);
		allocate_vnull(&ex, &ey, ndimx, ndimy);
	}

	// collectives: gather the plane and per-axis coordinates to rank 0
	grid.template mpi_gather<memCPU, mem>(out, xin, 0, axis);
	grid.template mpi_gather_center_coord<memCPU>(cx, 0, axisA);
	grid.template mpi_gather_center_coord<memCPU>(cy, 0, axisB);
	grid.template mpi_gather_edge_coord<memCPU>(ex, 0, axisA);
	grid.template mpi_gather_edge_coord<memCPU>(ey, 0, axisB);

	int status = 0;
	if (grid.mpi_com.rank == 0) {

		GridId< T > id;
		grid.set_id(id, axis);

		// root performs the actual file write
		status = write_binary(filename,
			out, name,

			cx, cy, ex, ey,
			id, time);

		deallocate(out);
		deallocate(cx, cy); deallocate(ex, ey);
	}

	// broadcast the write status so every rank returns the same result
	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
	return (status == 1);
#endif
}
+
+template< nse::memType mem, typename T >
+bool nse::write_binary_2d(const std::string& filename,
+	const T* uin, const T* vin,
+	const char* uname, const char* vname,
+
+	const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time)
+{
+#ifdef _NSE_MPI_IO	// forcing MPI-IO...
+	bool mpi_io_status = mpi_write_binary_2d(filename, _NSE_MPI_IO_DATAREP_DEFAULT,
+		uin, vin, uname, vname, axis, grid, time);
+#ifndef _NSE_MPI_IO_RETRY_SEQ
+	return mpi_io_status;
+#else
+	if (mpi_io_status) return true;
+#endif
+#endif
+
+#if (!defined(_NSE_MPI_IO) || \
+	(defined(_NSE_MPI_IO) && defined(_NSE_MPI_IO_RETRY_SEQ)))
+
+	nse_const3d::axisType axisA, axisB;
+	if (axis == axisXY) {
+		axisA = axisX; axisB = axisY;
+	}
+	if (axis == axisXZ) {
+		axisA = axisX; axisB = axisZ;
+	}
+	if (axis == axisYZ) {
+		axisA = axisY; axisB = axisZ;
+	}
+
+	int ndimx = grid.mpi_dim_size(axisA),
+		ndimy = grid.mpi_dim_size(axisB);
+	int ngx = grid.ghost_region_size(axisA),
+		ngy = grid.ghost_region_size(axisB);
+
+
+	T *uout = NULL, *vout = NULL,
+		*cx = NULL, *cy = NULL,
+		*ex = NULL, *ey = NULL;
+	if (grid.mpi_com.rank == 0) {
+		allocate_vnull(&uout, &vout, ndimx * ndimy);
+		allocate_vnull(&cx, &cy, ndimx, ndimy);
+		allocate_vnull(&ex, &ey, ndimx, ndimy);
+	}
+
+	grid.template mpi_gather<memCPU, mem>(uout, uin, 0, axis);
+	grid.template mpi_gather<memCPU, mem>(vout, vin, 0, axis);
+	grid.template mpi_gather_center_coord<memCPU>(cx, 0, axisA);
+	grid.template mpi_gather_center_coord<memCPU>(cy, 0, axisB);
+	grid.template mpi_gather_edge_coord<memCPU>(ex, 0, axisA);
+	grid.template mpi_gather_edge_coord<memCPU>(ey, 0, axisB);
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		GridId< T > id;
+		grid.set_id(id, axis);
+
+		status = write_binary(filename,
+			uout, vout, uname, vname,
+
+			cx, cy, ex, ey,
+			id, time);
+
+		deallocate(uout, vout);
+		deallocate(cx, cy); deallocate(ex, ey);
+	}
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+#endif
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_binary_2d(const std::string& filename, const int idx,
+	const T* xin, const char* name,
+
+	const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time)
+{
+	return write_binary_2d(append_index(filename, idx),
+		xin, name, axis, grid, time);
+}
+
+template< nse::memType mem, typename T >
+bool nse::write_binary_2d(const std::string& filename, const int idx,
+	const T* uin, const T* vin,
+	const char* uname, const char* vname,
+
+	const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time)
+{
+	return write_binary_2d(append_index(filename, idx),
+		uin, vin, uname, vname, axis, grid, time);
+}
+// ------------------------------------------------------------------------ //
+
+// * read binary-3D input * //
// Reads index/CPU-time stamps (binStamp) from a binary file.
// Rank 0 performs the file read; the status, the time value and the stamp
// contents are then broadcast to all ranks.
// Collective: must be called on all ranks of grid.mpi_com.comm.
template< nse::memType mem, typename T >
bool nse::read_binary_stamp(const std::string& filename,
	binStamp< int >& index_stamp,
	binStamp< double >& cpu_stamp,

	const Grid3d< T, mem >& grid, T* time)
{
	GridId< T > id;

	int status = 0;
	if (grid.mpi_com.rank == 0) {

		// coordinate arrays are read by the file routine but not needed here
		T *cx, *cy, *cz, *ex, *ey, *ez;

		status = read_binary_stamp(filename,
			index_stamp, cpu_stamp,

			&cx, &cy, &cz, &ex, &ey, &ez,
			id, time);

		if (status) {
			// discard coordinate arrays allocated by the reader
			deallocate(cx, cy, cz); deallocate(ex, ey, ez);
		}
	}

	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
	if (status == 1) {
		// propagate the time stamp and the stamp data to all ranks
		MPI_Bcast(time, 1, mpi_type< T >(), 0, grid.mpi_com.comm);

		index_stamp.mpi_broadcast(0, grid.mpi_com.comm);
		cpu_stamp.mpi_broadcast(0, grid.mpi_com.comm);
	}

	return (status == 1);
}
+
// Reads index/CPU-time stamps (binNamedStamp) from a binary file.
// Rank 0 performs the file read; the status, the time value and the stamp
// contents are then broadcast to all ranks.
// Collective: must be called on all ranks of grid.mpi_com.comm.
template< nse::memType mem, typename T >
bool nse::read_binary_stamp(const std::string& filename,
	binNamedStamp< int >& index_stamp,
	binNamedStamp< double >& cpu_stamp,

	const Grid3d< T, mem >& grid, T* time)
{
	GridId< T > id;

	int status = 0;
	if (grid.mpi_com.rank == 0) {

		// coordinate arrays are read by the file routine but not needed here
		T *cx, *cy, *cz, *ex, *ey, *ez;

		status = read_binary_stamp(filename,
			index_stamp, cpu_stamp,

			&cx, &cy, &cz, &ex, &ey, &ez,
			id, time);

		if (status) {
			// discard coordinate arrays allocated by the reader
			deallocate(cx, cy, cz); deallocate(ex, ey, ez);
		}
	}

	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
	if (status == 1) {
		// propagate the time stamp and the stamp data to all ranks
		MPI_Bcast(time, 1, mpi_type< T >(), 0, grid.mpi_com.comm);

		index_stamp.mpi_broadcast(0, grid.mpi_com.comm);
		cpu_stamp.mpi_broadcast(0, grid.mpi_com.comm);
	}

	return (status == 1);
}
+
+template< nse::memType mem, typename T >
+bool nse::read_binary_3d(
+	const std::string& filename,
+	T* xout,
+	const Grid3d< T, mem >& grid)
+{
+	return read_binary_3d(filename, xout, nodeC, grid);
+}
+
+
+template< nse::memType mem, typename T >
+bool nse::read_binary_3d(
+	const std::string& filename,
+	T* xout,
+	const nse_const3d::nodeType node, const Grid3d< T, mem >& grid)
+{
+#ifdef _NSE_MPI_IO	// forcing MPI-IO...
+	bool mpi_io_status = mpi_read_binary_3d(filename, _NSE_MPI_IO_DATAREP_DEFAULT,
+		xout, nodeC, grid);
+#ifndef _NSE_MPI_IO_RETRY_SEQ
+	return mpi_io_status;
+#else
+	if (mpi_io_status) return true;
+#endif
+#endif
+
+#if (!defined(_NSE_MPI_IO) || \
+	(defined(_NSE_MPI_IO) && defined(_NSE_MPI_IO_RETRY_SEQ)))
+	T *xin;
+	GridId< T > id;
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		char* name;
+		T time_stamp;
+		T *cx, *cy, *cz, *ex, *ey, *ez;
+
+		status = read_binary(filename,
+			&xin, &name,
+
+			&cx, &cy, &cz, &ex, &ey, &ez,
+			id, &time_stamp);
+
+		if (status) {
+			deallocate(cx, cy, cz); deallocate(ex, ey, ez);
+			delete[] name;
+		}
+	}
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	if (status == 1) {	// read status - OK -
+
+		id.mpi_broadcast(0, grid.mpi_com.comm);
+
+		if (grid.check_id_dims(id))
+			grid.template mpi_scatter<mem, memCPU>(xout, xin, 0, axisXYZ);
+		else
+		{
+			int nxin, nyin, nzin, gcxin, gcyin, gczin;
+			id.grid_dim(1, &nxin, &gcxin);
+			id.grid_dim(2, &nyin, &gcyin);
+			id.grid_dim(3, &nzin, &gczin);
+
+			int pnx = par_local_size(nxin, gcxin,
+				grid.mpi_com.rank_x, grid.mpi_com.size_x);
+			int pny = par_local_size(nyin, gcyin,
+				grid.mpi_com.rank_y, grid.mpi_com.size_y);
+			int pnz = par_local_size(nzin, gczin,
+				grid.mpi_com.rank_z, grid.mpi_com.size_z);
+
+			T *xpin;
+			allocate_vnull(&xpin, pnx * pny * pnz);
+
+			grid.mpi_com.scatter(xpin, xin, 0, pnx, pny, pnz, gcxin, gcyin, gczin);
+			grid.grid_reinterp(xout, xpin, node, id);	// - grid-grid(input) interpolation
+
+			deallocate(xpin);
+		}
+
+		if (grid.mpi_com.rank == 0) deallocate(xin);
+	}
+
+	return (status == 1);
+#endif
+}
+
// Reads a 3-component vector 3D field (u,v,w) from a binary file;
// reinterpolates onto staggered U/V/W nodes if the file grid dimensions
// differ from the current grid.
// With _NSE_MPI_IO defined the MPI-IO path is tried first; when
// _NSE_MPI_IO_RETRY_SEQ is also defined, a failed MPI-IO read falls
// through to the sequential read-on-root + scatter path below.
// Collective: must be called on all ranks of grid.mpi_com.comm.
template< nse::memType mem, typename T >
bool nse::read_binary_3d(const std::string& filename,
	T* uout, T* vout, T* wout,
	const Grid3d< T, mem >& grid)
{
#ifdef _NSE_MPI_IO	// forcing MPI-IO...
	bool mpi_io_status = mpi_read_binary_3d(filename, _NSE_MPI_IO_DATAREP_DEFAULT,
		uout, vout, wout, grid);
#ifndef _NSE_MPI_IO_RETRY_SEQ
	return mpi_io_status;
#else
	if (mpi_io_status) return true;	// MPI-IO succeeded, no sequential retry
#endif
#endif

#if (!defined(_NSE_MPI_IO) || \
	(defined(_NSE_MPI_IO) && defined(_NSE_MPI_IO_RETRY_SEQ)))
	T *uin, *vin, *win;
	GridId< T > id;

	int status = 0;
	if (grid.mpi_com.rank == 0) {

		// file metadata (names/time/coordinates) read on root and discarded
		char *uname, *vname, *wname;
		T time_stamp;
		T *cx, *cy, *cz, *ex, *ey, *ez;

		status = read_binary(filename,
			&uin, &vin, &win, &uname, &vname, &wname,

			&cx, &cy, &cz, &ex, &ey, &ez,
			id, &time_stamp);

		if (status) {
			deallocate(cx, cy, cz); deallocate(ex, ey, ez);
			delete[] uname; delete[] vname; delete[] wname;
		}
	}

	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
	if (status == 1) {	// read status - OK -

		id.mpi_broadcast(0, grid.mpi_com.comm);

		if (grid.check_id_dims(id)) {
			// dimensions match: direct scatter to the local arrays
			grid.template mpi_scatter<mem, memCPU>(uout, uin, 0, axisXYZ);
			grid.template mpi_scatter<mem, memCPU>(vout, vin, 0, axisXYZ);
			grid.template mpi_scatter<mem, memCPU>(wout, win, 0, axisXYZ);
		}
		else
		{
			// dimensions differ: scatter on the file grid decomposition,
			// then reinterpolate each component onto its staggered node
			int nxin, nyin, nzin, gcxin, gcyin, gczin;
			id.grid_dim(1, &nxin, &gcxin);
			id.grid_dim(2, &nyin, &gcyin);
			id.grid_dim(3, &nzin, &gczin);

			int pnx = par_local_size(nxin, gcxin,
				grid.mpi_com.rank_x, grid.mpi_com.size_x);
			int pny = par_local_size(nyin, gcyin,
				grid.mpi_com.rank_y, grid.mpi_com.size_y);
			int pnz = par_local_size(nzin, gczin,
				grid.mpi_com.rank_z, grid.mpi_com.size_z);

			T *upin, *vpin, *wpin;
			allocate_vnull(&upin, &vpin, &wpin, pnx * pny * pnz);

			grid.mpi_com.scatter(upin, uin, 0, pnx, pny, pnz, gcxin, gcyin, gczin);
			grid.mpi_com.scatter(vpin, vin, 0, pnx, pny, pnz, gcxin, gcyin, gczin);
			grid.mpi_com.scatter(wpin, win, 0, pnx, pny, pnz, gcxin, gcyin, gczin);
			grid.grid_reinterp(uout, upin, nodeU, id);	// -U grid-grid(input) interpolation
			grid.grid_reinterp(vout, vpin, nodeV, id);	// -V grid-grid(input) interpolation
			grid.grid_reinterp(wout, wpin, nodeW, id);	// -W grid-grid(input) interpolation

			deallocate(upin, vpin, wpin);
		}

		if (grid.mpi_com.rank == 0) deallocate(uin, vin, win);
	}

	return (status == 1);
#endif
}
+
+template< nse::memType mem, typename T >
+bool nse::read_binary_stamp(const std::string& filename, const int idx,
+	binStamp< int >& index_stamp,
+	binStamp< double >& cpu_stamp,
+
+	const Grid3d< T, mem >& grid, T* time)
+{
+	return read_binary_stamp(append_index(filename, idx),
+		index_stamp, cpu_stamp, grid, time);
+}
+
+template< nse::memType mem, typename T >
+bool nse::read_binary_stamp(const std::string& filename, const int idx,
+	binNamedStamp< int >& index_stamp,
+	binNamedStamp< double >& cpu_stamp,
+
+	const Grid3d< T, mem >& grid, T* time)
+{
+	return read_binary_stamp(append_index(filename, idx),
+		index_stamp, cpu_stamp, grid, time);
+}
+
+template< nse::memType mem, typename T >
+bool nse::read_binary_3d(const std::string& filename, const int idx,
+	T* xout,
+	const Grid3d< T, mem >& grid)
+{
+	return read_binary_3d(append_index(filename, idx),
+		xout, grid);
+}
+
+template< nse::memType mem, typename T >
+bool nse::read_binary_3d(const std::string& filename, const int idx,
+	T* xout,
+	const nse_const3d::nodeType node, const Grid3d< T, mem >& grid)
+{
+	return read_binary_3d(append_index(filename, idx),
+		xout, node, grid);
+}
+
+template< nse::memType mem, typename T >
+bool nse::read_binary_3d(const std::string& filename, const int idx,
+	T* uout, T* vout, T* wout,
+	const Grid3d< T, mem >& grid)
+{
+	return read_binary_3d(append_index(filename, idx),
+		uout, vout, wout, grid);
+}
+// ------------------------------------------------------------------------ //
+
+
+// * read binary-2D input * //
+template< nse::memType mem, typename T >
+bool nse::read_binary_2d(
+	const std::string& filename,
+	T* xout,
+	const nse_const3d::axisType axis, const Grid3d< T, mem >& grid)
+{
+	return read_binary_2d(filename, xout, nodeC, axis, grid);
+}
+
+
+template< nse::memType mem, typename T >
+bool nse::read_binary_2d(
+	const std::string& filename,
+	T* xout,
+	const nse_const3d::nodeType node, const nse_const3d::axisType axis, const Grid3d< T, mem >& grid)
+{
+	T *xin;
+	GridId< T > id;
+
+	int status = 0;
+	if (grid.mpi_com.rank == 0) {
+
+		char* name;
+		T time_stamp;
+		T *cx, *cy, *ex, *ey;
+
+		status = read_binary(filename,
+			&xin, &name,
+
+			&cx, &cy, &ex, &ey,
+			id, &time_stamp);
+
+		if (status) {
+			deallocate(cx, cy); deallocate(ex, ey);
+			delete[] name;
+		}
+	}
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	if (status == 1) {	// read status - OK -
+
+		id.mpi_broadcast(0, grid.mpi_com.comm);
+
+		if (grid.check_id_dims(id, axis))
+			grid.template mpi_scatter<mem, memCPU>(xout, xin, 0, axis);
+		else
+		{
+			status = 1;
+		}
+
+		if (grid.mpi_com.rank == 0) deallocate(xin);
+	}
+
+	return (status == 1);
+}
+
// Reads a 2-component vector 2D field (u,v) on the plane selected by 'axis'.
// There is no 2D grid-grid reinterpolation, so a grid-dimension mismatch
// is reported as failure.
// Collective: must be called on all ranks of grid.mpi_com.comm.
template< nse::memType mem, typename T >
bool nse::read_binary_2d(const std::string& filename,
	T* uout, T* vout,
	const nse_const3d::axisType axis, const Grid3d< T, mem >& grid)
{
	T *uin, *vin;
	GridId< T > id;

	int status = 0;
	if (grid.mpi_com.rank == 0) {

		// file metadata (names/time/coordinates) read on root and discarded
		char *uname, *vname;
		T time_stamp;
		T *cx, *cy, *ex, *ey;

		status = read_binary(filename,
			&uin, &vin, &uname, &vname,

			&cx, &cy, &ex, &ey,
			id, &time_stamp);

		if (status) {
			deallocate(cx, cy); deallocate(ex, ey);
			delete[] uname; delete[] vname;
		}
	}

	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
	if (status == 1) {	// read status - OK -

		id.mpi_broadcast(0, grid.mpi_com.comm);

		if (grid.check_id_dims(id, axis)) {
			// dimensions match: scatter both components to the local arrays
			grid.template mpi_scatter<mem, memCPU>(uout, uin, 0, axis);
			grid.template mpi_scatter<mem, memCPU>(vout, vin, 0, axis);
		}
		else
		{
			// no 2D reinterpolation available: dimension mismatch is a failure
			status = 0;
		}

		if (grid.mpi_com.rank == 0) deallocate(uin, vin);
	}

	return (status == 1);
}
+
+template< nse::memType mem, typename T >
+bool nse::read_binary_2d(const std::string& filename, const int idx,
+	T* xout,
+	const nse_const3d::axisType axis, const Grid3d< T, mem >& grid)
+{
+	return read_binary_2d(append_index(filename, idx),
+		xout, axis, grid);
+}
+
+template< nse::memType mem, typename T >
+bool nse::read_binary_2d(const std::string& filename, const int idx,
+	T* xout,
+	const nse_const3d::nodeType node, const nse_const3d::axisType axis, const Grid3d< T, mem >& grid)
+{
+	return read_binary_2d(append_index(filename, idx),
+		xout, node, axis, grid);
+}
+
+template< nse::memType mem, typename T >
+bool nse::read_binary_2d(const std::string& filename, const int idx,
+	T* uout, T* vout,
+	const nse_const3d::axisType axis, const Grid3d< T, mem >& grid)
+{
+	return read_binary_2d(append_index(filename, idx),
+		uout, vout, axis, grid);
+}
+// ------------------------------------------------------------------------ //
+
+// * MPI-write binary-3D output * //
// MPI-IO writer for a single scalar 3D field: every rank writes its own
// portion collectively via mpi_write_binary.
// For GPU-resident data the field is first staged through a host buffer.
// Collective: must be called on all ranks of grid.mpi_com.comm.
template< nse::memType mem, typename T >
bool nse::mpi_write_binary_3d(const std::string& filename,
	const char* mpi_datarep,
	const T* xin, const char* name,

	const Grid3d< T, mem >& grid, const T time)
{
	GridId< T > id;
	grid.set_id(id);	// set id on all processors //

	// coordinate arrays gathered to rank 0 only; other ranks keep them unset
	T *cx, *cy, *cz, *ex, *ey, *ez;
	T *x;
	if (grid.mpi_com.rank == 0) {
		allocate_vnull(&cx, &cy, &cz, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
		allocate_vnull(&ex, &ey, &ez, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
	}

	grid.template mpi_gather_center_coord<memCPU>(cx, cy, cz, 0);
	grid.template mpi_gather_edge_coord<memCPU>(ex, ey, ez, 0);
	if (mem == memGPU) {
		// stage device data into a host buffer for the MPI-IO write
		allocate_vnull(&x, grid.size);
		mcopy<memCPU, mem>(x, xin, grid.size);
	}
	else
		x = (T*)xin;	// const cast: buffer is only read below

	int status = mpi_write_binary(filename,
		mpi_datarep,
		grid.mpi_com.comm, 0,
		grid.mpi_com.comm_x, grid.mpi_com.comm_y, grid.mpi_com.comm_z,

		x, name,

		cx, cy, cz, ex, ey, ez,
		id, time);

	if (mem == memGPU) deallocate(x);	// free staging buffer only
	if (grid.mpi_com.rank == 0) {
		deallocate(cx, cy, cz); deallocate(ex, ey, ez);
	}

	return (status == 1);
}
+
// MPI-IO writer for a 3-component vector 3D field (u,v,w): every rank
// writes its own portion collectively via mpi_write_binary.
// For GPU-resident data the fields are first staged through host buffers.
// Collective: must be called on all ranks of grid.mpi_com.comm.
template< nse::memType mem, typename T >
bool nse::mpi_write_binary_3d(const std::string& filename,
	const char* mpi_datarep,
	const T* uin, const T* vin, const T* win,
	const char* uname, const char* vname, const char* wname,

	const Grid3d< T, mem >& grid, const T time)
{
	GridId< T > id;
	grid.set_id(id);	// set id on all processors //

	// coordinate arrays gathered to rank 0 only; other ranks keep them unset
	T *cx, *cy, *cz, *ex, *ey, *ez;
	T *u, *v, *w;
	if (grid.mpi_com.rank == 0) {
		allocate_vnull(&cx, &cy, &cz, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
		allocate_vnull(&ex, &ey, &ez, grid.mpi_nx, grid.mpi_ny, grid.mpi_nz);
	}

	grid.template mpi_gather_center_coord<memCPU>(cx, cy, cz, 0);
	grid.template mpi_gather_edge_coord<memCPU>(ex, ey, ez, 0);
	if (mem == memGPU) {
		// stage device data into host buffers for the MPI-IO write
		allocate_vnull(&u, &v, &w, grid.size);
		mcopy<memCPU, mem>(u, uin, grid.size);
		mcopy<memCPU, mem>(v, vin, grid.size);
		mcopy<memCPU, mem>(w, win, grid.size);
	}
	else
	{
		// const cast: buffers are only read below
		u = (T*)uin;
		v = (T*)vin;
		w = (T*)win;
	}

	int status = mpi_write_binary(filename,
		mpi_datarep,
		grid.mpi_com.comm, 0,
		grid.mpi_com.comm_x, grid.mpi_com.comm_y, grid.mpi_com.comm_z,

		u, v, w, uname, vname, wname,

		cx, cy, cz, ex, ey, ez,
		id, time);

	if (mem == memGPU) deallocate(u, v, w);	// free staging buffers only
	if (grid.mpi_com.rank == 0) {
		deallocate(cx, cy, cz); deallocate(ex, ey, ez);
	}

	return (status == 1);
}
+
+template< nse::memType mem, typename T >
+bool nse::mpi_write_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const T* xin, const char* name,
+
+	const Grid3d< T, mem >& grid, const T time)
+{
+	return mpi_write_binary_3d(append_index(filename, idx),
+		mpi_datarep,
+		xin, name, grid, time);
+}
+
+template< nse::memType mem, typename T >
+bool nse::mpi_write_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const T* uin, const T* vin, const T* win,
+	const char* uname, const char* vname, const char* wname,
+
+	const Grid3d< T, mem >& grid, const T time)
+{
+	return mpi_write_binary_3d(append_index(filename, idx),
+		mpi_datarep,
+		uin, vin, win, uname, vname, wname, grid, time);
+}
+// ------------------------------------------------------------------------ //
+
+
+// * MPI-write binary-2D output * //
// MPI-IO writer for a single scalar 2D field on the plane selected by
// 'axis'. Only the ranks on the host plane (the first layer along the
// off-plane direction) participate in the file write; the status is then
// combined across the full communicator via max-reduction.
// Collective: must be called on all ranks of grid.mpi_com.comm.
template< nse::memType mem, typename T >
bool nse::mpi_write_binary_2d(const std::string& filename,
	const char* mpi_datarep,
	const T* xin, const char* name,

	const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time)
{
	GridId< T > id;
	grid.set_id(id, axis);	// set id on all processors //

	// select the plane communicator, its two axis communicators and whether
	// this rank lies on the host plane
	// NOTE(review): comm/axis variables remain uninitialized for any axis
	// value other than axisXY/axisXZ/axisYZ -- confirm callers pass a plane axis
	MPI_Comm comm_xy, comm_x, comm_y;
	nse_const3d::axisType axisA, axisB;
	int is_mpi_host_plane = 0;
	if (axis == axisXY) {
		axisA = axisX; axisB = axisY;
		comm_xy = grid.mpi_com.comm_xy;
		comm_x = grid.mpi_com.comm_x;
		comm_y = grid.mpi_com.comm_y;
		is_mpi_host_plane = (grid.mpi_com.rank_z == 0);
	}
	if (axis == axisXZ) {
		axisA = axisX; axisB = axisZ;
		comm_xy = grid.mpi_com.comm_xz;
		comm_x = grid.mpi_com.comm_x;
		comm_y = grid.mpi_com.comm_z;
		is_mpi_host_plane = (grid.mpi_com.rank_y == 0);
	}
	if (axis == axisYZ) {
		axisA = axisY; axisB = axisZ;
		comm_xy = grid.mpi_com.comm_yz;
		comm_x = grid.mpi_com.comm_y;
		comm_y = grid.mpi_com.comm_z;
		is_mpi_host_plane = (grid.mpi_com.rank_x == 0);
	}

	// global (MPI-wide) plane dimensions
	int ndimx = grid.mpi_dim_size(axisA),
		ndimy = grid.mpi_dim_size(axisB);

	// coordinate arrays gathered to rank 0 only; other ranks keep them unset
	T *cx, *cy, *ex, *ey;
	T *x;
	if (grid.mpi_com.rank == 0) {
		allocate_vnull(&cx, &cy, ndimx, ndimy);
		allocate_vnull(&ex, &ey, ndimx, ndimy);
	}

	grid.template mpi_gather_center_coord<memCPU>(cx, 0, axisA);
	grid.template mpi_gather_center_coord<memCPU>(cy, 0, axisB);
	grid.template mpi_gather_edge_coord<memCPU>(ex, 0, axisA);
	grid.template mpi_gather_edge_coord<memCPU>(ey, 0, axisB);
	if (mem == memGPU) {
		// stage device data into a host buffer for the MPI-IO write
		allocate_vnull(&x, 
			grid.dim_size(axisA) * grid.dim_size(axisB));
		mcopy<memCPU, mem>(x, xin, 
			grid.dim_size(axisA) * grid.dim_size(axisB));
	}
	else
		x = (T*)xin;	// const cast: buffer is only read below

	int status = 0;
	if (is_mpi_host_plane) {
		// only the host plane writes the file
		status = mpi_write_binary(filename,
			mpi_datarep,
			comm_xy, 0,
			comm_x, comm_y,

			x, name,

			cx, cy, ex, ey,
			id, time);
	}
	// propagate the write status to ranks outside the host plane
	mpi_allreduce(&status, MPI_MAX, grid.mpi_com.comm);

	if (mem == memGPU) deallocate(x);	// free staging buffer only
	if (grid.mpi_com.rank == 0) {
		deallocate(cx, cy); deallocate(ex, ey);
	}

	return (status == 1);
}
+
// MPI-IO writer for a 2-component vector 2D field (u,v) on the plane
// selected by 'axis'. Only ranks on the host plane write the file; the
// status is then combined across the full communicator via max-reduction.
// Collective: must be called on all ranks of grid.mpi_com.comm.
template< nse::memType mem, typename T >
bool nse::mpi_write_binary_2d(const std::string& filename,
	const char* mpi_datarep,
	const T* uin, const T* vin,
	const char* uname, const char* vname,

	const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time)
{
	GridId< T > id;
	grid.set_id(id, axis);	// set id on all processors //

	// select the plane communicator, its two axis communicators and whether
	// this rank lies on the host plane
	// NOTE(review): comm/axis variables remain uninitialized for any axis
	// value other than axisXY/axisXZ/axisYZ -- confirm callers pass a plane axis
	MPI_Comm comm_xy, comm_x, comm_y;
	nse_const3d::axisType axisA, axisB;
	int is_mpi_host_plane = 0;
	if (axis == axisXY) {
		axisA = axisX; axisB = axisY;
		comm_xy = grid.mpi_com.comm_xy;
		comm_x = grid.mpi_com.comm_x;
		comm_y = grid.mpi_com.comm_y;
		is_mpi_host_plane = (grid.mpi_com.rank_z == 0);
	}
	if (axis == axisXZ) {
		axisA = axisX; axisB = axisZ;
		comm_xy = grid.mpi_com.comm_xz;
		comm_x = grid.mpi_com.comm_x;
		comm_y = grid.mpi_com.comm_z;
		is_mpi_host_plane = (grid.mpi_com.rank_y == 0);
	}
	if (axis == axisYZ) {
		axisA = axisY; axisB = axisZ;
		comm_xy = grid.mpi_com.comm_yz;
		comm_x = grid.mpi_com.comm_y;
		comm_y = grid.mpi_com.comm_z;
		is_mpi_host_plane = (grid.mpi_com.rank_x == 0);
	}

	// global (MPI-wide) plane dimensions
	int ndimx = grid.mpi_dim_size(axisA),
		ndimy = grid.mpi_dim_size(axisB);

	// coordinate arrays gathered to rank 0 only; other ranks keep them unset
	T *cx, *cy, *ex, *ey;
	T *u, *v;
	if (grid.mpi_com.rank == 0) {
		allocate_vnull(&cx, &cy, ndimx, ndimy);
		allocate_vnull(&ex, &ey, ndimx, ndimy);
	}

	grid.template mpi_gather_center_coord<memCPU>(cx, 0, axisA);
	grid.template mpi_gather_center_coord<memCPU>(cy, 0, axisB);
	grid.template mpi_gather_edge_coord<memCPU>(ex, 0, axisA);
	grid.template mpi_gather_edge_coord<memCPU>(ey, 0, axisB);
	if (mem == memGPU) {
		// stage device data into host buffers for the MPI-IO write
		allocate_vnull(&u, &v, 
			grid.dim_size(axisA) * grid.dim_size(axisB));
		mcopy<memCPU, mem>(u, uin, 
			grid.dim_size(axisA) * grid.dim_size(axisB));
		mcopy<memCPU, mem>(v, vin, 
			grid.dim_size(axisA) * grid.dim_size(axisB));
	}
	else
	{
		// const cast: buffers are only read below
		u = (T*)uin;
		v = (T*)vin;
	}

	int status = 0;
	if (is_mpi_host_plane) {
		// only the host plane writes the file
		status = mpi_write_binary(filename,
			mpi_datarep,
			comm_xy, 0,
			comm_x, comm_y,

			u, v, uname, vname,

			cx, cy, ex, ey,
			id, time);
	}
	// propagate the write status to ranks outside the host plane
	mpi_allreduce(&status, MPI_MAX, grid.mpi_com.comm);

	if (mem == memGPU) deallocate(u, v);	// free staging buffers only
	if (grid.mpi_com.rank == 0) {
		deallocate(cx, cy); deallocate(ex, ey);
	}

	return (status == 1);
}
+
+template< nse::memType mem, typename T >
+bool nse::mpi_write_binary_2d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const T* xin, const char* name,
+
+	const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time)
+{
+	return mpi_write_binary_2d(append_index(filename, idx),
+		mpi_datarep,
+		xin, name, axis, grid, time);
+}
+
+template< nse::memType mem, typename T >
+bool nse::mpi_write_binary_2d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const T* uin, const T* vin,
+	const char* uname, const char* vname,
+
+	const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time)
+{
+	return mpi_write_binary_2d(append_index(filename, idx),
+		mpi_datarep,
+		uin, vin, uname, vname, axis, grid, time);
+}
+// ------------------------------------------------------------------------ //
+
+
+// * MPI-read binary-3D input * //
+template< nse::memType mem, typename T >
+bool nse::mpi_read_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	T* xout,
+	const Grid3d< T, mem >& grid)
+{
+	return mpi_read_binary_3d(filename, mpi_datarep, 
+		xout, nodeC, grid);
+}
+
// MPI-IO reader for a single scalar 3D field: every rank reads its own
// portion collectively; reinterpolates onto the current grid if the file
// grid dimensions differ.
// Collective: must be called on all ranks of grid.mpi_com.comm.
template< nse::memType mem, typename T >
bool nse::mpi_read_binary_3d(const std::string& filename,
	const char* mpi_datarep,
	T* xout,
	const nse_const3d::nodeType node, const Grid3d< T, mem >& grid)
{
	GridId< T > id;		// get id on all processors //
	T *xin;				// distributed array //

	char* name;			// data on host-rank //
	T *cx, *cy, *cz, *ex, *ey, *ez;
	T time_stamp;

	int status = mpi_read_binary(filename,
		mpi_datarep,
		grid.mpi_com.comm, 0,
		grid.mpi_com.comm_x, grid.mpi_com.comm_y, grid.mpi_com.comm_z,

		&xin, &name,

		&cx, &cy, &cz, &ex, &ey, &ez,
		id, &time_stamp);
	if (!status) return false;	// nothing to deallocate //

	if (grid.check_id_dims(id)) {
		// dimensions match: copy the distributed portion directly
		mcopy<mem, memCPU>(xout, xin, grid.size);
	}
	else
	{
		grid.grid_reinterp(xout, xin, node, id);	// - grid-grid(input) interpolation
	}

	// xin is allocated on every rank; metadata only on the host rank
	deallocate(xin);
	if (grid.mpi_com.rank == 0) {
		deallocate(cx, cy, cz);
		deallocate(ex, ey, ez);
		delete[] name;
	}

	return (status == 1);
}
+
// MPI-IO reader for a 3-component vector 3D field (u,v,w): every rank
// reads its own portion collectively; reinterpolates each component onto
// its staggered node if the file grid dimensions differ.
// Collective: must be called on all ranks of grid.mpi_com.comm.
template< nse::memType mem, typename T >
bool nse::mpi_read_binary_3d(const std::string& filename,
	const char* mpi_datarep,
	T* uout, T* vout, T* wout,
	const Grid3d< T, mem >& grid)
{
	GridId< T > id;			// get id on all processors //
	T *uin, *vin, *win;		// distributed arrays //

	char *uname, *vname, *wname;	// data on host-rank //
	T *cx, *cy, *cz, *ex, *ey, *ez;
	T time_stamp;

	int status = mpi_read_binary(filename,
		mpi_datarep,
		grid.mpi_com.comm, 0,
		grid.mpi_com.comm_x, grid.mpi_com.comm_y, grid.mpi_com.comm_z,

		&uin, &vin, &win, &uname, &vname, &wname,

		&cx, &cy, &cz, &ex, &ey, &ez,
		id, &time_stamp);
	if (!status) return false;	// nothing to deallocate //

	if (grid.check_id_dims(id)) {
		// dimensions match: copy the distributed portions directly
		mcopy<mem, memCPU>(uout, uin, grid.size);
		mcopy<mem, memCPU>(vout, vin, grid.size);
		mcopy<mem, memCPU>(wout, win, grid.size);
	}
	else
	{
		grid.grid_reinterp(uout, uin, nodeU, id);	// -U grid-grid(input) interpolation
		grid.grid_reinterp(vout, vin, nodeV, id);	// -V grid-grid(input) interpolation
		grid.grid_reinterp(wout, win, nodeW, id);	// -W grid-grid(input) interpolation
	}

	// field buffers are allocated on every rank; metadata only on host rank
	deallocate(uin, vin, win);
	if (grid.mpi_com.rank == 0) {
		deallocate(cx, cy, cz);
		deallocate(ex, ey, ez);
		delete[] uname; delete[] vname; delete[] wname;
	}

	return (status == 1);
}
+
+template< nse::memType mem, typename T >
+bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	T* xout,
+	const Grid3d< T, mem >& grid)
+{
+	return mpi_read_binary_3d(append_index(filename, idx),
+		mpi_datarep,
+		xout, grid);
+}
+
+template< nse::memType mem, typename T >
+bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	T* xout,
+	const nse_const3d::nodeType node, const Grid3d< T, mem >& grid)
+{
+	return mpi_read_binary_3d(append_index(filename, idx),
+		mpi_datarep,
+		xout, node, grid);
+}
+
+template< nse::memType mem, typename T >
+bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	T* uout, T* vout, T* wout,
+	const Grid3d< T, mem >& grid)
+{
+	return mpi_read_binary_3d(append_index(filename, idx),
+		mpi_datarep,
+		uout, vout, wout, grid);
+}
+// ------------------------------------------------------------------------ //
+
+
+// ------------------------------------------------------------------------ //
+// ------------------------------------------------------------------------ //
+
+
+// * write tecplot-1d|2d|3d output, axis = n-dims * //
// Explicit instantiations of write_tecplot for float and double,
// plain and indexed-filename variants.
template bool nse::write_tecplot(const std::string& filename,
	const float* xin, const char* name,
	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
	const Grid3d< float >& grid, const float time);
template bool nse::write_tecplot(const std::string& filename,
	const double* xin, const char* name,
	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
	const Grid3d< double >& grid, const double time);

template bool nse::write_tecplot(const std::string& filename, const int idx,
	const float* xin, const char* name,
	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
	const Grid3d< float >& grid, const float time);
template bool nse::write_tecplot(const std::string& filename, const int idx,
	const double* xin, const char* name,
	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
	const Grid3d< double >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * intialize:  write tecplot-3d output, F(x,y,z) * //
// Explicit instantiations of write_tecplot_3d for float and double:
// scalar and vector field variants, plain and indexed-filename forms.
template bool nse::write_tecplot_3d(const std::string& filename,
	const float* xin, const char* name,
	const nse_const3d::nodeType node,
	const Grid3d< float >& grid, const float time);
template bool nse::write_tecplot_3d(const std::string& filename,
	const double* xin, const char* name,
	const nse_const3d::nodeType node,
	const Grid3d< double >& grid, const double time);

template bool nse::write_tecplot_3d(const std::string& filename,
	const float* uin, const float* vin, const float* win,
	const char* uname, const char* vname, const char* wname,
	const Grid3d< float >& grid, const float time);
template bool nse::write_tecplot_3d(const std::string& filename,
	const double* uin, const double* vin, const double* win,
	const char* uname, const char* vname, const char* wname,
	const Grid3d< double >& grid, const double time);

template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
	const float* xin, const char* name,
	const nse_const3d::nodeType node,
	const Grid3d< float >& grid, const float time);
template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
	const double* xin, const char* name,
	const nse_const3d::nodeType node,
	const Grid3d< double >& grid, const double time);

template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
	const float* uin, const float* vin, const float* win,
	const char* uname, const char* vname, const char* wname,
	const Grid3d< float >& grid, const float time);
template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
	const double* uin, const double* vin, const double* win,
	const char* uname, const char* vname, const char* wname,
	const Grid3d< double >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write tecplot-3d output [sub-domain], F(x,y,z) * //
+template bool nse::write_tecplot_3d(const std::string& filename,
+	const float* xin, const char* name,
+	const float xmin, const float xmax,
+	const float ymin, const float ymax,
+	const float zmin, const float zmax,
+	const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_3d(const std::string& filename,
+	const double* xin, const char* name,
+	const double xmin, const double xmax,
+	const double ymin, const double ymax,
+	const double zmin, const double zmax,
+	const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_tecplot_3d(const std::string& filename,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const float xmin, const float xmax,
+	const float ymin, const float ymax,
+	const float zmin, const float zmax,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_3d(const std::string& filename,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname,
+	const double xmin, const double xmax,
+	const double ymin, const double ymax,
+	const double zmin, const double zmax,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const float xmin, const float xmax,
+	const float ymin, const float ymax,
+	const float zmin, const float zmax,
+	const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const double xmin, const double xmax,
+	const double ymin, const double ymax,
+	const double zmin, const double zmax,
+	const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const float xmin, const float xmax,
+	const float ymin, const float ymax,
+	const float zmin, const float zmax,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname,
+	const double xmin, const double xmax,
+	const double ymin, const double ymax,
+	const double zmin, const double zmax,
+	const Grid3d< double >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write tecplot-2d output, F(x,y) * //
+template bool nse::write_tecplot_2d(const std::string& filename,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_2d(const std::string& filename,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_tecplot_2d(const std::string& filename,
+	float** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_2d(const std::string& filename,
+	double** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_tecplot_2d(const std::string& filename,
+	const float* uin, const float* vin, const char* uname, const char* vname,
+	const nse_const3d::axisType axis,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_2d(const std::string& filename,
+	const double* uin, const double* vin, const char* uname, const char* vname,
+	const nse_const3d::axisType axis,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	float** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	double** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	const float* uin, const float* vin, const char* uname, const char* vname,
+	const nse_const3d::axisType axis,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	const double* uin, const double* vin, const char* uname, const char* vname,
+	const nse_const3d::axisType axis,
+	const Grid3d< double >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write tecplot-1d output, F(x) * //
+template bool nse::write_tecplot_1d(const std::string& filename,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write tecplot-1d scaled output, F(x) * //
+template bool nse::write_tecplot_1d(const std::string& filename,
+	const float* xin, const char* name,
+	const float vscale, const float cscale,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename,
+	const double* xin, const char* name,
+	const double vscale, const double cscale,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const float vscale, const float cscale,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const double vscale, const double cscale,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write tecplot-1d output, F{i}(x), {i=1,nvar} * //
+template bool nse::write_tecplot_1d(const std::string& filename,
+	float** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename,
+	double** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	float** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	double** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_tecplot_1d(const std::string& filename,
+	float** xin, const std::string* name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename,
+	double** xin, const std::string* name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	float** xin, const std::string* name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	double** xin, const std::string* name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write binary-1d|2d|3d output, axis = n-dims * //
+template bool nse::write_binary(const std::string& filename,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_binary(const std::string& filename,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_binary(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis,
+	const Grid3d< double >& grid, const double time);
+// -------------------------------------------------------------------- //
+
+// * initialize: write binary-3D output * //
+template bool nse::write_binary_stamp(const std::string& filename,
+	const binStamp< int >& index_stamp,
+	const binStamp< double >& cpu_stamp,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_binary_stamp(const std::string& filename,
+	const binStamp< int >& index_stamp,
+	const binStamp< double >& cpu_stamp,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary_stamp(const std::string& filename,
+	const binNamedStamp< int >& index_stamp,
+	const binNamedStamp< double >& cpu_stamp,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_binary_stamp(const std::string& filename,
+	const binNamedStamp< int >& index_stamp,
+	const binNamedStamp< double >& cpu_stamp,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary_3d(const std::string& filename,
+	const float* xin, const char* name,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_binary_3d(const std::string& filename,
+	const double* xin, const char* name,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary_3d(const std::string& filename,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_binary_3d(const std::string& filename,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary_stamp(const std::string& filename, const int idx,
+	const binStamp< int >& index_stamp,
+	const binStamp< double >& cpu_stamp,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_binary_stamp(const std::string& filename, const int idx,
+	const binStamp< int >& index_stamp,
+	const binStamp< double >& cpu_stamp,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary_stamp(const std::string& filename, const int idx,
+	const binNamedStamp< int >& index_stamp,
+	const binNamedStamp< double >& cpu_stamp,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_binary_stamp(const std::string& filename, const int idx,
+	const binNamedStamp< int >& index_stamp,
+	const binNamedStamp< double >& cpu_stamp,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary_3d(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_binary_3d(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary_3d(const std::string& filename, const int idx,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< float >& grid, const float time);
+template bool nse::write_binary_3d(const std::string& filename, const int idx,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< double >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write binary-2D output * //
+template bool nse::write_binary_2d(const std::string& filename,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::write_binary_2d(const std::string& filename,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary_2d(const std::string& filename,
+	const float* uin, const float* vin,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::write_binary_2d(const std::string& filename,
+	const double* uin, const double* vin,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary_2d(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::write_binary_2d(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary_2d(const std::string& filename, const int idx,
+	const float* uin, const float* vin,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::write_binary_2d(const std::string& filename, const int idx,
+	const double* uin, const double* vin,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: read binary-3D input * //
+template bool nse::read_binary_stamp(const std::string& filename,
+	binStamp< int >& index_stamp,
+	binStamp< double >& cpu_stamp,
+	const Grid3d< float >& grid, float* time);
+template bool nse::read_binary_stamp(const std::string& filename,
+	binStamp< int >& index_stamp,
+	binStamp< double >& cpu_stamp,
+	const Grid3d< double >& grid, double* time);
+
+template bool nse::read_binary_stamp(const std::string& filename,
+	binNamedStamp< int >& index_stamp,
+	binNamedStamp< double >& cpu_stamp,
+	const Grid3d< float >& grid, float* time);
+template bool nse::read_binary_stamp(const std::string& filename,
+	binNamedStamp< int >& index_stamp,
+	binNamedStamp< double >& cpu_stamp,
+	const Grid3d< double >& grid, double* time);
+
+template bool nse::read_binary_3d(const std::string& filename,
+	float* xout, const Grid3d< float >& grid);
+template bool nse::read_binary_3d(const std::string& filename,
+	double* xout, const Grid3d< double >& grid);
+
+template bool nse::read_binary_3d(const std::string& filename,
+	float* xout, const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template bool nse::read_binary_3d(const std::string& filename,
+	double* xout, const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template bool nse::read_binary_3d(const std::string& filename,
+	float* uout, float* vout, float* wout, const Grid3d< float >& grid);
+template bool nse::read_binary_3d(const std::string& filename,
+	double* uout, double* vout, double* wout, const Grid3d< double >& grid);
+
+template bool nse::read_binary_stamp(const std::string& filename, const int idx,
+	binStamp< int >& index_stamp,
+	binStamp< double >& cpu_stamp,
+	const Grid3d< float >& grid, float* time);
+template bool nse::read_binary_stamp(const std::string& filename, const int idx,
+	binStamp< int >& index_stamp,
+	binStamp< double >& cpu_stamp,
+	const Grid3d< double >& grid, double* time);
+
+template bool nse::read_binary_stamp(const std::string& filename, const int idx,
+	binNamedStamp< int >& index_stamp,
+	binNamedStamp< double >& cpu_stamp,
+	const Grid3d< float >& grid, float* time);
+template bool nse::read_binary_stamp(const std::string& filename, const int idx,
+	binNamedStamp< int >& index_stamp,
+	binNamedStamp< double >& cpu_stamp,
+	const Grid3d< double >& grid, double* time);
+
+template bool nse::read_binary_3d(const std::string& filename, const int idx,
+	float* xout, const Grid3d< float >& grid);
+template bool nse::read_binary_3d(const std::string& filename, const int idx,
+	double* xout, const Grid3d< double >& grid);
+
+template bool nse::read_binary_3d(const std::string& filename, const int idx,
+	float* xout, const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template bool nse::read_binary_3d(const std::string& filename, const int idx,
+	double* xout, const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template bool nse::read_binary_3d(const std::string& filename, const int idx,
+	float* uout, float* vout, float* wout, const Grid3d< float >& grid);
+template bool nse::read_binary_3d(const std::string& filename, const int idx,
+	double* uout, double* vout, double* wout, const Grid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: read binary-2D input * //
+template bool nse::read_binary_2d(const std::string& filename,
+	float* xout, const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary_2d(const std::string& filename,
+	double* xout, const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+template bool nse::read_binary_2d(const std::string& filename,
+	float* xout, const nse_const3d::nodeType node, const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary_2d(const std::string& filename,
+	double* xout, const nse_const3d::nodeType node, const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+template bool nse::read_binary_2d(const std::string& filename,
+	float* uout, float* vout, const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary_2d(const std::string& filename,
+	double* uout, double* vout, const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+template bool nse::read_binary_2d(const std::string& filename, const int idx,
+	float* xout, const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary_2d(const std::string& filename, const int idx,
+	double* xout, const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+template bool nse::read_binary_2d(const std::string& filename, const int idx,
+	float* xout, const nse_const3d::nodeType node, const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary_2d(const std::string& filename, const int idx,
+	double* xout, const nse_const3d::nodeType node, const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+template bool nse::read_binary_2d(const std::string& filename, const int idx,
+	float* uout, float* vout, const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary_2d(const std::string& filename, const int idx,
+	double* uout, double* vout, const nse_const3d::axisType axis, const Grid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: MPI-write binary-3D output * //
+template bool nse::mpi_write_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	const float* xin, const char* name,
+	const Grid3d< float >& grid, const float time);
+template bool nse::mpi_write_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	const double* xin, const char* name,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::mpi_write_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< float >& grid, const float time);
+template bool nse::mpi_write_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::mpi_write_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const float* xin, const char* name,
+	const Grid3d< float >& grid, const float time);
+template bool nse::mpi_write_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const double* xin, const char* name,
+	const Grid3d< double >& grid, const double time);
+
+template bool nse::mpi_write_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< float >& grid, const float time);
+template bool nse::mpi_write_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< double >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: MPI-write binary-2D output * //
+template bool nse::mpi_write_binary_2d(const std::string& filename,
+	const char* mpi_datarep,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::mpi_write_binary_2d(const std::string& filename,
+	const char* mpi_datarep,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+
+template bool nse::mpi_write_binary_2d(const std::string& filename,
+	const char* mpi_datarep,
+	const float* uin, const float* vin,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::mpi_write_binary_2d(const std::string& filename,
+	const char* mpi_datarep,
+	const double* uin, const double* vin,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+
+template bool nse::mpi_write_binary_2d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::mpi_write_binary_2d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+
+template bool nse::mpi_write_binary_2d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const float* uin, const float* vin,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::mpi_write_binary_2d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const double* uin, const double* vin,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: MPI-read binary-3D input * //
+template bool nse::mpi_read_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	float* xout, const Grid3d< float >& grid);
+template bool nse::mpi_read_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	double* xout, const Grid3d< double >& grid);
+
+template bool nse::mpi_read_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	float* xout, const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template bool nse::mpi_read_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	double* xout, const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template bool nse::mpi_read_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	float* uout, float* vout, float* wout, const Grid3d< float >& grid);
+template bool nse::mpi_read_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	double* uout, double* vout, double* wout, const Grid3d< double >& grid);
+
+template bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	float* xout, const Grid3d< float >& grid);
+template bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	double* xout, const Grid3d< double >& grid);
+
+template bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	float* xout, const nse_const3d::nodeType node, const Grid3d< float >& grid);
+template bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	double* xout, const nse_const3d::nodeType node, const Grid3d< double >& grid);
+
+template bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	float* uout, float* vout, float* wout, const Grid3d< float >& grid);
+template bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	double* uout, double* vout, double* wout, const Grid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+
+// ------------------------------------------------------------------------ //
+// ------------------------------------------------------------------------ //
+
+#ifndef EXCLUDE_GPU_BRANCH
+// * write tecplot-1d|2d|3d output, axis = n-dims * //
+template bool nse::write_tecplot(const std::string& filename,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot(const std::string& filename,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize:  write tecplot-3d output, F(x,y,z) * //
+template bool nse::write_tecplot_3d(const std::string& filename,
+	const float* xin, const char* name,
+	const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_3d(const std::string& filename,
+	const double* xin, const char* name,
+	const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_3d(const std::string& filename,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_3d(const std::string& filename,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< double, memGPU >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write tecplot-3d output [sub-domain], F(x,y,z) * //
+template bool nse::write_tecplot_3d(const std::string& filename,
+	const float* xin, const char* name,
+	const float xmin, const float xmax,
+	const float ymin, const float ymax,
+	const float zmin, const float zmax,
+	const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_3d(const std::string& filename,
+	const double* xin, const char* name,
+	const double xmin, const double xmax,
+	const double ymin, const double ymax,
+	const double zmin, const double zmax,
+	const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_3d(const std::string& filename,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const float xmin, const float xmax,
+	const float ymin, const float ymax,
+	const float zmin, const float zmax,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_3d(const std::string& filename,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname,
+	const double xmin, const double xmax,
+	const double ymin, const double ymax,
+	const double zmin, const double zmax,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const float xmin, const float xmax,
+	const float ymin, const float ymax,
+	const float zmin, const float zmax,
+	const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const double xmin, const double xmax,
+	const double ymin, const double ymax,
+	const double zmin, const double zmax,
+	const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const float xmin, const float xmax,
+	const float ymin, const float ymax,
+	const float zmin, const float zmax,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_3d(const std::string& filename, const int idx,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname, // was 'name': cosmetic rename to match the float instantiation and the primary template
+	const double xmin, const double xmax,
+	const double ymin, const double ymax,
+	const double zmin, const double zmax,
+	const Grid3d< double, memGPU >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write tecplot-2d output, F(x,y) * //
+template bool nse::write_tecplot_2d(const std::string& filename,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_2d(const std::string& filename,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_2d(const std::string& filename,
+	float** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_2d(const std::string& filename,
+	double** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_2d(const std::string& filename,
+	const float* uin, const float* vin, const char* uname, const char* vname,
+	const nse_const3d::axisType axis,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_2d(const std::string& filename,
+	const double* uin, const double* vin, const char* uname, const char* vname,
+	const nse_const3d::axisType axis,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	float** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	double** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	const float* uin, const float* vin, const char* uname, const char* vname,
+	const nse_const3d::axisType axis,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_2d(const std::string& filename, const int idx,
+	const double* uin, const double* vin, const char* uname, const char* vname,
+	const nse_const3d::axisType axis,
+	const Grid3d< double, memGPU >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write tecplot-1d output, F(x) * //
+template bool nse::write_tecplot_1d(const std::string& filename,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write tecplot-1d scaled output, F(x) * //
+template bool nse::write_tecplot_1d(const std::string& filename,
+	const float* xin, const char* name,
+	const float vscale, const float cscale,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename,
+	const double* xin, const char* name,
+	const double vscale, const double cscale,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const float vscale, const float cscale,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const double vscale, const double cscale,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write tecplot-1d output, F{i}(x), {i=1,nvar} * //
+template bool nse::write_tecplot_1d(const std::string& filename,
+	float** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename,
+	double** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	float** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_tecplot_1d(const std::string& filename, const int idx,
+	double** xin, const char** name, const int nvar,
+	const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+	const Grid3d< double, memGPU >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write binary-1d|2d|3d output, axis = n-dims * //
+template bool nse::write_binary(const std::string& filename,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_binary(const std::string& filename,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_binary(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_binary(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis,
+	const Grid3d< double, memGPU >& grid, const double time);
+// -------------------------------------------------------------------- //
+
+// * initialize: write binary-3D output * //
+template bool nse::write_binary_stamp(const std::string& filename,
+	const binStamp< int >& index_stamp,
+	const binStamp< double >& cpu_stamp,
+	const Grid3d< float, memGPU >& grid, const float time);
+
+template bool nse::write_binary_stamp(const std::string& filename,
+	const binStamp< int >& index_stamp,
+	const binStamp< double >& cpu_stamp,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_binary_3d(const std::string& filename,
+	const float* xin, const char* name,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_binary_3d(const std::string& filename,
+	const double* xin, const char* name,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_binary_3d(const std::string& filename,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_binary_3d(const std::string& filename,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_binary_stamp(const std::string& filename, const int idx,
+	const binStamp< int >& index_stamp,
+	const binStamp< double >& cpu_stamp,
+	const Grid3d< float, memGPU >& grid, const float time);
+
+template bool nse::write_binary_stamp(const std::string& filename, const int idx,
+	const binStamp< int >& index_stamp,
+	const binStamp< double >& cpu_stamp,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_binary_3d(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_binary_3d(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_binary_3d(const std::string& filename, const int idx,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_binary_3d(const std::string& filename, const int idx,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< double, memGPU >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: write binary-2D output * //
+template bool nse::write_binary_2d(const std::string& filename,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_binary_2d(const std::string& filename,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_binary_2d(const std::string& filename,
+	const float* uin, const float* vin,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_binary_2d(const std::string& filename,
+	const double* uin, const double* vin,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_binary_2d(const std::string& filename, const int idx,
+	const float* xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_binary_2d(const std::string& filename, const int idx,
+	const double* xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::write_binary_2d(const std::string& filename, const int idx,
+	const float* uin, const float* vin,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::write_binary_2d(const std::string& filename, const int idx,
+	const double* uin, const double* vin,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< double, memGPU >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: read binary-3D input * //
+template bool nse::read_binary_stamp(const std::string& filename,
+	binStamp< int >& index_stamp,
+	binStamp< double >& cpu_stamp,
+	const Grid3d< float, memGPU >& grid, float* time);
+template bool nse::read_binary_stamp(const std::string& filename,
+	binStamp< int >& index_stamp,
+	binStamp< double >& cpu_stamp,
+	const Grid3d< double, memGPU >& grid, double* time);
+
+template bool nse::read_binary_3d(const std::string& filename,
+	float* xout, const Grid3d< float, memGPU >& grid);
+template bool nse::read_binary_3d(const std::string& filename,
+	double* xout, const Grid3d< double, memGPU >& grid);
+
+template bool nse::read_binary_3d(const std::string& filename,
+	float* xout, const nse_const3d::nodeType node, const Grid3d< float, memGPU >& grid);
+template bool nse::read_binary_3d(const std::string& filename,
+	double* xout, const nse_const3d::nodeType node, const Grid3d< double, memGPU >& grid);
+
+template bool nse::read_binary_3d(const std::string& filename,
+	float* uout, float* vout, float* wout, const Grid3d< float, memGPU >& grid);
+template bool nse::read_binary_3d(const std::string& filename,
+	double* uout, double* vout, double* wout, const Grid3d< double, memGPU >& grid);
+
+template bool nse::read_binary_stamp(const std::string& filename, const int idx,
+	binStamp< int >& index_stamp,
+	binStamp< double >& cpu_stamp,
+	const Grid3d< float, memGPU >& grid, float* time);
+template bool nse::read_binary_stamp(const std::string& filename, const int idx,
+	binStamp< int >& index_stamp,
+	binStamp< double >& cpu_stamp,
+	const Grid3d< double, memGPU >& grid, double* time);
+
+template bool nse::read_binary_3d(const std::string& filename, const int idx,
+	float* xout, const Grid3d< float, memGPU >& grid);
+template bool nse::read_binary_3d(const std::string& filename, const int idx,
+	double* xout, const Grid3d< double, memGPU >& grid);
+
+template bool nse::read_binary_3d(const std::string& filename, const int idx,
+	float* xout, const nse_const3d::nodeType node, const Grid3d< float, memGPU >& grid);
+template bool nse::read_binary_3d(const std::string& filename, const int idx,
+	double* xout, const nse_const3d::nodeType node, const Grid3d< double, memGPU >& grid);
+
+template bool nse::read_binary_3d(const std::string& filename, const int idx,
+	float* uout, float* vout, float* wout, const Grid3d< float, memGPU >& grid);
+template bool nse::read_binary_3d(const std::string& filename, const int idx,
+	double* uout, double* vout, double* wout, const Grid3d< double, memGPU >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: read binary-2D input * //
+template bool nse::read_binary_2d(const std::string& filename,
+	float* xout, const nse_const3d::axisType axis, const Grid3d< float, memGPU >& grid);
+template bool nse::read_binary_2d(const std::string& filename,
+	double* xout, const nse_const3d::axisType axis, const Grid3d< double, memGPU >& grid);
+
+template bool nse::read_binary_2d(const std::string& filename,
+	float* xout, const nse_const3d::nodeType node, const nse_const3d::axisType axis, const Grid3d< float, memGPU >& grid);
+template bool nse::read_binary_2d(const std::string& filename,
+	double* xout, const nse_const3d::nodeType node, const nse_const3d::axisType axis, const Grid3d< double, memGPU >& grid);
+
+template bool nse::read_binary_2d(const std::string& filename,
+	float* uout, float* vout, const nse_const3d::axisType axis, const Grid3d< float, memGPU >& grid);
+template bool nse::read_binary_2d(const std::string& filename,
+	double* uout, double* vout, const nse_const3d::axisType axis, const Grid3d< double, memGPU >& grid);
+
+template bool nse::read_binary_2d(const std::string& filename, const int idx,
+	float* xout, const nse_const3d::axisType axis, const Grid3d< float, memGPU >& grid);
+template bool nse::read_binary_2d(const std::string& filename, const int idx,
+	double* xout, const nse_const3d::axisType axis, const Grid3d< double, memGPU >& grid);
+
+template bool nse::read_binary_2d(const std::string& filename, const int idx,
+	float* xout, const nse_const3d::nodeType node, const nse_const3d::axisType axis, const Grid3d< float, memGPU >& grid);
+template bool nse::read_binary_2d(const std::string& filename, const int idx,
+	double* xout, const nse_const3d::nodeType node, const nse_const3d::axisType axis, const Grid3d< double, memGPU >& grid);
+
+template bool nse::read_binary_2d(const std::string& filename, const int idx,
+	float* uout, float* vout, const nse_const3d::axisType axis, const Grid3d< float, memGPU >& grid);
+template bool nse::read_binary_2d(const std::string& filename, const int idx,
+	double* uout, double* vout, const nse_const3d::axisType axis, const Grid3d< double, memGPU >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: MPI-write binary output * //
+template bool nse::mpi_write_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	const float* xin, const char* name,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::mpi_write_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	const double* xin, const char* name,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::mpi_write_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::mpi_write_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::mpi_write_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const float* xin, const char* name,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::mpi_write_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const double* xin, const char* name,
+	const Grid3d< double, memGPU >& grid, const double time);
+
+template bool nse::mpi_write_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const float* uin, const float* vin, const float* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< float, memGPU >& grid, const float time);
+template bool nse::mpi_write_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	const double* uin, const double* vin, const double* win,
+	const char* uname, const char* vname, const char* wname,
+	const Grid3d< double, memGPU >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: MPI-read binary-3D input * //
+template bool nse::mpi_read_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	float* xout, const Grid3d< float, memGPU >& grid);
+template bool nse::mpi_read_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	double* xout, const Grid3d< double, memGPU >& grid);
+
+template bool nse::mpi_read_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	float* xout, const nse_const3d::nodeType node, const Grid3d< float, memGPU >& grid);
+template bool nse::mpi_read_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	double* xout, const nse_const3d::nodeType node, const Grid3d< double, memGPU >& grid);
+
+template bool nse::mpi_read_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	float* uout, float* vout, float* wout, const Grid3d< float, memGPU >& grid);
+template bool nse::mpi_read_binary_3d(const std::string& filename,
+	const char* mpi_datarep,
+	double* uout, double* vout, double* wout, const Grid3d< double, memGPU >& grid);
+
+template bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	float* xout, const Grid3d< float, memGPU >& grid);
+template bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	double* xout, const Grid3d< double, memGPU >& grid);
+
+template bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	float* xout, const nse_const3d::nodeType node, const Grid3d< float, memGPU >& grid);
+template bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	double* xout, const nse_const3d::nodeType node, const Grid3d< double, memGPU >& grid);
+
+template bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	float* uout, float* vout, float* wout, const Grid3d< float, memGPU >& grid);
+template bool nse::mpi_read_binary_3d(const std::string& filename, const int idx,
+	const char* mpi_datarep,
+	double* uout, double* vout, double* wout, const Grid3d< double, memGPU >& grid);
+// ------------------------------------------------------------------------ //
+#endif
diff --git a/nse-io3d.h b/nse-io3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..50c4b60bd014cd21a63376ba3d1c5a5ebb2795a0
--- /dev/null
+++ b/nse-io3d.h
@@ -0,0 +1,440 @@
+#pragma once
+
+// [nse-io3d.h]: I/O on 3D Grids
+//
+// -------------------------------------------------------------------------------------------- //
+// TO DO:
+// 
+
+#include "nse-sys.h"
+#include "grid3d.h"
+
+#include "bin-stamp.h"
+#include "bin-named-stamp.h"
+
+#include <string>
+
+
+namespace nse
+{
+	// * write tecplot-1d|2d|3d output, axis = n-dims * //
+	template< memType mem, typename T >
+	bool write_tecplot(const std::string& filename,
+		const T* xin, const char* name,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node, 
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot(const std::string& filename, const int idx,
+		const T* xin, const char* name,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node,
+		const Grid3d< T, mem >& grid, const T time);
+	// -------------------------------------------------------------------- //
+
+	// * write tecplot-3d output, F(x,y,z) * //
+	template< memType mem, typename T >
+	bool write_tecplot_3d(const std::string& filename,
+		const T* xin, const char* name,
+		const nse_const3d::nodeType node, const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_3d(const std::string& filename,
+		const T* uin, const T* vin, const T* win,
+		const char* uname, const char* vname, const char* wname,
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_3d(const std::string& filename, const int idx,
+		const T* xin, const char* name,
+		const nse_const3d::nodeType node, const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_3d(const std::string& filename, const int idx,
+		const T* uin, const T* vin, const T* win,
+		const char* uname, const char* vname, const char* wname,
+		const Grid3d< T, mem >& grid, const T time);
+	// -------------------------------------------------------------------- //
+
+	// * write tecplot-3d output [sub-domain], F(x,y,z) * //
+	template< memType mem, typename T >
+	bool write_tecplot_3d(const std::string& filename,
+		const T* xin, const char* name,
+		const T xmin, const T xmax,
+		const T ymin, const T ymax,
+		const T zmin, const T zmax,
+		const nse_const3d::nodeType node, const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_3d(const std::string& filename,
+		const T* uin, const T* vin, const T* win,
+		const char* uname, const char* vname, const char* wname,
+		const T xmin, const T xmax,
+		const T ymin, const T ymax,
+		const T zmin, const T zmax,
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_3d(const std::string& filename, const int idx,
+		const T* xin, const char* name,
+		const T xmin, const T xmax,
+		const T ymin, const T ymax,
+		const T zmin, const T zmax,
+		const nse_const3d::nodeType node, const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_3d(const std::string& filename, const int idx,
+		const T* uin, const T* vin, const T* win,
+		const char* uname, const char* vname, const char* wname,
+		const T xmin, const T xmax,
+		const T ymin, const T ymax,
+		const T zmin, const T zmax,
+		const Grid3d< T, mem >& grid, const T time);
+	// -------------------------------------------------------------------- //
+
+	// * write tecplot-2d output, F(x,y) * //
+	template< memType mem, typename T >
+	bool write_tecplot_2d(const std::string& filename,
+		const T* xin, const char* name,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[XY,XZ,YZ], node=[C,(U|V,U|W,V|W)]
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_2d(const std::string& filename,
+		T** xin, const char** name, const int nvar,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[XY,XZ,YZ], node=[C,(U|V,U|W,V|W)]
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_2d(const std::string& filename,
+		const T* uin, const T* vin, const char* uname, const char* vname,
+		const nse_const3d::axisType axis,	// axis=[XY,XZ,YZ], node=[C]
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_2d(const std::string& filename, const int idx,
+		const T* xin, const char* name,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[XY,XZ,YZ], node=[C,(U|V,U|W,V|W)]
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_2d(const std::string& filename, const int idx,
+		T** xin, const char** name, const int nvar,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[XY,XZ,YZ], node=[C,(U|V,U|W,V|W)]
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_2d(const std::string& filename, const int idx,
+		const T* uin, const T* vin, const char* uname, const char* vname,
+		const nse_const3d::axisType axis,	// axis=[XY,XZ,YZ], node=[C]
+		const Grid3d< T, mem >& grid, const T time);
+	// -------------------------------------------------------------------- //
+
+	// * write tecplot-1d output, F(x) * //
+	template< memType mem, typename T >
+	bool write_tecplot_1d(const std::string& filename,
+		const T* xin, const char* name,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_1d(const std::string& filename, const int idx,
+		const T* xin, const char* name,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+		const Grid3d< T, mem >& grid, const T time);
+	// -------------------------------------------------------------------- //
+
+	// * write tecplot-1d scaled output, F(x) * //
+	template< memType mem, typename T >
+	bool write_tecplot_1d(const std::string& filename,
+		const T* xin, const char* name,
+		const T vscale, const T cscale,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_1d(const std::string& filename, const int idx,
+		const T* xin, const char* name,
+		const T vscale, const T cscale,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+		const Grid3d< T, mem >& grid, const T time);
+	// -------------------------------------------------------------------- //
+
+	// * write tecplot-1d output, F{i}(x), {i=1,nvar} * //
+	template< memType mem, typename T >
+	bool write_tecplot_1d(const std::string& filename,
+		T** xin, const char** name, const int nvar,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_1d(const std::string& filename, const int idx,
+		T** xin, const char** name, const int nvar,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_1d(const std::string& filename,
+		T** xin, const std::string* name, const int nvar,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_tecplot_1d(const std::string& filename, const int idx,
+		T** xin, const std::string* name, const int nvar,
+		const nse_const3d::axisType axis, const nse_const3d::nodeType node,	// axis=[X,Y,Z], node=[C,(U,V,W)]
+		const Grid3d< T, mem >& grid, const T time);
+
+	// -------------------------------------------------------------------- //
+
+
+	// * write binary output, dimensionality chosen by 'axis' (2D|3D); 1D axes are NOT supported and return false * //
+	template< memType mem, typename T >
+	bool write_binary(const std::string& filename,
+		const T* xin, const char* name,
+		const nse_const3d::axisType axis,
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_binary(const std::string& filename, const int idx,	// indexed-output variant (presumably 'idx' tags the filename) -- confirm in implementation
+		const T* xin, const char* name,
+		const nse_const3d::axisType axis,
+		const Grid3d< T, mem >& grid, const T time);
+	// -------------------------------------------------------------------- //
+
+
+	// * write binary-3D output * //
+	template< memType mem, typename T >
+	bool write_binary_stamp(const std::string& filename,
+		const binStamp< int >& index_stamp,
+		const binStamp< double >& cpu_stamp,
+		const Grid3d< T, mem >& grid, const T time);
+	template< memType mem, typename T >
+	bool write_binary_stamp(const std::string& filename,
+		const binNamedStamp< int >& index_stamp,
+		const binNamedStamp< double >& cpu_stamp,
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_binary_3d(const std::string& filename,
+		const T* xin, const char* name,
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_binary_3d(const std::string& filename,
+		const T* uin, const T* vin, const T* win,
+		const char* uname, const char* vname, const char* wname,
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_binary_stamp(const std::string& filename, const int idx,
+		const binStamp< int >& index_stamp,
+		const binStamp< double >& cpu_stamp,
+		const Grid3d< T, mem >& grid, const T time);
+	template< memType mem, typename T >
+	bool write_binary_stamp(const std::string& filename, const int idx,
+		const binNamedStamp< int >& index_stamp,
+		const binNamedStamp< double >& cpu_stamp,
+		const Grid3d< T, mem >& grid, const T time);
+
+
+	template< memType mem, typename T >
+	bool write_binary_3d(const std::string& filename, const int idx,
+		const T* xin, const char* name,
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_binary_3d(const std::string& filename, const int idx,
+		const T* uin, const T* vin, const T* win,
+		const char* uname, const char* vname, const char* wname,
+		const Grid3d< T, mem >& grid, const T time);
+	// -------------------------------------------------------------------- //
+
+	// * write binary-2D output * //
+	template< memType mem, typename T >
+	bool write_binary_2d(const std::string& filename,
+		const T* xin, const char* name,
+		const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_binary_2d(const std::string& filename,
+		const T* uin, const T* vin,
+		const char* uname, const char* vname,
+		const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_binary_2d(const std::string& filename, const int idx,
+		const T* xin, const char* name,
+		const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool write_binary_2d(const std::string& filename, const int idx,
+		const T* uin, const T* vin,
+		const char* uname, const char* vname,
+		const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time);
+	// -------------------------------------------------------------------- //
+
+	// * read binary-3D input * //
+	template< memType mem, typename T >
+	bool read_binary_stamp(const std::string& filename,
+		binStamp< int >& index_stamp,
+		binStamp< double >& cpu_stamp,
+		const Grid3d< T, mem >& grid, T* time);
+	template< memType mem, typename T >
+	bool read_binary_stamp(const std::string& filename,
+		binNamedStamp< int >& index_stamp,
+		binNamedStamp< double >& cpu_stamp,
+		const Grid3d< T, mem >& grid, T* time);
+
+	template< memType mem, typename T >
+	bool read_binary_3d(const std::string& filename,
+		T* xout, const Grid3d< T, mem >& grid);
+
+	template< memType mem, typename T >
+	bool read_binary_3d(const std::string& filename,
+		T* xout, const nse_const3d::nodeType node, const Grid3d< T, mem >& grid);
+
+	template< memType mem, typename T >
+	bool read_binary_3d(const std::string& filename,
+		T* uout, T* vout, T* wout, const Grid3d< T, mem >& grid);
+
+
+	template< memType mem, typename T >
+	bool read_binary_stamp(const std::string& filename, const int idx,
+		binStamp< int >& index_stamp,
+		binStamp< double >& cpu_stamp,
+		const Grid3d< T, mem >& grid, T* time);
+	template< memType mem, typename T >
+	bool read_binary_stamp(const std::string& filename, const int idx,
+		binNamedStamp< int >& index_stamp,
+		binNamedStamp< double >& cpu_stamp,
+		const Grid3d< T, mem >& grid, T* time);
+
+	template< memType mem, typename T >
+	bool read_binary_3d(const std::string& filename, const int idx,
+		T* xout, const Grid3d< T, mem >& grid);
+
+	template< memType mem, typename T >
+	bool read_binary_3d(const std::string& filename, const int idx,
+		T* xout, const nse_const3d::nodeType node, const Grid3d< T, mem >& grid);
+
+	template< memType mem, typename T >
+	bool read_binary_3d(const std::string& filename, const int idx,
+		T* uout, T* vout, T* wout, const Grid3d< T, mem >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * read binary-2D input * //
+	// --- no grid reinterpolation & no MPI inside
+	template< memType mem, typename T >
+	bool read_binary_2d(const std::string& filename,
+		T* xout, const nse_const3d::axisType axis, const Grid3d< T, mem >& grid);
+
+	template< memType mem, typename T >
+	bool read_binary_2d(const std::string& filename,
+		T* xout, const nse_const3d::nodeType node, const nse_const3d::axisType axis, const Grid3d< T, mem >& grid);
+
+	template< memType mem, typename T >
+	bool read_binary_2d(const std::string& filename,
+		T* uout, T* vout, const nse_const3d::axisType axis, const Grid3d< T, mem >& grid);
+
+
+	template< memType mem, typename T >
+	bool read_binary_2d(const std::string& filename, const int idx,
+		T* xout, const nse_const3d::axisType axis, const Grid3d< T, mem >& grid);
+
+	template< memType mem, typename T >
+	bool read_binary_2d(const std::string& filename, const int idx,
+		T* xout, const nse_const3d::nodeType node, const nse_const3d::axisType axis, const Grid3d< T, mem >& grid);
+
+	template< memType mem, typename T >
+	bool read_binary_2d(const std::string& filename, const int idx,
+		T* uout, T* vout, const nse_const3d::axisType axis, const Grid3d< T, mem >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * MPI-write binary-3D output * //
+	template< memType mem, typename T >
+	bool mpi_write_binary_3d(const std::string& filename,
+		const char* mpi_datarep,
+		const T* xin, const char* name,
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool mpi_write_binary_3d(const std::string& filename,
+		const char* mpi_datarep,
+		const T* uin, const T* vin, const T* win,
+		const char* uname, const char* vname, const char* wname,
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool mpi_write_binary_3d(const std::string& filename, const int idx,
+		const char* mpi_datarep,
+		const T* xin, const char* name,
+		const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool mpi_write_binary_3d(const std::string& filename, const int idx,
+		const char* mpi_datarep,
+		const T* uin, const T* vin, const T* win,
+		const char* uname, const char* vname, const char* wname,
+		const Grid3d< T, mem >& grid, const T time);
+	// -------------------------------------------------------------------- //
+
+	// * MPI-write binary-2D output * //
+	template< memType mem, typename T >
+	bool mpi_write_binary_2d(const std::string& filename,
+		const char* mpi_datarep,
+		const T* xin, const char* name,
+		const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool mpi_write_binary_2d(const std::string& filename,
+		const char* mpi_datarep,
+		const T* uin, const T* vin,
+		const char* uname, const char* vname,
+		const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool mpi_write_binary_2d(const std::string& filename, const int idx,
+		const char* mpi_datarep,
+		const T* xin, const char* name,
+		const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time);
+
+	template< memType mem, typename T >
+	bool mpi_write_binary_2d(const std::string& filename, const int idx,
+		const char* mpi_datarep,
+		const T* uin, const T* vin,
+		const char* uname, const char* vname,
+		const nse_const3d::axisType axis, const Grid3d< T, mem >& grid, const T time);
+	// -------------------------------------------------------------------- //
+
+	// * MPI-read binary-3D input; 'mpi_datarep' is the MPI-I/O data representation (default "native", see _NSE_MPI_IO_DATAREP_DEFAULT in nse-sys.h) * //
+	template< memType mem, typename T >
+	bool mpi_read_binary_3d(const std::string& filename,
+		const char* mpi_datarep,
+		T* xout, const Grid3d< T, mem >& grid);
+
+	template< memType mem, typename T >
+	bool mpi_read_binary_3d(const std::string& filename,
+		const char* mpi_datarep,
+		T* xout, const nse_const3d::nodeType node, const Grid3d< T, mem >& grid);
+
+	template< memType mem, typename T >
+	bool mpi_read_binary_3d(const std::string& filename,
+		const char* mpi_datarep,
+		T* uout, T* vout, T* wout, const Grid3d< T, mem >& grid);
+	// --- indexed-input variants below (presumably 'idx' tags the filename) --- //
+	template< memType mem, typename T >
+	bool mpi_read_binary_3d(const std::string& filename, const int idx,
+		const char* mpi_datarep,
+		T* xout, const Grid3d< T, mem >& grid);
+
+	template< memType mem, typename T >
+	bool mpi_read_binary_3d(const std::string& filename, const int idx,
+		const char* mpi_datarep,
+		T* xout, const nse_const3d::nodeType node, const Grid3d< T, mem >& grid);
+
+	template< memType mem, typename T >
+	bool mpi_read_binary_3d(const std::string& filename, const int idx,
+		const char* mpi_datarep,
+		T* uout, T* vout, T* wout, const Grid3d< T, mem >& grid);
+	// -------------------------------------------------------------------- //
+}
diff --git a/nse-output-pf.h b/nse-output-pf.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e92ed2246f3e5cc8dc03488de3691347ac036c6
--- /dev/null
+++ b/nse-output-pf.h
@@ -0,0 +1,95 @@
+#pragma once
+
+// [nse-output-pf.h]: profile rapid output data structure for main flow fields
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-defines.h"
+
+#include "wstgrid3d.h"
+#include "str-com.h"
+
+#include <string>
+
+
+template< typename T >
+struct nseOutputPf {
+
+	std::string DIR;	// base output directory (set by set_filenames)
+
+	std::string U_VELOCITY_FILE,
+		V_VELOCITY_FILE, W_VELOCITY_FILE;	// per-component "-[u|v|w](y)-rapid-.plt" output paths
+
+
+	T begin, end, dt;	// presumably rapid-output time window [begin,end] and period dt -- confirm against usage
+
+	int index;	// presumably current output record index -- confirm against usage
+	T mark;		// presumably next output time mark -- confirm against usage
+
+	bool plt_cntrl;		// .plt rapid output control flag
+
+	static const int npy = 5;	// number of -y profile slots per velocity component
+	T **Upy, **Vpy, **Wpy;	// -y profiles: npy arrays of size grid.ny each (see allocate)
+
+
+	bool status;	// allocation status
+
+
+	bool set_filenames(const std::string& _DIR);	// create directory and build the three .plt paths
+	bool allocate(const nse::wstGrid3d< T >& grid);	// allocate npy profiles of grid.ny entries per component
+
+	nseOutputPf();
+	~nseOutputPf();
+};
+// -------------------------------------------------------------------------------------------- //
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+inline nseOutputPf< T >::nseOutputPf() :	// zero all scalar state and null the profile pointers; .plt output enabled by default
+	begin(T(0)), end(T(0)), dt(T(0)), index(0), mark(T(0)), plt_cntrl(true), Upy(0), Vpy(0), Wpy(0), status(false)
+{
+}
+
+template< typename T >
+inline nseOutputPf< T >::~nseOutputPf()
+{
+	if (status)	// free only when allocate() has been called
+	{
+		for (int k = 0; k < npy; k++)
+			nse::deallocate(Upy[k], Vpy[k], Wpy[k]);	// release each profile triple
+		delete[] Upy;
+		delete[] Vpy;
+		delete[] Wpy;	// then the pointer arrays themselves
+	}
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+inline bool nseOutputPf< T >::set_filenames(const std::string& _DIR)	// build rapid-output paths under _DIR; false if the directory cannot be created
+{
+	if (!nse::create_dir(_DIR)) return false;	// ensure output directory exists
+	DIR = _DIR;
+
+	U_VELOCITY_FILE = DIR + "-u(y)-rapid-.plt";
+	V_VELOCITY_FILE = DIR + "-v(y)-rapid-.plt";
+	W_VELOCITY_FILE = DIR + "-w(y)-rapid-.plt";
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+inline bool nseOutputPf< T >::allocate(	// allocate npy profiles of grid.ny entries per velocity component
+	const nse::wstGrid3d< T >& grid)
+{
+	if (status) return true;	// already allocated: avoid leaking the previous arrays on a repeated call
+	Upy = new T*[npy];
+	Vpy = new T*[npy];
+	Wpy = new T*[npy];
+	for (int k = 0; k < npy; k++)
+		nse::allocate_vnull(&Upy[k], &Vpy[k], &Wpy[k], grid.ny);
+	status = true;
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/nse-output.h b/nse-output.h
new file mode 100644
index 0000000000000000000000000000000000000000..d93a2870da885b9d080ad32e9d711e44ce0c1413
--- /dev/null
+++ b/nse-output.h
@@ -0,0 +1,106 @@
+#pragma once
+
+// [nse-output.h]: output data structure for main flow fields
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-defines.h"
+#include "str-com.h"
+
+#include <string>
+
+template< typename T >
+struct nseOutput
+{
+	std::string DIR;	// base output directory (set by set_filenames)
+
+	std::string DATA_FILE;	// "nse-data.txt"
+
+	std::string NSE_SEQ_FILE;	// "nse.dsq"
+
+	std::string VELOCITY_FILE, VELOCITY_BIN_FILE;	// velocity .plt / .nsx output paths
+	std::string PRESSURE_FILE, PRESSURE_BIN_FILE;	// pressure .plt / .nsx output paths
+#ifdef STRATIFICATION
+	std::string TEMPERATURE_FILE, TEMPERATURE_BIN_FILE;	// temperature .plt / .nsx output paths
+#endif
+
+#ifdef INCLUDE_PARTICLES
+	std::string PTCL_FILE, PTCL_BIN_FILE;	// particle .plt / .nsx output paths
+#endif
+#ifdef INCLUDE_PARTICLES_TRACKING
+	std::string PTCL_TRAJ_SUBDIR,	// "traj/" subdirectory under DIR
+		PTCL_TRAJ_FILE,
+		PTCL_TRAJ_BIN_FILE;	// trajectory .plt / .psx output paths
+#endif
+
+	T begin, dt;	// presumably output start time and period -- confirm against usage
+
+	T xmin, xmax;	// presumably sub-domain output bounds -- confirm against usage
+	T ymin, ymax;
+	T zmin, zmax;
+
+	int index;	// presumably current output record index -- confirm against usage
+	T mark;		// presumably next output time mark -- confirm against usage
+
+	bool regular_plt3d_cntrl;	// 3D .plt field -regular- output control flag
+	bool regular_bin3d_cntrl;	// 3D .nsx field -regular- output control flag
+	bool final_plt3d_cntrl;		// 3D .plt field -final- output control flag
+
+	bool set_filenames(const std::string& _DIR);	// create directories and build all output paths
+
+	nseOutput();
+	~nseOutput();
+};
+// -------------------------------------------------------------------------------------------- //
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+inline nseOutput< T >::nseOutput() :	// zero all scalar state; every output mode enabled by default
+	begin(T(0)), dt(T(0)), xmin(T(0)), xmax(T(0)), ymin(T(0)), ymax(T(0)), zmin(T(0)), zmax(T(0)),
+	index(0), mark(T(0)),
+	regular_plt3d_cntrl(true), regular_bin3d_cntrl(true), final_plt3d_cntrl(true)
+{
+}
+
+template< typename T >
+inline nseOutput< T >::~nseOutput() {}	// strings clean up themselves (no owned resources)
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+inline bool nseOutput< T >::set_filenames(const std::string& _DIR)	// build all output paths under _DIR; false if a directory cannot be created
+{
+	if (!nse::create_dir(_DIR)) return false;	// ensure base output directory exists
+	DIR = _DIR;
+
+	DATA_FILE = DIR + "nse-data.txt";
+
+	NSE_SEQ_FILE = DIR + "nse.dsq";
+
+	VELOCITY_FILE = DIR + "-velocity-.plt";
+	VELOCITY_BIN_FILE = DIR + "velocity-bin.nsx";
+
+	PRESSURE_FILE = DIR + "-pressure-.plt";
+	PRESSURE_BIN_FILE = DIR + "pressure-bin.nsx";
+
+#ifdef STRATIFICATION
+	TEMPERATURE_FILE = DIR + "-temperature-.plt";
+	TEMPERATURE_BIN_FILE = DIR + "temperature-bin.nsx";
+#endif
+
+#ifdef INCLUDE_PARTICLES
+	PTCL_FILE = DIR + "-particles-.plt";
+	PTCL_BIN_FILE = DIR + "particles-bin.nsx";
+#endif
+
+#ifdef INCLUDE_PARTICLES_TRACKING
+	PTCL_TRAJ_SUBDIR = "traj/";
+	if (!nse::create_dir(DIR + PTCL_TRAJ_SUBDIR)) return false;	// trajectory subdirectory as well
+
+	PTCL_TRAJ_FILE = DIR + PTCL_TRAJ_SUBDIR + "-traj-.plt";
+	PTCL_TRAJ_BIN_FILE = DIR + PTCL_TRAJ_SUBDIR + "traj-bin.psx";
+#endif
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/nse-startup.h b/nse-startup.h
new file mode 100644
index 0000000000000000000000000000000000000000..76a40905454f6de9791ad9f65858a2e51eac91b1
--- /dev/null
+++ b/nse-startup.h
@@ -0,0 +1,55 @@
+#pragma once
+
+// [nse-startup.h]: startup data structure for main flow fields
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-defines.h"
+#include "str-com.h"
+
+#include <string>
+
+
+template< typename T >
+struct nseStartup {
+
+	std::string DIR;	// startup-data directory (set by set_filenames)
+
+	std::string VELOCITY_FILE;	// velocity initial-field file (.nsx)
+	std::string PRESSURE_FILE;	// pressure initial-field file (.nsx)
+#ifdef STRATIFICATION
+	std::string TEMPERATURE_FILE;	// temperature initial-field file (.nsx)
+#endif
+
+	bool load_files;	// flag: load initial fields from the files above, default: [false]
+
+	bool set_filenames(const std::string& _DIR);	// create directory and build the file paths
+
+	nseStartup();
+	~nseStartup();
+};
+// -------------------------------------------------------------------------------------------- //
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+template< typename T >
+inline nseStartup< T >::nseStartup() : load_files(false) {}	// by default no startup files are loaded
+template< typename T >
+inline nseStartup< T >::~nseStartup() {}	// strings clean up themselves (no owned resources)
+// -------------------------------------------------------------------------------------------- //
+
+template< typename T >
+inline bool nseStartup< T >::set_filenames(const std::string& _DIR)	// build startup file paths under _DIR; false if the directory cannot be created
+{
+	if (!nse::create_dir(_DIR)) return false;	// ensure startup directory exists
+	DIR = _DIR;
+
+	VELOCITY_FILE = DIR + "velocity-init.nsx";
+	PRESSURE_FILE = DIR + "pressure-init.nsx";
+#ifdef STRATIFICATION
+	TEMPERATURE_FILE = DIR + "temperature-init.nsx";
+#endif
+
+	return true;
+}
+// -------------------------------------------------------------------------------------------- //
diff --git a/nse-sys.h b/nse-sys.h
new file mode 100644
index 0000000000000000000000000000000000000000..950e7490501c979ca88d9f56eabc4560a5f9210b
--- /dev/null
+++ b/nse-sys.h
@@ -0,0 +1,117 @@
+#pragma once
+
+// : Guidelines //
+// - using "const TYPE* _RESTRICT const" as declaration of input array
+// - using "TYPE* _RESTRICT" as declaration of output array or data
+// - using "const TYPE" as declaration for numeric input parameters
+// - avoiding using _RESTRICT with mpi calls for now
+// - using _omp suffix for non-barrier OpenMP functions
+// ------------------------------------------------------------------------------
+
+
+#define USE_RESTRICT_KEY			// enable 'restrict' keyword
+
+#define ALIGN_ALLOCATION	64		// aligned allocation, alignment in bytes (2^n values)
+
+#define MIN_MEMCPY_BLOCK	256		// minimum block (in bytes) for memcpy copy (magic number)
+
+//#define USE_EXPLICIT_SSE			// declare explicit SSE vector math (deprecated)
+									// all following definitions should obey this key!
+
+// OpenMP versions id's
+#define _OPENMP_VERSION_20	200203
+#define _OPENMP_VERSION_25	200505
+#define _OPENMP_VERSION_30	200805
+#define _OPENMP_VERSION_31	201107
+#define _OPENMP_VERSION_40	201307
+#define _OPENMP_VERSION_45	201511
+
+// : OpenMP>=3.1 support
+#if (_OPENMP >= _OPENMP_VERSION_31)
+#define USE_AS_OPENMP31
+#endif
+// : OpenMP>=4.0 support
+#if (_OPENMP >= _OPENMP_VERSION_40)
+#define USE_AS_OPENMP40
+#endif
+
+// : OpenMP 2D decomposition (i,j)
+#define USE_OPENMP_2D_CYCLE
+
+#define USE_OPENMP20_IN_MINMAX		// use OpenMP hand-coded min-max reductions
+									// OpenMP3.1 pragma should be preferred if defined
+
+//#define USE_OPENMP_SIMD				// : SIMD OpenMP macro (OpenMP >= 4.0)
+#define USE_INTEL_SIMD				// : SIMD Intel macro (openmp simd supersedes intel simd)
+
+//#define USE_DEPRECATED_COLOR_CP				// use old code for copy of colored blocks
+
+
+// MPI //
+//#define MPI_EXCH_KEEP_RECVSEND_PAIR			// keep recv-send pairs vs. [recv all]-[send all]
+#define USE_MPI_ALLREDUCE_IN_PLACE				// use MPI_IN_PLACE for MPI_ALLREDUCE calls
+
+// MPI I/O definitions //
+#define _NSE_MPI_IO
+#define _NSE_MPI_IO_DATAREP_DEFAULT		"native"
+#define _NSE_MPI_IO_RETRY_SEQ						// retry sequential I/O in case MPI-I/O fails
+
+
+//C++,2011 // 
+//#define USE_CXX_11			// use C++,2011 extensions //
+
+
+#define EXCLUDE_GPU_BRANCH		// exclude GPU-CUDA branch from compilation
+
+
+// _RESTRICT definition (no-aliasing hint; note: Cygwin predefined macro is __CYGWIN32__, double leading underscore)
+// ------------------------------------------------------------------- //
+#if defined(USE_RESTRICT_KEY)
+#if defined(__INTEL_COMPILER)
+#define _RESTRICT restrict
+#elif defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN32__)
+#define _RESTRICT __restrict__
+#elif defined(_MSC_VER)
+#define _RESTRICT __restrict
+#else
+#define _RESTRICT
+#endif
+#else
+#define _RESTRICT
+#endif
+// ------------------------------------------------------------------- //
+
+// OpenMP 2D decomposition requires OpenMP>=3.1 (the Cray-compiler exclusion is handled separately below)
+// ------------------------------------------------------------------- //
+#if defined(USE_OPENMP_2D_CYCLE) && (!defined(USE_AS_OPENMP31))
+#undef USE_OPENMP_2D_CYCLE
+#endif
+// ------------------------------------------------------------------- //
+
+// OpenMP 2D decomposition not fully working on Cray compiler
+// ------------------------------------------------------------------- //
+#if defined(USE_OPENMP_2D_CYCLE) && (defined(_CRAYC))
+#undef USE_OPENMP_2D_CYCLE
+#endif
+// ------------------------------------------------------------------- //
+
+// OpenMP simd only for OpenMP >= 4.0
+// ------------------------------------------------------------------- //
+#if defined(USE_OPENMP_SIMD) && (!defined(USE_AS_OPENMP40))
+#undef USE_OPENMP_SIMD
+#endif
+// ------------------------------------------------------------------- //
+
+// SIMD only for Intel compiler >= 14.0.0
+// ------------------------------------------------------------------- //
+#if !defined(__INTEL_COMPILER)
+#undef USE_INTEL_SIMD
+#endif
+#if (__INTEL_COMPILER < 1400)
+#undef USE_INTEL_SIMD
+#endif
+#if defined(USE_OPENMP_SIMD)
+// --- using pragma omp simd instead
+#undef USE_INTEL_SIMD
+#endif
+// ------------------------------------------------------------------- //
diff --git a/nse-turb-vec.h b/nse-turb-vec.h
new file mode 100644
index 0000000000000000000000000000000000000000..e7bba85cf69b1bfba94f3e8b5c4564926d795e4f
--- /dev/null
+++ b/nse-turb-vec.h
@@ -0,0 +1,540 @@
+#pragma once
+
+// [nse-turb-vec.h]: nse turbulence arrays container
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-defines.h"
+#include "grid3d.h"
+
+#include "nse-avg-vec.h"
+
+
// Container of z-profile (and optionally yz-plane) turbulence statistics,
// layered on top of the mean-field arrays held by the nseAvgVec base class.
// All members are raw arrays allocated in init() and released in clear().
// The bracketed tags in the field comments ([C], [W], [V], [VW]) mark the
// staggered-grid location of each profile (presumably C = cell center,
// W = w-node level, V = v-node level -- confirm against grid3d.h conventions).
template< typename T >
struct nseTurbVec : public nseAvgVec< T >
{
	T *U_deviation, *V_deviation, *W_deviation,			// deviations: [C, C, W]
		*UV_flux, *UW_flux, *VW_flux,					// ui'*uj': [C, W, W]
		*PU_flux, *PV_flux, *PW_flux,					// p'*ui': [C, C, W]
		*U2W_flux, *V2W_flux, *W2W_flux,				// ui'*ui'*w': [W, W, C]
		*UVW_flux, *UWW_flux, *VWW_flux;				// ui'*uj'*w': [W, C, C]

	T *U_grad, *V_grad, *W_grad;						// -z velocity gradients: [W, W, C]


	T *momentum_balance,								// momentum balance eq.: [W]
		*turbulent_momentum_flux, *viscous_stress;


	T *TKE, *u_TKE, *v_TKE, *w_TKE,						// TKE (1/2*u'(i)*u'(i)) structure: [C]
		*u_TKE_share, *v_TKE_share, *w_TKE_share;


	// full TKE budget terms; heat-flux term exists only in stratified builds
	T *TKE_balance,										// TKE balance eq.: [C]
		*TKE_production, 
		*TKE_diffusion, *TKE_transport, *TKE_pressure_work,
		*TKE_exchange_balance,
#ifdef STRATIFICATION
		*TKE_heat_flux,
#endif
		*TKE_dissipation, *TKE_iso_dissipation;

	T *u_TKE_balance,										// -u TKE component balance eq.: [C]
		*u_TKE_production, 
		*u_TKE_diffusion, *u_TKE_transport, *u_TKE_pressure_work,
		*u_TKE_exchange,
		*u_TKE_dissipation, *u_TKE_iso_dissipation;

	T *v_TKE_balance,										// -v TKE component balance eq.: [C]
		*v_TKE_production, 
		*v_TKE_diffusion, *v_TKE_transport, *v_TKE_pressure_work,
		*v_TKE_exchange,
		*v_TKE_dissipation, *v_TKE_iso_dissipation;

	T *w_TKE_balance,										// -w TKE component balance eq.: [C]
		*w_TKE_production, 
		*w_TKE_diffusion, *w_TKE_transport, *w_TKE_pressure_work,
		*w_TKE_exchange,
#ifdef STRATIFICATION
		*w_TKE_heat_flux,
#endif
		*w_TKE_dissipation, *w_TKE_iso_dissipation;


	T *TKE_aniso_uu, *TKE_aniso_vv, *TKE_aniso_ww,		// TKE anisotropy: [C]
		*TKE_aniso_uv, *TKE_aniso_uw, *TKE_aniso_vw;


	T *P2Suv_turb, *P2Suw_turb, *P2Svw_turb;		// P * 2 * Sij: [C, W, W]
	T *P2Suw_turb_c, *P2Svw_turb_c;					// P * 2 * Sij: [C, C]


	// second-moment (Reynolds-stress) budgets
	T *uv_budget_balance,								// u'*v' budget balance: [C]
		*uv_production, *uv_production_shearU, *uv_production_shearV,
		*uv_diffusion, *uv_transport, *uv_pressure_work,
		*uv_dissipation, *uv_iso_dissipation;
	
	T *uw_budget_balance,								// u'*w' budget balance: [W]
		*uw_production, *uw_production_shearU, *uw_production_shearW,
		*uw_diffusion, *uw_transport, *uw_pressure_work,
#ifdef STRATIFICATION
		*uw_buoyancy,
#endif
		*uw_dissipation, *uw_iso_dissipation;

	T *vw_budget_balance,								// v'*w' budget balance: [W]
		*vw_production, *vw_production_shearV, *vw_production_shearW,
		*vw_diffusion, *vw_transport, *vw_pressure_work,
#ifdef STRATIFICATION
		*vw_buoyancy,
#endif
		*vw_dissipation, *vw_iso_dissipation;


	T *u_Rotta, *v_Rotta, *w_Rotta, *uw_Rotta;	// Rotta "return-to-isotropy" model constants: [C]
	T *u_RDT, *v_RDT, *w_RDT, *uw_RDT;			// RDT "return-to-isotropy" model constants: [C]
	T *Rotta_RDT_e, *Rotta_RDT_p;				// Rotta-RDT "return-to-isotropy" model constants: [C]

	T *time_scale_turbulent;	// turbulent time scale: [C]
	T *length_scale_mixing;		// mixing length scale: [W]
	T *length_scale_kolmogorov;	// kolmogorov length scale: [C]

#ifdef COMPUTE_XT_AVERAGES
	// (y,z)-plane statistics (sized grid.nyz in init(), unlike the z-profiles)
	T *Uyz_deviation, *Vyz_deviation, *Wyz_deviation;	// deviations: [C, V, W] 
	T *UVyz_flux, *UWyz_flux, *VWyz_flux;				// ui'*uj': [V, W, VW]

	T *Uyz_grad, *Vyz_grad, *Wyz_grad;					// -z velocity gradients: [W, VW, C]
#endif


#ifdef STRATIFICATION
	// temperature (scalar) statistics -- stratified builds only
	T *T_deviation,							// deviation: [C]
		*TU_flux, *TV_flux, *TW_flux,		// T'*ui': [C, C, W]
		*TP_flux,							// T'*p': [C]
		*T2W_flux,							// T'*T'*w': [W]
		*TUW_flux, *TVW_flux, *TWW_flux;	// T'*ui'*w': [W, W, C]

	T *T_grad;								// -z temperature gradient: [W]


	T *heat_balance,						// heat balance eq.: [W]
		*turbulent_heat_flux, *heat_stress;


	T *TPE,									// TPE (1/2*T'T')*Ri(b)/[dT/dz] & shares: [C]
		*TKE_share, *TPE_share;


	T *TVA_balance,							// TVA balance eq.: [C]
		*TVA_production, *TVA_transport,
		*TVA_dissipation, *TVA_iso_dissipation;

	T *TPE_balance,							// TPE balance eq.: [C]
		*TPE_heat_flux, *TPE_transport,
		*TPE_dissipation, *TPE_iso_dissipation;


	T *T_dpdx_turb, *T_dpdy_turb, *T_dpdz_turb;		// [computing T'*grad(P')]: [C,C,W]

	
	// temperature-velocity covariance budgets
	T *Tu_budget_balance,					// T'*u' budget balance: [C]
		*Tu_production, *Tu_production_shear, *Tu_production_gradT,
		*Tu_diffusion, *Tu_transport, *Tu_pressure_work,
		*Tu_pressure_gradT,
		*Tu_dissipation;

	T *Tv_budget_balance,					// T'*v' budget balance: [C]
		*Tv_production, *Tv_production_shear, *Tv_production_gradT,
		*Tv_diffusion, *Tv_transport, *Tv_pressure_work,
		*Tv_pressure_gradT,
		*Tv_dissipation;

	T *Tw_budget_balance,					// T'*w' budget balance: [W]
		*Tw_production, *Tw_production_shear, *Tw_production_gradT,
		*Tw_diffusion, *Tw_transport, *Tw_pressure_work,
		*Tw_pressure_gradT,
		*Tw_buoyancy,
		*Tw_dissipation;


	T *Rotta_buoyancy_e, *Rotta_buoyancy_b;		// Rotta-buoyancy "return-to-isotropy" model constants: [C]
	T *RDT_buoyancy_p, *RDT_buoyancy_b;			// RDT-buoyancy "return-to-isotropy" model constants: [C]
	T *u_Rotta_TPE, *v_Rotta_TPE, *w_Rotta_TPE;	// Rotta-TPE "return-to-isotropy" model constants: [C]

	T *Prandtl_turbulent,			// Prandtl turbulent number:	[W]
		*Richardson_gradient,		// Richardson gradient number:	[W]
		*Richardson_flux;			// Richardson flux number: [W]

	T *Reynolds_buoyancy;			// Re(b) = (e(TKE) / (nu*Ri(b)*[dT/dz])): [C]
	T *Froude_horizontal;			// Fr(h) = (1/N)*[0.5*e(TKE)/(E(u)+E(v))]: [C]

	T *time_scale_Tvariance;		// temperature variance time scale: [C]
	T *length_scale_ellison;		// ellison (overturning) length scale: [W]
	T *length_scale_ozmidov;		// ozmidov length scale: [C]
	T *length_scale_obukhov;		// obukhov length scale: [W]

	T *mixing_efficiency;			// mixing efficiency, e(TVA) / e(TKE): [C]

	T *turb_production_ratio;		// P(TKE) / P(TVA): [C]

#ifdef COMPUTE_XT_AVERAGES
	T *Tyz_deviation;				// deviation: [C]
	T *TWyz_flux;					// 2nd central moment: [W]

	T *Tyz_grad;					// -z temperature gradient: [W]

	T *Richardson_gradient_yz;		// Richardson gradient number (y,z): [W]
#endif

#endif


	// allocate all statistics arrays on the given grid (base class first);
	// z-profiles are sized grid.nz, yz-plane arrays grid.nyz
	template< typename GType >
	void init(const nse::Grid3d< GType >& grid);
	// release all arrays (guarded by the base-class status flag)
	void clear();

	nseTurbVec();
	~nseTurbVec();
};
+// -------------------------------------------------------------------------------------------- //
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
// Default-construct through the base; no arrays are allocated until init().
template< typename T >
nseTurbVec< T >::nseTurbVec() : nseAvgVec< T >() {}
// Destructor releases all arrays via clear(), which is guarded by the
// base-class status flag (so destruction of a never-initialized object is safe).
template< typename T >
nseTurbVec< T >::~nseTurbVec() { clear(); }
+// -------------------------------------------------------------------------------------------- //
+
// Allocate every turbulence-statistics array of the container.
// z-profile fields are sized grid.nz; yz-plane (COMPUTE_XT_AVERAGES) fields
// are sized grid.nyz. allocate_vnull presumably allocates and zero-fills the
// new arrays -- confirm in the nse allocation header.
// NOTE(review): there is no guard against calling init() twice without an
// intervening clear(); doing so would leak the previous arrays -- confirm
// the intended call pattern with the owner of nseAvgVec::status.
template< typename T >
template< typename GType >
void nseTurbVec< T >::init(const nse::Grid3d< GType >& grid)
{
	nseAvgVec< T >::init(grid);	// allocate base

	nse::allocate_vnull(&U_deviation, &V_deviation, &W_deviation, grid.nz);
	nse::allocate_vnull(&UV_flux, &UW_flux, &VW_flux, grid.nz);
	nse::allocate_vnull(&PU_flux, &PV_flux, &PW_flux, grid.nz);
	nse::allocate_vnull(&U2W_flux, &V2W_flux, &W2W_flux, grid.nz);
	nse::allocate_vnull(&UVW_flux, &UWW_flux, &VWW_flux, grid.nz);

	nse::allocate_vnull(&U_grad, &V_grad, &W_grad, grid.nz);


	nse::allocate_vnull(&momentum_balance,
		&turbulent_momentum_flux, &viscous_stress, grid.nz);


	nse::allocate_vnull(&TKE, &u_TKE, &v_TKE, &w_TKE, grid.nz);
	nse::allocate_vnull(&u_TKE_share, &v_TKE_share, &w_TKE_share, grid.nz);


	// TKE and per-component TKE budget arrays (heat-flux terms only when stratified)
	nse::allocate_vnull(&TKE_balance, &TKE_production, &TKE_diffusion,
		&TKE_transport, &TKE_pressure_work, &TKE_exchange_balance,
#ifdef STRATIFICATION
		&TKE_heat_flux,
#endif
		&TKE_dissipation, &TKE_iso_dissipation, grid.nz);

	nse::allocate_vnull(&u_TKE_balance, &u_TKE_production, &u_TKE_diffusion, 
		&u_TKE_transport, &u_TKE_pressure_work, &u_TKE_exchange, 
		&u_TKE_dissipation, &u_TKE_iso_dissipation, grid.nz);

	nse::allocate_vnull(&v_TKE_balance, &v_TKE_production, &v_TKE_diffusion,
		&v_TKE_transport, &v_TKE_pressure_work, &v_TKE_exchange, 
		&v_TKE_dissipation, &v_TKE_iso_dissipation, grid.nz);

	nse::allocate_vnull(&w_TKE_balance, &w_TKE_production, &w_TKE_diffusion,
		&w_TKE_transport, &w_TKE_pressure_work, &w_TKE_exchange, 
#ifdef STRATIFICATION
		&w_TKE_heat_flux,
#endif
		&w_TKE_dissipation, &w_TKE_iso_dissipation, grid.nz);

	
	nse::allocate_vnull(&TKE_aniso_uu, &TKE_aniso_vv, &TKE_aniso_ww, grid.nz);
	nse::allocate_vnull(&TKE_aniso_uv, &TKE_aniso_uw, &TKE_aniso_vw, grid.nz);


	nse::allocate_vnull(&P2Suv_turb, &P2Suw_turb, &P2Svw_turb, grid.nz);
	nse::allocate_vnull(&P2Suw_turb_c, &P2Svw_turb_c, grid.nz);


	// Reynolds-stress budget arrays
	nse::allocate_vnull(&uv_budget_balance,
		&uv_production, &uv_production_shearU, &uv_production_shearV,
		&uv_diffusion, &uv_transport, &uv_pressure_work,
		&uv_dissipation, &uv_iso_dissipation, grid.nz);

	nse::allocate_vnull(&uw_budget_balance, 
		&uw_production, &uw_production_shearU, &uw_production_shearW,
		&uw_diffusion, &uw_transport, &uw_pressure_work,
#ifdef STRATIFICATION
		&uw_buoyancy,
#endif
		&uw_dissipation, &uw_iso_dissipation, grid.nz);

	nse::allocate_vnull(&vw_budget_balance, 
		&vw_production, &vw_production_shearV, &vw_production_shearW,
		&vw_diffusion, &vw_transport, &vw_pressure_work,
#ifdef STRATIFICATION
		&vw_buoyancy,
#endif
		&vw_dissipation, &vw_iso_dissipation, grid.nz);


	nse::allocate_vnull(&u_Rotta, &v_Rotta, &w_Rotta, &uw_Rotta, grid.nz);
	nse::allocate_vnull(&u_RDT, &v_RDT, &w_RDT, &uw_RDT, grid.nz);
	nse::allocate_vnull(&Rotta_RDT_e, &Rotta_RDT_p, grid.nz);

	nse::allocate_vnull(&time_scale_turbulent, grid.nz);
	nse::allocate_vnull(&length_scale_mixing, grid.nz);
	nse::allocate_vnull(&length_scale_kolmogorov, grid.nz);

#ifdef COMPUTE_XT_AVERAGES
	// yz-plane statistics use the larger grid.nyz size
	nse::allocate_vnull(&Uyz_deviation, &Vyz_deviation, &Wyz_deviation, grid.nyz);
	nse::allocate_vnull(&UVyz_flux, &UWyz_flux, &VWyz_flux, grid.nyz);
	nse::allocate_vnull(&Uyz_grad, &Vyz_grad, &Wyz_grad, grid.nyz);
#endif


#ifdef STRATIFICATION
	// temperature (scalar) statistics -- stratified builds only
	nse::allocate_vnull(&T_deviation, grid.nz);
	nse::allocate_vnull(&TU_flux, &TV_flux, &TW_flux, grid.nz);
	nse::allocate_vnull(&TP_flux, grid.nz);
	nse::allocate_vnull(&T2W_flux, grid.nz);
	nse::allocate_vnull(&TUW_flux, &TVW_flux, &TWW_flux, grid.nz);

	nse::allocate_vnull(&T_grad, grid.nz);


	nse::allocate_vnull(&heat_balance,
		&turbulent_heat_flux, &heat_stress, grid.nz);


	nse::allocate_vnull(&TPE, grid.nz);
	nse::allocate_vnull(&TKE_share, &TPE_share, grid.nz);

	nse::allocate_vnull(&TVA_balance, 
		&TVA_production, &TVA_transport, 
		&TVA_dissipation, &TVA_iso_dissipation, grid.nz);

	nse::allocate_vnull(&TPE_balance, 
		&TPE_heat_flux, &TPE_transport, 
		&TPE_dissipation, &TPE_iso_dissipation, grid.nz);


	nse::allocate_vnull(&T_dpdx_turb, &T_dpdy_turb, &T_dpdz_turb, grid.nz);


	nse::allocate_vnull(&Tu_budget_balance, &Tu_production, 
		&Tu_production_shear, &Tu_production_gradT,
		&Tu_diffusion, &Tu_transport, &Tu_pressure_work,
		&Tu_pressure_gradT, &Tu_dissipation, grid.nz);

	nse::allocate_vnull(&Tv_budget_balance, &Tv_production, 
		&Tv_production_shear, &Tv_production_gradT,
		&Tv_diffusion, &Tv_transport, &Tv_pressure_work,
		&Tv_pressure_gradT, &Tv_dissipation, grid.nz);

	nse::allocate_vnull(&Tw_budget_balance, &Tw_production, 
		&Tw_production_shear, &Tw_production_gradT,
		&Tw_diffusion, &Tw_transport, &Tw_pressure_work, 
		&Tw_pressure_gradT, &Tw_buoyancy, &Tw_dissipation, grid.nz);


	nse::allocate_vnull(&Rotta_buoyancy_e, &Rotta_buoyancy_b, grid.nz);
	nse::allocate_vnull(&RDT_buoyancy_p, &RDT_buoyancy_b, grid.nz);
	nse::allocate_vnull(&u_Rotta_TPE, &v_Rotta_TPE, &w_Rotta_TPE, grid.nz);

	nse::allocate_vnull(&Prandtl_turbulent, grid.nz);
	nse::allocate_vnull(&Richardson_gradient, grid.nz);
	nse::allocate_vnull(&Richardson_flux, grid.nz);

	nse::allocate_vnull(&Reynolds_buoyancy, grid.nz);
	nse::allocate_vnull(&Froude_horizontal, grid.nz);

	nse::allocate_vnull(&time_scale_Tvariance, grid.nz);
	nse::allocate_vnull(&length_scale_ellison, grid.nz);
	nse::allocate_vnull(&length_scale_ozmidov, grid.nz);
	nse::allocate_vnull(&length_scale_obukhov, grid.nz);

	nse::allocate_vnull(&mixing_efficiency, grid.nz);
	nse::allocate_vnull(&turb_production_ratio, grid.nz);

#ifdef COMPUTE_XT_AVERAGES
	nse::allocate_vnull(&Tyz_deviation, grid.nyz);
	nse::allocate_vnull(&TWyz_flux, grid.nyz);
	nse::allocate_vnull(&Tyz_grad, grid.nyz);
	nse::allocate_vnull(&Richardson_gradient_yz, grid.nyz);
#endif
#endif
}
+// -------------------------------------------------------------------------------------------- //
+
// Release every array allocated by init(). The deallocations are guarded by
// the base-class `status` flag, so calling clear() on a never-initialized
// container skips them; the base nseAvgVec::clear() is always invoked last
// to release the base arrays and (presumably) reset `status` -- confirm in
// nse-avg-vec.h. The #ifdef structure must mirror init() exactly so the
// same set of pointers is freed that was allocated.
template< typename T >
void nseTurbVec< T >::clear()
{
	if (nseAvgVec< T >::status)
	{
		nse::deallocate(U_deviation, V_deviation, W_deviation);
		nse::deallocate(UV_flux, UW_flux, VW_flux);
		nse::deallocate(PU_flux, PV_flux, PW_flux);
		nse::deallocate(U2W_flux, V2W_flux, W2W_flux);
		nse::deallocate(UVW_flux, UWW_flux, VWW_flux);

		nse::deallocate(U_grad, V_grad, W_grad);


		nse::deallocate(momentum_balance,
			turbulent_momentum_flux, viscous_stress);


		nse::deallocate(TKE, u_TKE, v_TKE, w_TKE);
		nse::deallocate(u_TKE_share, v_TKE_share, w_TKE_share);


		nse::deallocate(TKE_balance, TKE_production, TKE_diffusion,
			TKE_transport, TKE_pressure_work, TKE_exchange_balance,
#ifdef STRATIFICATION
			TKE_heat_flux,
#endif
			TKE_dissipation, TKE_iso_dissipation);

		nse::deallocate(u_TKE_balance, u_TKE_production, u_TKE_diffusion,
			u_TKE_transport, u_TKE_pressure_work, u_TKE_exchange, 
			u_TKE_dissipation, u_TKE_iso_dissipation);

		nse::deallocate(v_TKE_balance, v_TKE_production, v_TKE_diffusion,
			v_TKE_transport, v_TKE_pressure_work, v_TKE_exchange, 
			v_TKE_dissipation, v_TKE_iso_dissipation);

		nse::deallocate(w_TKE_balance, w_TKE_production, w_TKE_diffusion,
			w_TKE_transport, w_TKE_pressure_work, w_TKE_exchange,
#ifdef STRATIFICATION
			w_TKE_heat_flux,
#endif
			w_TKE_dissipation, w_TKE_iso_dissipation);


		nse::deallocate(TKE_aniso_uu, TKE_aniso_vv, TKE_aniso_ww);
		nse::deallocate(TKE_aniso_uv, TKE_aniso_uw, TKE_aniso_vw);


		nse::deallocate(P2Suv_turb, P2Suw_turb, P2Svw_turb);
		nse::deallocate(P2Suw_turb_c, P2Svw_turb_c);


		nse::deallocate(uv_budget_balance,
			uv_production, uv_production_shearU, uv_production_shearV,
			uv_diffusion, uv_transport, uv_pressure_work,
			uv_dissipation, uv_iso_dissipation);

		nse::deallocate(uw_budget_balance, 
			uw_production, uw_production_shearU, uw_production_shearW,
			uw_diffusion, uw_transport, uw_pressure_work,
#ifdef STRATIFICATION
			uw_buoyancy,
#endif
			uw_dissipation, uw_iso_dissipation);

		nse::deallocate(vw_budget_balance, 
			vw_production, vw_production_shearV, vw_production_shearW,
			vw_diffusion, vw_transport, vw_pressure_work,
#ifdef STRATIFICATION
			vw_buoyancy,
#endif
			vw_dissipation, vw_iso_dissipation);


		nse::deallocate(u_Rotta, v_Rotta, w_Rotta, uw_Rotta);
		nse::deallocate(u_RDT, v_RDT, w_RDT, uw_RDT);
		nse::deallocate(Rotta_RDT_e, Rotta_RDT_p);

		nse::deallocate(time_scale_turbulent);
		nse::deallocate(length_scale_mixing);
		nse::deallocate(length_scale_kolmogorov);

#ifdef COMPUTE_XT_AVERAGES
		nse::deallocate(Uyz_deviation, Vyz_deviation, Wyz_deviation);
		nse::deallocate(UVyz_flux, UWyz_flux, VWyz_flux);
		nse::deallocate(Uyz_grad, Vyz_grad, Wyz_grad);
#endif


#ifdef STRATIFICATION
		nse::deallocate(T_deviation);
		nse::deallocate(TU_flux, TV_flux, TW_flux);
		nse::deallocate(TP_flux);
		nse::deallocate(T2W_flux);
		nse::deallocate(TUW_flux, TVW_flux, TWW_flux);

		nse::deallocate(T_grad);


		nse::deallocate(heat_balance,
			turbulent_heat_flux, heat_stress);


		nse::deallocate(TPE);
		nse::deallocate(TKE_share, TPE_share);

		nse::deallocate(TVA_balance,
			TVA_production, TVA_transport,
			TVA_dissipation, TVA_iso_dissipation);

		nse::deallocate(TPE_balance,
			TPE_heat_flux, TPE_transport,
			TPE_dissipation, TPE_iso_dissipation);


		nse::deallocate(T_dpdx_turb, T_dpdy_turb, T_dpdz_turb);


		nse::deallocate(Tu_budget_balance, Tu_production,
			Tu_production_shear, Tu_production_gradT,
			Tu_diffusion, Tu_transport, Tu_pressure_work,
			Tu_pressure_gradT, Tu_dissipation);

		nse::deallocate(Tv_budget_balance, Tv_production,
			Tv_production_shear, Tv_production_gradT,
			Tv_diffusion, Tv_transport, Tv_pressure_work,
			Tv_pressure_gradT, Tv_dissipation);

		nse::deallocate(Tw_budget_balance, Tw_production,
			Tw_production_shear, Tw_production_gradT,
			Tw_diffusion, Tw_transport, Tw_pressure_work,
			Tw_pressure_gradT, Tw_buoyancy, Tw_dissipation);


		nse::deallocate(Rotta_buoyancy_e, Rotta_buoyancy_b);
		nse::deallocate(RDT_buoyancy_p, RDT_buoyancy_b);
		nse::deallocate(u_Rotta_TPE, v_Rotta_TPE, w_Rotta_TPE);

		nse::deallocate(Prandtl_turbulent);
		nse::deallocate(Richardson_gradient);
		nse::deallocate(Richardson_flux);

		nse::deallocate(Reynolds_buoyancy);
		nse::deallocate(Froude_horizontal);

		nse::deallocate(time_scale_Tvariance);
		nse::deallocate(length_scale_ellison);
		nse::deallocate(length_scale_ozmidov);
		nse::deallocate(length_scale_obukhov);

		nse::deallocate(mixing_efficiency);
		nse::deallocate(turb_production_ratio);

#ifdef COMPUTE_XT_AVERAGES
		nse::deallocate(Tyz_deviation);
		nse::deallocate(TWyz_flux);
		nse::deallocate(Tyz_grad);
		nse::deallocate(Richardson_gradient_yz);
#endif
#endif

	}

	nseAvgVec< T >::clear();		// deallocate base
}
+// ------------------------------------------------------------------------------------------------ //
diff --git a/nse3d-x4.cpp b/nse3d-x4.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..80ec95cf04ae3d0a36851ed8a07503ecbf2fb0c6
--- /dev/null
+++ b/nse3d-x4.cpp
@@ -0,0 +1,3925 @@
+#include "nse3d-x4.h"
+
+#include <math.h>
+
+using namespace nse::nse_const3d;
+
+
+// * advection * //
+// ------------------------------------------------------------------------ //
// u-momentum advection, divergence form, 4th-order horizontal stencil:
//   Uinterm = -[ d(UU)/dx + d(UV)/dy + d(UW)/dz ] at u-nodes, interior only
// (ghost layers excluded). Horizontal derivatives use the staggered-grid
// weights C1 = 9/8, C2 = 1/24 (_3C2 = 3*C2 = 1/8); the vertical flux is a
// 2nd-order form on the stretched z grid via grid.dziq[k] (the "q" suffix
// presumably folds in the 1/4 from the averaged-velocity products -- confirm
// against the wstGrid3d definitions).
// NOTE(review): x/y stencils reach idx +/- 3*stride, so at least 3 ghost
// cells are assumed in x and y -- confirm grid.gcx/gcy.
// Two OpenMP strategies: with USE_OPENMP_2D_CYCLE the i,j loops are
// collapsed; otherwise only i is parallel and a running offset (shidx)
// replaces the per-j index recomputation.
template< typename T >
void nse::u_advection_div_x4(
	T* _RESTRICT Uinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Uinterm[idx] = -(

					// d(UU)/dx
					(
					C1 * (
					C1 *
					(U[idx + grid.nyz] - U[idx - grid.nyz]) *
					(U[idx + grid.nyz] + (T) 2.0 * U[idx] + U[idx - grid.nyz]) -
					_3C2 * (
					(U[idx + 2 * grid.nyz] + U[idx - grid.nyz]) * (U[idx + grid.nyz] + U[idx]) -
					(U[idx - 2 * grid.nyz] + U[idx + grid.nyz]) * (U[idx - grid.nyz] + U[idx]))
					) -

					C2 * (
					C1 * (
					(U[idx + 2 * grid.nyz] + U[idx + grid.nyz]) * (U[idx + 3 * grid.nyz] + U[idx]) -
					(U[idx - 2 * grid.nyz] + U[idx - grid.nyz]) * (U[idx - 3 * grid.nyz] + U[idx])) -
					_3C2 *
					(U[idx + 3 * grid.nyz] - U[idx - 3 * grid.nyz]) *
					(U[idx + 3 * grid.nyz] + (T) 2.0 * U[idx] + U[idx - 3 * grid.nyz])
					)

					) * grid.dxiq +
					// ------------

					// d(UV)/dy
					(
					C1 * (
					C1 * (
					(V[idx - grid.nyz + grid.nz] + V[idx + grid.nz]) * (U[idx + grid.nz] + U[idx]) -
					(V[idx - grid.nyz] + V[idx]) * (U[idx - grid.nz] + U[idx])) -
					_3C2 * (
					(V[idx + grid.nyz + grid.nz] + V[idx - 2 * grid.nyz + grid.nz]) * (U[idx + grid.nz] + U[idx]) -
					(V[idx + grid.nyz] + V[idx - 2 * grid.nyz]) * (U[idx - grid.nz] + U[idx]))
					) -

					C2 * (
					C1 * (
					(V[idx - grid.nyz + 2 * grid.nz] + V[idx + 2 * grid.nz]) * (U[idx + 3 * grid.nz] + U[idx]) -
					(V[idx - grid.nyz - grid.nz] + V[idx - grid.nz]) * (U[idx - 3 * grid.nz] + U[idx])) -
					_3C2 * (
					(V[idx - 2 * grid.nyz + 2 * grid.nz] + V[idx + grid.nyz + 2 * grid.nz]) * (U[idx + 3 * grid.nz] + U[idx]) -
					(V[idx - 2 * grid.nyz - grid.nz] + V[idx + grid.nyz - grid.nz]) * (U[idx - 3 * grid.nz] + U[idx]))
					)

					) * grid.dyiq +
					// ------------

					// d(UW)/dz
					(
					(U[idx] + U[idx + 1]) *
					(C1 * (W[idx + 1] + W[idx - grid.nyz + 1]) - _3C2 * (W[idx + grid.nyz + 1] + W[idx - 2 * grid.nyz + 1]))
					-
					(U[idx] + U[idx - 1]) *
					(C1 * (W[idx] + W[idx - grid.nyz]) - _3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz]))
					) * grid.dziq[k]);
				// ------------
			}
		}
	}
}
+
// v-momentum advection, divergence form, 4th-order horizontal stencil:
//   Vinterm = -[ d(UV)/dx + d(VV)/dy + d(VW)/dz ] at v-nodes, interior only.
// Mirror of u_advection_div_x4 with the roles of the x (stride grid.nyz)
// and y (stride grid.nz) offsets exchanged; same weights C1 = 9/8,
// C2 = 1/24, _3C2 = 1/8, same 2nd-order vertical flux with grid.dziq[k].
// NOTE(review): stencil reach is +/- 3 cells in x and y -- assumes
// grid.gcx/gcy >= 3 (confirm).
template< typename T >
void nse::v_advection_div_x4(
	T* _RESTRICT Vinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vinterm[idx] = -(

					// d(UV)/dx
					(
					C1 * (
					C1 * (
					(U[idx - grid.nz + grid.nyz] + U[idx + grid.nyz]) * (V[idx + grid.nyz] + V[idx]) -
					(U[idx - grid.nz] + U[idx]) * (V[idx - grid.nyz] + V[idx])) -
					_3C2 * (
					(U[idx + grid.nz + grid.nyz] + U[idx - 2 * grid.nz + grid.nyz]) * (V[idx + grid.nyz] + V[idx]) -
					(U[idx + grid.nz] + U[idx - 2 * grid.nz]) * (V[idx - grid.nyz] + V[idx]))
					) -

					C2 * (
					C1 * (
					(U[idx - grid.nz + 2 * grid.nyz] + U[idx + 2 * grid.nyz]) * (V[idx + 3 * grid.nyz] + V[idx]) -
					(U[idx - grid.nz - grid.nyz] + U[idx - grid.nyz]) * (V[idx - 3 * grid.nyz] + V[idx])) -
					_3C2 * (
					(U[idx - 2 * grid.nz + 2 * grid.nyz] + U[idx + grid.nz + 2 * grid.nyz]) * (V[idx + 3 * grid.nyz] + V[idx]) -
					(U[idx - 2 * grid.nz - grid.nyz] + U[idx + grid.nz - grid.nyz]) * (V[idx - 3 * grid.nyz] + V[idx]))
					)

					) * grid.dxiq +
					// ------------

					// d(VV)/dy
					(
					C1 * (
					C1 *
					(V[idx + grid.nz] - V[idx - grid.nz]) *
					(V[idx + grid.nz] + (T) 2.0 * V[idx] + V[idx - grid.nz]) -
					_3C2 * (
					(V[idx + 2 * grid.nz] + V[idx - grid.nz]) * (V[idx + grid.nz] + V[idx]) -
					(V[idx - 2 * grid.nz] + V[idx + grid.nz]) * (V[idx - grid.nz] + V[idx]))
					) -

					C2 * (
					C1 * (
					(V[idx + 2 * grid.nz] + V[idx + grid.nz]) * (V[idx + 3 * grid.nz] + V[idx]) -
					(V[idx - 2 * grid.nz] + V[idx - grid.nz]) * (V[idx - 3 * grid.nz] + V[idx])) -
					_3C2 *
					(V[idx + 3 * grid.nz] - V[idx - 3 * grid.nz]) *
					(V[idx + 3 * grid.nz] + (T) 2.0 * V[idx] + V[idx - 3 * grid.nz])
					)

					) * grid.dyiq +
					// ----------

					// d(VW)/dz
					(
					(V[idx] + V[idx + 1]) *
					(C1 * (W[idx + 1] + W[idx - grid.nz + 1]) - _3C2 * (W[idx + grid.nz + 1] + W[idx - 2 * grid.nz + 1]))
					-
					(V[idx] + V[idx - 1]) *
					(C1 * (W[idx] + W[idx - grid.nz]) - _3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz]))
					) * grid.dziq[k]);
				// ------------
			}
		}
	}
}
+
// w-momentum advection, divergence form:
//   Winterm = -[ d(UW)/dx + d(VW)/dy + d(WW)/dz ] at w-nodes, interior only.
// Horizontal terms use the 4th-order weights C1 = 9/8, C2 = 1/24 (note:
// the _3C2 combination is unused here because the transporting U/V values
// are already at the flux locations); the vertical d(WW)/dz term is a
// compact 2nd-order form scaled by the stretched-grid coefficient
// grid.dzmih[k] (w-level metric; exact normalization defined by wstGrid3d
// -- confirm there).
// NOTE(review): horizontal stencil reach is +/- 3 cells -- assumes
// grid.gcx/gcy >= 3; vertical reach is only +/- 1.
template< typename T >
void nse::w_advection_div_x4(
	T* _RESTRICT Winterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Winterm[idx] = -(

					// d(UW)/dx
					(
					C1 * (
					(U[idx - 1 + grid.nyz] + U[idx + grid.nyz]) * (W[idx + grid.nyz] + W[idx]) -
					(U[idx - 1] + U[idx]) * (W[idx - grid.nyz] + W[idx])) -
					C2 * (
					(U[idx - 1 + 2 * grid.nyz] + U[idx + 2 * grid.nyz]) * (W[idx + 3 * grid.nyz] + W[idx]) -
					(U[idx - 1 - grid.nyz] + U[idx - grid.nyz]) * (W[idx - 3 * grid.nyz] + W[idx]))
					) * grid.dxiq +
					// ------------

					// d(VW)/dy
					(
					C1 * (
					(V[idx - 1 + grid.nz] + V[idx + grid.nz]) * (W[idx + grid.nz] + W[idx]) -
					(V[idx - 1] + V[idx]) * (W[idx - grid.nz] + W[idx])) -
					C2 * (
					(V[idx - 1 + 2 * grid.nz] + V[idx + 2 * grid.nz]) * (W[idx + 3 * grid.nz] + W[idx]) -
					(V[idx - 1 - grid.nz] + V[idx - grid.nz]) * (W[idx - 3 * grid.nz] + W[idx]))
					) * grid.dyiq +
					// ------------

					// d(WW)/dz
					(W[idx + 1] - W[idx - 1]) *
					(W[idx + 1] + W[idx - 1] + W[idx] + W[idx]) * grid.dzmih[k]);
				// ----------
			}
		}
	}
}
+
// u-momentum advection, skew-symmetric form (per the function name),
// 4th-order horizontal stencil:
//   Uinterm = -[ d(UU)/dx + d(UV)/dy + d(UW)/dz ] at u-nodes, interior only.
// Same weights as the divergence form (C1 = 9/8, C2 = 1/24, _3C2 = 1/8),
// but the neighbor velocity multiplies the interpolated transporting flux
// directly (U[idx +/- s] * avg(...)) rather than forming products of sums,
// which changes the discrete energy-conservation properties of the scheme.
// Vertical term is 2nd order with grid.dziq[k], as in the div form.
// NOTE(review): x/y stencil reach is +/- 3 cells -- assumes grid.gcx/gcy >= 3.
template< typename T >
void nse::u_advection_skew_x4(
	T* _RESTRICT Uinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Uinterm[idx] = -(

					// d(UU)/dx
					(
					C1 * (
					C1 * (
					U[idx + grid.nyz] * (U[idx] + U[idx + grid.nyz]) -
					U[idx - grid.nyz] * (U[idx] + U[idx - grid.nyz])) -
					_3C2 * (
					U[idx + grid.nyz] * (U[idx - grid.nyz] + U[idx + 2 * grid.nyz]) -
					U[idx - grid.nyz] * (U[idx - 2 * grid.nyz] + U[idx + grid.nyz]))
					) -

					C2 * (
					C1 * (
					U[idx + 3 * grid.nyz] * (U[idx + 2 * grid.nyz] + U[idx + grid.nyz]) -
					U[idx - 3 * grid.nyz] * (U[idx - 2 * grid.nyz] + U[idx - grid.nyz])) -
					_3C2 * (
					U[idx + 3 * grid.nyz] * (U[idx] + U[idx + 3 * grid.nyz]) -
					U[idx - 3 * grid.nyz] * (U[idx] + U[idx - 3 * grid.nyz]))
					)

					) * grid.dxiq +
					// ------------

					// d(UV)/dy
					(
					C1 * (
					C1 * (
					U[idx + grid.nz] * (V[idx - grid.nyz + grid.nz] + V[idx + grid.nz]) -
					U[idx - grid.nz] * (V[idx - grid.nyz] + V[idx])) -
					_3C2 * (
					U[idx + grid.nz] * (V[idx + grid.nyz + grid.nz] + V[idx - 2 * grid.nyz + grid.nz]) -
					U[idx - grid.nz] * (V[idx + grid.nyz] + V[idx - 2 * grid.nyz]))
					) -

					C2 * (
					C1 * (
					U[idx + 3 * grid.nz] * (V[idx - grid.nyz + 2 * grid.nz] + V[idx + 2 * grid.nz]) -
					U[idx - 3 * grid.nz] * (V[idx - grid.nyz - grid.nz] + V[idx - grid.nz])) -
					_3C2 * (
					U[idx + 3 * grid.nz] * (V[idx - 2 * grid.nyz + 2 * grid.nz] + V[idx + grid.nyz + 2 * grid.nz]) -
					U[idx - 3 * grid.nz] * (V[idx - 2 * grid.nyz - grid.nz] + V[idx + grid.nyz - grid.nz]))
					)

					) * grid.dyiq +
					// ------------

					// d(UW)/dz
					(
					U[idx + 1] *
					(C1 * (W[idx + 1] + W[idx - grid.nyz + 1]) - _3C2 * (W[idx + grid.nyz + 1] + W[idx - 2 * grid.nyz + 1]))
					-
					U[idx - 1] *
					(C1 * (W[idx] + W[idx - grid.nyz]) - _3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz]))
					) * grid.dziq[k]);
				// ------------
			}
		}
	}
}
+
// v-momentum advection, skew-symmetric form (per the function name),
// 4th-order horizontal stencil:
//   Vinterm = -[ d(UV)/dx + d(VV)/dy + d(VW)/dz ] at v-nodes, interior only.
// Mirror of u_advection_skew_x4 with the x (stride grid.nyz) and y
// (stride grid.nz) offsets exchanged; same weights C1 = 9/8, C2 = 1/24,
// _3C2 = 1/8, and the same 2nd-order vertical flux with grid.dziq[k].
// NOTE(review): x/y stencil reach is +/- 3 cells -- assumes grid.gcx/gcy >= 3.
template< typename T >
void nse::v_advection_skew_x4(
	T* _RESTRICT Vinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vinterm[idx] = -(

					// d(UV)/dx
					(
					C1 * (
					C1 * (
					V[idx + grid.nyz] * (U[idx - grid.nz + grid.nyz] + U[idx + grid.nyz]) -
					V[idx - grid.nyz] * (U[idx - grid.nz] + U[idx])) -
					_3C2 * (
					V[idx + grid.nyz] * (U[idx + grid.nz + grid.nyz] + U[idx - 2 * grid.nz + grid.nyz]) -
					V[idx - grid.nyz] * (U[idx + grid.nz] + U[idx - 2 * grid.nz]))
					) -

					C2 * (
					C1 * (
					V[idx + 3 * grid.nyz] * (U[idx - grid.nz + 2 * grid.nyz] + U[idx + 2 * grid.nyz]) -
					V[idx - 3 * grid.nyz] * (U[idx - grid.nz - grid.nyz] + U[idx - grid.nyz])) -
					_3C2 * (
					V[idx + 3 * grid.nyz] * (U[idx - 2 * grid.nz + 2 * grid.nyz] + U[idx + grid.nz + 2 * grid.nyz]) -
					V[idx - 3 * grid.nyz] * (U[idx - 2 * grid.nz - grid.nyz] + U[idx + grid.nz - grid.nyz]))
					)

					) * grid.dxiq +
					// ------------

					// d(VV)/dy
					(
					C1 * (
					C1 * (
					V[idx + grid.nz] * (V[idx] + V[idx + grid.nz]) -
					V[idx - grid.nz] * (V[idx] + V[idx - grid.nz])) -
					_3C2 * (
					V[idx + grid.nz] * (V[idx - grid.nz] + V[idx + 2 * grid.nz]) -
					V[idx - grid.nz] * (V[idx - 2 * grid.nz] + V[idx + grid.nz]))
					) -

					C2 * (
					C1 * (
					V[idx + 3 * grid.nz] * (V[idx + 2 * grid.nz] + V[idx + grid.nz]) -
					V[idx - 3 * grid.nz] * (V[idx - 2 * grid.nz] + V[idx - grid.nz])) -
					_3C2 * (
					V[idx + 3 * grid.nz] * (V[idx] + V[idx + 3 * grid.nz]) -
					V[idx - 3 * grid.nz] * (V[idx] + V[idx - 3 * grid.nz]))
					)

					) * grid.dyiq +
					// ----------

					// d(VW)/dz
					(
					V[idx + 1] *
					(C1 * (W[idx + 1] + W[idx - grid.nz + 1]) - _3C2 * (W[idx + grid.nz + 1] + W[idx - 2 * grid.nz + 1]))
					-
					V[idx - 1] *
					(C1 * (W[idx] + W[idx - grid.nz]) - _3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz]))
					) * grid.dziq[k]);
				// ------------
			}
		}
	}
}
+
+template< typename T >
+void nse::w_advection_skew_x4(
+	T* _RESTRICT Winterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const wstGrid3d< T >& grid)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0,
+		_3C2 = (T) 1.0 / (T) 8.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Winterm[idx] = -(
+
+					// d(UW)/dx
+					(
+					C1 * (
+					W[idx + grid.nyz] * (U[idx - 1 + grid.nyz] + U[idx + grid.nyz]) -
+					W[idx - grid.nyz] * (U[idx - 1] + U[idx])) -
+					C2 * (
+					W[idx + 3 * grid.nyz] * (U[idx - 1 + 2 * grid.nyz] + U[idx + 2 * grid.nyz]) -
+					W[idx - 3 * grid.nyz] * (U[idx - 1 - grid.nyz] + U[idx - grid.nyz]))
+					) * grid.dxiq +
+					// ------------
+
+					// d(VW)/dy
+					(
+					C1 * (
+					W[idx + grid.nz] * (V[idx - 1 + grid.nz] + V[idx + grid.nz]) -
+					W[idx - grid.nz] * (V[idx - 1] + V[idx])) -
+					C2 * (
+					W[idx + 3 * grid.nz] * (V[idx - 1 + 2 * grid.nz] + V[idx + 2 * grid.nz]) -
+					W[idx - 3 * grid.nz] * (V[idx - 1 - grid.nz] + V[idx - grid.nz]))
+					) * grid.dyiq +
+					// ------------
+
+					// d(WW)/dz
+					(
+					W[idx + 1] * (W[idx] + W[idx + 1]) -
+					W[idx - 1] * (W[idx] + W[idx - 1])) * grid.dzmih[k]);
+				// ----------
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * scalar advection * //
+// ------------------------------------------------------------------------ //
template< typename T >
void nse::c_advection_div_x4(
	T* _RESTRICT Xinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T* _RESTRICT const X,
	const wstGrid3d< T >& grid)
{
	// Divergence-form advection of scalar X, 4th order (x4) in x and y:
	//   Xinterm := -[ d(UX)/dx + d(VX)/dy + d(WX)/dz ]   (overwrites Xinterm)
	// Horizontal fluxes use the (C1, C2) = (9/8, 1/24) 4th-order weights; the
	// vertical flux is a 2-point form with the nonuniform-grid metric dzih[k].
	// Interior cells only (ghost layers gcx/gcy/gcz excluded).
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Xinterm[idx] = -(

					// d(CU)/dx
					(
					C1 * (
					U[idx + grid.nyz] * (X[idx] + X[idx + grid.nyz]) -
					U[idx] * (X[idx] + X[idx - grid.nyz])
					) -

					C2 * (
					U[idx + 2 * grid.nyz] * (X[idx] + X[idx + 3 * grid.nyz]) -
					U[idx - grid.nyz] * (X[idx - 3 * grid.nyz] + X[idx]))

					) * grid.dxih +
					// ------------

					// d(CV)/dy
					(
					C1 * (
					V[idx + grid.nz] * (X[idx] + X[idx + grid.nz]) -
					V[idx] * (X[idx] + X[idx - grid.nz])
					) -

					C2 * (
					V[idx + 2 * grid.nz] * (X[idx] + X[idx + 3 * grid.nz]) -
					V[idx - grid.nz] * (X[idx - 3 * grid.nz] + X[idx]))

					) * grid.dyih +
					// ------------

					// d(CW)/dz
					(
					W[idx + 1] * (X[idx + 1] + X[idx]) -
					W[idx] * (X[idx] + X[idx - 1])) * grid.dzih[k]);
				// ------------
			}
		}
	}
}
+
template< typename T >
void nse::c_advection_skew_x4(
	T* _RESTRICT Xinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T* _RESTRICT const X,
	const wstGrid3d< T >& grid)
{
	// Skew-symmetric-form advection of scalar X, 4th order (x4) in x and y.
	// Same stencil as c_advection_div_x4 except the X[idx] self-terms are
	// dropped from every flux product (compare the two bodies), i.e. the
	// X*div(velocity) part is removed from the divergence form.
	// Overwrites Xinterm; interior cells only.
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Xinterm[idx] = -(

					// d(CU)/dx
					(
					C1 * (
					U[idx + grid.nyz] * X[idx + grid.nyz] -
					U[idx] * X[idx - grid.nyz]
					) -

					C2 * (
					U[idx + 2 * grid.nyz] * X[idx + 3 * grid.nyz] -
					U[idx - grid.nyz] * X[idx - 3 * grid.nyz])

					) * grid.dxih +
					// ------------

					// d(CV)/dy
					(
					C1 * (
					V[idx + grid.nz] * X[idx + grid.nz] -
					V[idx] * X[idx - grid.nz]
					) -

					C2 * (
					V[idx + 2 * grid.nz] * X[idx + 3 * grid.nz] -
					V[idx - grid.nz] * X[idx - 3 * grid.nz])

					) * grid.dyih +
					// ------------

					// d(CW)/dz
					(
					W[idx + 1] * X[idx + 1] -
					W[idx] * X[idx - 1]) * grid.dzih[k]);
				// ------------
			}
		}
	}
}
+
template< typename T >
void nse::c_advection_div_vline_x4(
	T* _RESTRICT Xinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T X0, const T XH,
	const wstGrid3d< T >& grid)
{
	// Correction term for divergence-form (x4) advection of a scalar with a
	// linear-in-z background profile X_bg(z) = X0 + Ch * z, where
	// Ch = (XH - X0) / grid.mpi_height (X0 at z = 0, XH at z = mpi_height).
	// Subtracts from Xinterm (accumulates, does NOT overwrite):
	//   (1) div_k = X_bg(pz[k]) * div(U,V,W), x4 in x/y, for consistency with
	//       the divergence form, and
	//   (2) Ch * [weighted z-average of W at the cell center] - advection of
	//       the constant background gradient.
	const T Ch = (XH - X0) / grid.mpi_height;
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	T div_k;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx, div_k ) shared( Xinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx, div_k ) shared( Xinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				// adding divergence operator for consistency
				div_k = (X0 + Ch * grid.pz[k]) * (
					(C1 * (U[idx + grid.nyz] - U[idx]) - C2 * (U[idx + 2 * grid.nyz] - U[idx - grid.nyz])) * grid.dxi +
					(C1 * (V[idx + grid.nz] - V[idx]) - C2 * (V[idx + 2 * grid.nz] - V[idx - grid.nz])) * grid.dyi +
					(W[idx + 1] - W[idx]) * grid.dzi[k]);

				// (SKEW-x4)-(DIV-x4) difference in divergence only
				// NOTE(review): the (dz[k]+dz[k±1]) weighting reduces to
				// 0.5*(W[idx]+W[idx+1]) on a uniform grid, assuming
				// dziq[k] = 1/(4*dz[k]) -- TODO confirm against grid setup.
				Xinterm[idx] -= div_k + Ch * (
					W[idx + 1] * (grid.dz[k] + grid.dz[k + 1]) +
					W[idx] * (grid.dz[k] + grid.dz[k - 1])) * grid.dziq[k];
			}
		}
	}
}
+
template< typename T >
void nse::c_advection_skew_vline_x4(
	T* _RESTRICT Xinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T X0, const T XH,
	const wstGrid3d< T >& grid)
{
	// Skew-symmetric counterpart of c_advection_div_vline_x4: correction for a
	// linear-in-z background profile X_bg(z) = X0 + Ch * z,
	// Ch = (XH - X0) / grid.mpi_height. Identical to the div variant except
	// div_k carries an extra factor 0.5 -- presumably because the skew form
	// contains only half of the divergence term (confirm against derivation).
	// Subtracts from Xinterm (accumulates, does NOT overwrite).
	const T Ch = (XH - X0) / grid.mpi_height;
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	T div_k;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx, div_k ) shared( Xinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx, div_k ) shared( Xinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				// adding divergence operator for consistency
				div_k = (T) 0.5 * (X0 + Ch * grid.pz[k]) * (
					(C1 * (U[idx + grid.nyz] - U[idx]) - C2 * (U[idx + 2 * grid.nyz] - U[idx - grid.nyz])) * grid.dxi +
					(C1 * (V[idx + grid.nz] - V[idx]) - C2 * (V[idx + 2 * grid.nz] - V[idx - grid.nz])) * grid.dyi +
					(W[idx + 1] - W[idx]) * grid.dzi[k]);

				// (SKEW-x4)-(DIV-x4) difference in divergence only
				Xinterm[idx] -= div_k + Ch * (
					W[idx + 1] * (grid.dz[k] + grid.dz[k + 1]) +
					W[idx] * (grid.dz[k] + grid.dz[k - 1])) * grid.dziq[k];
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * diffusion * //
+// ------------------------------------------------------------------------ //
+template< typename T >
+void nse::u_add_diffusion_x4(
+	T* _RESTRICT Uinterm,
+	const T* _RESTRICT const U,
+	const T c_viscosity, const wstGrid3d< T >& grid)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0;
+	const T S11 = C1 * C1, S22 = C2 * C2,
+		S12 = (T) 2.0 * C1 * C2;
+
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Uinterm[idx] += c_viscosity * (
+
+					(S11 * (U[idx + grid.nyz] - U[idx] - U[idx] + U[idx - grid.nyz]) +
+					S22 * (U[idx + 3 * grid.nyz] - U[idx] - U[idx] + U[idx - 3 * grid.nyz]) -
+					S12 * (U[idx + 2 * grid.nyz] - U[idx + grid.nyz] - U[idx - grid.nyz] + U[idx - 2 * grid.nyz])) * grid.dx2i +
+
+					(S11 * (U[idx + grid.nz] - U[idx] - U[idx] + U[idx - grid.nz]) +
+					S22 * (U[idx + 3 * grid.nz] - U[idx] - U[idx] + U[idx - 3 * grid.nz]) -
+					S12 * (U[idx + 2 * grid.nz] - U[idx + grid.nz] - U[idx - grid.nz] + U[idx - 2 * grid.nz])) * grid.dy2i +
+
+					((U[idx + 1] - U[idx]) * grid.dzp2i[k]
+					- (U[idx] - U[idx - 1]) * grid.dzm2i[k]));
+			}
+		}
+	}
+}
+
template< typename T >
void nse::v_add_diffusion_x4(
	T* _RESTRICT Vinterm,
	const T* _RESTRICT const V,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// Accumulate the 4th-order (x4) diffusion operator for V:
	//   Vinterm += c_viscosity * (Dxx(V) + Dyy(V) + Dzz(V))
	// x/y second differences use the squared 4th-order weights S11/S22/S12;
	// z uses the nonuniform-grid metrics dzp2i[k] / dzm2i[k].
	// Interior cells only.
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	const T S11 = C1 * C1, S22 = C2 * C2,
		S12 = (T) 2.0 * C1 * C2;

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vinterm[idx] += c_viscosity * (

					(S11 * (V[idx + grid.nyz] - V[idx] - V[idx] + V[idx - grid.nyz]) +
					S22 * (V[idx + 3 * grid.nyz] - V[idx] - V[idx] + V[idx - 3 * grid.nyz]) -
					S12 * (V[idx + 2 * grid.nyz] - V[idx + grid.nyz] - V[idx - grid.nyz] + V[idx - 2 * grid.nyz])) * grid.dx2i +

					(S11 * (V[idx + grid.nz] - V[idx] - V[idx] + V[idx - grid.nz]) +
					S22 * (V[idx + 3 * grid.nz] - V[idx] - V[idx] + V[idx - 3 * grid.nz]) -
					S12 * (V[idx + 2 * grid.nz] - V[idx + grid.nz] - V[idx - grid.nz] + V[idx - 2 * grid.nz])) * grid.dy2i +

					((V[idx + 1] - V[idx]) * grid.dzp2i[k]
					- (V[idx] - V[idx - 1]) * grid.dzm2i[k]));
			}
		}
	}
}
+
template< typename T >
void nse::w_add_diffusion_x4(
	T* _RESTRICT Winterm,
	const T* _RESTRICT const W,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// Accumulate the 4th-order (x4) diffusion operator for W:
	//   Winterm += c_viscosity * (Dxx(W) + Dyy(W) + Dzz(W))
	// NOTE(review): the z metrics appear as (dzm2i[k], dzp2i[k - 1]), the
	// opposite order of the u/v/c variants -- presumably because W is staggered
	// in z; confirm against the grid metric definitions.
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	const T S11 = C1 * C1, S22 = C2 * C2,
		S12 = (T) 2.0 * C1 * C2;

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Winterm[idx] += c_viscosity * (

					(S11 * (W[idx + grid.nyz] - W[idx] - W[idx] + W[idx - grid.nyz]) +
					S22 * (W[idx + 3 * grid.nyz] - W[idx] - W[idx] + W[idx - 3 * grid.nyz]) -
					S12 * (W[idx + 2 * grid.nyz] - W[idx + grid.nyz] - W[idx - grid.nyz] + W[idx - 2 * grid.nyz])) * grid.dx2i +

					(S11 * (W[idx + grid.nz] - W[idx] - W[idx] + W[idx - grid.nz]) +
					S22 * (W[idx + 3 * grid.nz] - W[idx] - W[idx] + W[idx - 3 * grid.nz]) -
					S12 * (W[idx + 2 * grid.nz] - W[idx + grid.nz] - W[idx - grid.nz] + W[idx - 2 * grid.nz])) * grid.dy2i +

					((W[idx + 1] - W[idx]) * grid.dzm2i[k]
					- (W[idx] - W[idx - 1]) * grid.dzp2i[k - 1]));
			}
		}
	}
}
+
template< typename T >
void nse::c_add_diffusion_x4(
	T* _RESTRICT Xinterm,
	const T* _RESTRICT const X,
	const T c_diffusivity, const wstGrid3d< T >& grid)
{
	// Accumulate the 4th-order (x4) diffusion operator for scalar X:
	//   Xinterm += c_diffusivity * (Dxx(X) + Dyy(X) + Dzz(X))
	// Same stencil as u/v_add_diffusion_x4 with the scalar diffusivity.
	// Interior cells only.
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	const T S11 = C1 * C1, S22 = C2 * C2,
		S12 = (T) 2.0 * C1 * C2;

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Xinterm[idx] += c_diffusivity * (

					(S11 * (X[idx + grid.nyz] - X[idx] - X[idx] + X[idx - grid.nyz]) +
					S22 * (X[idx + 3 * grid.nyz] - X[idx] - X[idx] + X[idx - 3 * grid.nyz]) -
					S12 * (X[idx + 2 * grid.nyz] - X[idx + grid.nyz] - X[idx - grid.nyz] + X[idx - 2 * grid.nyz])) * grid.dx2i +

					(S11 * (X[idx + grid.nz] - X[idx] - X[idx] + X[idx - grid.nz]) +
					S22 * (X[idx + 3 * grid.nz] - X[idx] - X[idx] + X[idx - 3 * grid.nz]) -
					S12 * (X[idx + 2 * grid.nz] - X[idx + grid.nz] - X[idx - grid.nz] + X[idx - 2 * grid.nz])) * grid.dy2i +

					((X[idx + 1] - X[idx]) * grid.dzp2i[k]
					- (X[idx] - X[idx - 1]) * grid.dzm2i[k]));
			}
		}
	}
}
+
+
template< typename T >
void nse::u_set_diffusion_x4(
	T* _RESTRICT Uinterm,
	const T* _RESTRICT const U,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// Same operator as u_add_diffusion_x4 but OVERWRITES the output:
	//   Uinterm = c_viscosity * (Dxx(U) + Dyy(U) + Dzz(U))
	// Interior cells only.
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	const T S11 = C1 * C1, S22 = C2 * C2,
		S12 = (T) 2.0 * C1 * C2;

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Uinterm[idx] = c_viscosity * (

					(S11 * (U[idx + grid.nyz] - U[idx] - U[idx] + U[idx - grid.nyz]) +
					S22 * (U[idx + 3 * grid.nyz] - U[idx] - U[idx] + U[idx - 3 * grid.nyz]) -
					S12 * (U[idx + 2 * grid.nyz] - U[idx + grid.nyz] - U[idx - grid.nyz] + U[idx - 2 * grid.nyz])) * grid.dx2i +

					(S11 * (U[idx + grid.nz] - U[idx] - U[idx] + U[idx - grid.nz]) +
					S22 * (U[idx + 3 * grid.nz] - U[idx] - U[idx] + U[idx - 3 * grid.nz]) -
					S12 * (U[idx + 2 * grid.nz] - U[idx + grid.nz] - U[idx - grid.nz] + U[idx - 2 * grid.nz])) * grid.dy2i +

					((U[idx + 1] - U[idx]) * grid.dzp2i[k]
					- (U[idx] - U[idx - 1]) * grid.dzm2i[k]));
			}
		}
	}
}
+
template< typename T >
void nse::v_set_diffusion_x4(
	T* _RESTRICT Vinterm,
	const T* _RESTRICT const V,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// Same operator as v_add_diffusion_x4 but OVERWRITES the output:
	//   Vinterm = c_viscosity * (Dxx(V) + Dyy(V) + Dzz(V))
	// Interior cells only.
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	const T S11 = C1 * C1, S22 = C2 * C2,
		S12 = (T) 2.0 * C1 * C2;

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vinterm[idx] = c_viscosity * (

					(S11 * (V[idx + grid.nyz] - V[idx] - V[idx] + V[idx - grid.nyz]) +
					S22 * (V[idx + 3 * grid.nyz] - V[idx] - V[idx] + V[idx - 3 * grid.nyz]) -
					S12 * (V[idx + 2 * grid.nyz] - V[idx + grid.nyz] - V[idx - grid.nyz] + V[idx - 2 * grid.nyz])) * grid.dx2i +

					(S11 * (V[idx + grid.nz] - V[idx] - V[idx] + V[idx - grid.nz]) +
					S22 * (V[idx + 3 * grid.nz] - V[idx] - V[idx] + V[idx - 3 * grid.nz]) -
					S12 * (V[idx + 2 * grid.nz] - V[idx + grid.nz] - V[idx - grid.nz] + V[idx - 2 * grid.nz])) * grid.dy2i +

					((V[idx + 1] - V[idx]) * grid.dzp2i[k]
					- (V[idx] - V[idx - 1]) * grid.dzm2i[k]));
			}
		}
	}
}
+
template< typename T >
void nse::w_set_diffusion_x4(
	T* _RESTRICT Winterm,
	const T* _RESTRICT const W,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// Same operator as w_add_diffusion_x4 but OVERWRITES the output:
	//   Winterm = c_viscosity * (Dxx(W) + Dyy(W) + Dzz(W))
	// NOTE(review): z metrics are (dzm2i[k], dzp2i[k - 1]) -- opposite order of
	// the u/v/c variants, matching w_add_diffusion_x4; presumably due to the
	// z-staggering of W (confirm against grid metric definitions).
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	const T S11 = C1 * C1, S22 = C2 * C2,
		S12 = (T) 2.0 * C1 * C2;

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Winterm[idx] = c_viscosity * (

					(S11 * (W[idx + grid.nyz] - W[idx] - W[idx] + W[idx - grid.nyz]) +
					S22 * (W[idx + 3 * grid.nyz] - W[idx] - W[idx] + W[idx - 3 * grid.nyz]) -
					S12 * (W[idx + 2 * grid.nyz] - W[idx + grid.nyz] - W[idx - grid.nyz] + W[idx - 2 * grid.nyz])) * grid.dx2i +

					(S11 * (W[idx + grid.nz] - W[idx] - W[idx] + W[idx - grid.nz]) +
					S22 * (W[idx + 3 * grid.nz] - W[idx] - W[idx] + W[idx - 3 * grid.nz]) -
					S12 * (W[idx + 2 * grid.nz] - W[idx + grid.nz] - W[idx - grid.nz] + W[idx - 2 * grid.nz])) * grid.dy2i +

					((W[idx + 1] - W[idx]) * grid.dzm2i[k]
					- (W[idx] - W[idx - 1]) * grid.dzp2i[k - 1]));
			}
		}
	}
}
+
+template< typename T >
+void nse::c_set_diffusion_x4(
+	T* _RESTRICT Xinterm,
+	const T* _RESTRICT const X,
+	const T c_diffusivity, const wstGrid3d< T >& grid)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0;
+	const T S11 = C1 * C1, S22 = C2 * C2,
+		S12 = (T) 2.0 * C1 * C2;
+
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Xinterm[idx] = c_diffusivity * (
+
+					(S11 * (X[idx + grid.nyz] - X[idx] - X[idx] + X[idx - grid.nyz]) +
+					S22 * (X[idx + 3 * grid.nyz] - X[idx] - X[idx] + X[idx - 3 * grid.nyz]) -
+					S12 * (X[idx + 2 * grid.nyz] - X[idx + grid.nyz] - X[idx - grid.nyz] + X[idx - 2 * grid.nyz])) * grid.dx2i +
+
+					(S11 * (X[idx + grid.nz] - X[idx] - X[idx] + X[idx - grid.nz]) +
+					S22 * (X[idx + 3 * grid.nz] - X[idx] - X[idx] + X[idx - 3 * grid.nz]) -
+					S12 * (X[idx + 2 * grid.nz] - X[idx + grid.nz] - X[idx - grid.nz] + X[idx - 2 * grid.nz])) * grid.dy2i +
+
+					((X[idx + 1] - X[idx]) * grid.dzp2i[k]
+					- (X[idx] - X[idx - 1]) * grid.dzm2i[k]));
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * dissipation operator [ := ] * //
+// ------------------------------------------------------------------------ //
template< typename T >
void nse::u_dissipation_x4(
	T* _RESTRICT Uinterm, const T* _RESTRICT const U,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// Dissipation diagnostic for U: the field times its own x4 diffusion
	// operator, OVERWRITING the output:
	//   Uinterm = c_viscosity * U * (Dxx(U) + Dyy(U) + Dzz(U))
	// Same stencil as u_set_diffusion_x4 with an extra U[idx] factor.
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	const T S11 = C1 * C1, S22 = C2 * C2,
		S12 = (T) 2.0 * C1 * C2;

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Uinterm[idx] = c_viscosity * (
					U[idx] * (

					(S11 * (U[idx + grid.nyz] - U[idx] - U[idx] + U[idx - grid.nyz]) +
					S22 * (U[idx + 3 * grid.nyz] - U[idx] - U[idx] + U[idx - 3 * grid.nyz]) -
					S12 * (U[idx + 2 * grid.nyz] - U[idx + grid.nyz] - U[idx - grid.nyz] + U[idx - 2 * grid.nyz])) * grid.dx2i +

					(S11 * (U[idx + grid.nz] - U[idx] - U[idx] + U[idx - grid.nz]) +
					S22 * (U[idx + 3 * grid.nz] - U[idx] - U[idx] + U[idx - 3 * grid.nz]) -
					S12 * (U[idx + 2 * grid.nz] - U[idx + grid.nz] - U[idx - grid.nz] + U[idx - 2 * grid.nz])) * grid.dy2i +

					((U[idx + 1] - U[idx]) * grid.dzp2i[k] - (U[idx] - U[idx - 1]) * grid.dzm2i[k])));
			}
		}
	}
}
+
template< typename T >
void nse::v_dissipation_x4(
	T* _RESTRICT Vinterm, const T* _RESTRICT const V,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// Dissipation diagnostic for V: the field times its own x4 diffusion
	// operator, OVERWRITING the output:
	//   Vinterm = c_viscosity * V * (Dxx(V) + Dyy(V) + Dzz(V))
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	const T S11 = C1 * C1, S22 = C2 * C2,
		S12 = (T) 2.0 * C1 * C2;

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vinterm[idx] = c_viscosity * (
					V[idx] * (

					(S11 * (V[idx + grid.nyz] - V[idx] - V[idx] + V[idx - grid.nyz]) +
					S22 * (V[idx + 3 * grid.nyz] - V[idx] - V[idx] + V[idx - 3 * grid.nyz]) -
					S12 * (V[idx + 2 * grid.nyz] - V[idx + grid.nyz] - V[idx - grid.nyz] + V[idx - 2 * grid.nyz])) * grid.dx2i +

					(S11 * (V[idx + grid.nz] - V[idx] - V[idx] + V[idx - grid.nz]) +
					S22 * (V[idx + 3 * grid.nz] - V[idx] - V[idx] + V[idx - 3 * grid.nz]) -
					S12 * (V[idx + 2 * grid.nz] - V[idx + grid.nz] - V[idx - grid.nz] + V[idx - 2 * grid.nz])) * grid.dy2i +

					((V[idx + 1] - V[idx]) * grid.dzp2i[k] - (V[idx] - V[idx - 1]) * grid.dzm2i[k])));
			}
		}
	}
}
+
template< typename T >
void nse::w_dissipation_x4(
	T* _RESTRICT Winterm, const T* _RESTRICT const W,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// Dissipation diagnostic for W: the field times its own x4 diffusion
	// operator, OVERWRITING the output:
	//   Winterm = c_viscosity * W * (Dxx(W) + Dyy(W) + Dzz(W))
	// z metrics follow w_set_diffusion_x4: (dzm2i[k], dzp2i[k - 1]).
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	const T S11 = C1 * C1, S22 = C2 * C2,
		S12 = (T) 2.0 * C1 * C2;

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Winterm[idx] = c_viscosity * (
					W[idx] * (

					(S11 * (W[idx + grid.nyz] - W[idx] - W[idx] + W[idx - grid.nyz]) +
					S22 * (W[idx + 3 * grid.nyz] - W[idx] - W[idx] + W[idx - 3 * grid.nyz]) -
					S12 * (W[idx + 2 * grid.nyz] - W[idx + grid.nyz] - W[idx - grid.nyz] + W[idx - 2 * grid.nyz])) * grid.dx2i +

					(S11 * (W[idx + grid.nz] - W[idx] - W[idx] + W[idx - grid.nz]) +
					S22 * (W[idx + 3 * grid.nz] - W[idx] - W[idx] + W[idx - 3 * grid.nz]) -
					S12 * (W[idx + 2 * grid.nz] - W[idx + grid.nz] - W[idx - grid.nz] + W[idx - 2 * grid.nz])) * grid.dy2i +

					((W[idx + 1] - W[idx]) * grid.dzm2i[k] - (W[idx] - W[idx - 1]) * grid.dzp2i[k - 1])));
			}
		}
	}
}
+
template< typename T >
void nse::c_dissipation_x4(
	T* _RESTRICT Xinterm, const T* _RESTRICT const X,
	const T c_diffusivity, const wstGrid3d< T >& grid)
{
	// Dissipation diagnostic for scalar X: the field times its own x4
	// diffusion operator, OVERWRITING the output:
	//   Xinterm = c_diffusivity * X * (Dxx(X) + Dyy(X) + Dzz(X))
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	const T S11 = C1 * C1, S22 = C2 * C2,
		S12 = (T) 2.0 * C1 * C2;

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Xinterm[idx] = c_diffusivity * (
					X[idx] * (

					(S11 * (X[idx + grid.nyz] - X[idx] - X[idx] + X[idx - grid.nyz]) +
					S22 * (X[idx + 3 * grid.nyz] - X[idx] - X[idx] + X[idx - 3 * grid.nyz]) -
					S12 * (X[idx + 2 * grid.nyz] - X[idx + grid.nyz] - X[idx - grid.nyz] + X[idx - 2 * grid.nyz])) * grid.dx2i +

					(S11 * (X[idx + grid.nz] - X[idx] - X[idx] + X[idx - grid.nz]) +
					S22 * (X[idx + 3 * grid.nz] - X[idx] - X[idx] + X[idx - 3 * grid.nz]) -
					S12 * (X[idx + 2 * grid.nz] - X[idx + grid.nz] - X[idx - grid.nz] + X[idx - 2 * grid.nz])) * grid.dy2i +

					((X[idx + 1] - X[idx]) * grid.dzp2i[k] - (X[idx] - X[idx - 1]) * grid.dzm2i[k])));
			}
		}
	}
}
+
template< typename T >
void nse::uw_dissipation_x4(T* _RESTRICT UWinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const W,
	const T* _RESTRICT const U_diffusion,
	const T* _RESTRICT const W_diffusion,
	const wstGrid3d< T >& grid)
{
	// Cross-term (u'w') dissipation, OVERWRITING the output:
	//   UWinterm = 0.25 * [ <W>_x4 * <U_diff>_z2 + <U>_z2 * <W_diff>_x4 ]
	// where <.>_x4 is the 4-point x-interpolation with weights
	// (C1, _3C2) = (9/8, 1/8) and <.>_z2 the 2-point z average (idx, idx - 1).
	// U_diffusion / W_diffusion are precomputed diffusion operators
	// (see u/w_set_diffusion_x4). Interior cells only.
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UWinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UWinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				UWinterm[idx] = (T)0.25 * (
					(C1 * (W[idx] + W[idx - grid.nyz]) - _3C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz])) * 
					(U_diffusion[idx] + U_diffusion[idx - 1]) 
					+
					(U[idx] + U[idx - 1]) * 
					(C1 * (W_diffusion[idx] + W_diffusion[idx - grid.nyz]) - _3C2 * (W_diffusion[idx + grid.nyz] + W_diffusion[idx - 2 * grid.nyz]))
					);
			}
		}
	}
}
+
template< typename T >
void nse::vw_dissipation_x4(T* _RESTRICT VWinterm,
	const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T* _RESTRICT const V_diffusion,
	const T* _RESTRICT const W_diffusion,
	const wstGrid3d< T >& grid)
{
	// Cross-term (v'w') dissipation, OVERWRITING the output:
	//   VWinterm = 0.25 * [ <W>_y4 * <V_diff>_z2 + <V>_z2 * <W_diff>_y4 ]
	// Mirrors uw_dissipation_x4 with the 4-point interpolation taken in y
	// (stride grid.nz) instead of x. Interior cells only.
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row start index, advanced by nz per j iteration
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VWinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VWinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				VWinterm[idx] = (T)0.25 * (
					(C1 * (W[idx] + W[idx - grid.nz]) - _3C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz])) * 
					(V_diffusion[idx] + V_diffusion[idx - 1]) 
					+
					(V[idx] + V[idx - 1]) * 
					(C1 * (W_diffusion[idx] + W_diffusion[idx - grid.nz]) - _3C2 * (W_diffusion[idx + grid.nz] + W_diffusion[idx - 2 * grid.nz]))
					);
			}
		}
	}
}
+
template< typename T >
void nse::uv_dissipation_x4(T* _RESTRICT UVinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V,
	const T* _RESTRICT const U_diffusion,
	const T* _RESTRICT const V_diffusion,
	const wstGrid3d< T >& grid)
{
	// U-V dissipation cross-term over the grid interior (ghost layers excluded):
	//   UVinterm = 1/4 * ( <U>_y4 * <V_diff>_x4  +  <V>_x4 * <U_diff>_y4 )
	// with 4th-order staggered averages in y (stride grid.nz) and x
	// (stride grid.nyz), weights 9/8 and -1/8 (halves folded into the 0.25).
	// Unlike the *W variants, both factors here use the 4th-order average
	// (no 2-point z average), since both U and V are horizontal components.
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;	// presumably 3 * C2 with C2 = 1/24 -- confirm
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	// linear index layout: idx = i * grid.nyz + j * grid.nz + k
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UVinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UVinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				UVinterm[idx] = (T)0.25 * (
					(C1 * (U[idx] + U[idx - grid.nz]) - _3C2 * (U[idx + grid.nz] + U[idx - 2 * grid.nz])) *
					(C1 * (V_diffusion[idx] + V_diffusion[idx - grid.nyz]) - _3C2 * (V_diffusion[idx + grid.nyz] + V_diffusion[idx - 2 * grid.nyz]))
					+
					(C1 * (V[idx] + V[idx - grid.nyz]) - _3C2 * (V[idx + grid.nyz] + V[idx - 2 * grid.nyz])) *
					(C1 * (U_diffusion[idx] + U_diffusion[idx - grid.nz]) - _3C2 * (U_diffusion[idx + grid.nz] + U_diffusion[idx - 2 * grid.nz]))
					);
			}
		}
	}
}
+
template< typename T >
void nse::cu_dissipation_x4(T* _RESTRICT CUinterm,
	const T* _RESTRICT const X, const T* _RESTRICT const U,
	const T* _RESTRICT const X_diffusion,
	const T* _RESTRICT const U_diffusion,
	const wstGrid3d< T >& grid)
{
	// scalar-U dissipation cross-term over the grid interior:
	//   CUinterm = 1/2 * ( <X>_x4 * U_diff  +  U * <X_diff>_x4 )
	// <.>_x4 is a 4th-order staggered average in x (stride grid.nyz,
	// weights 9/8 and -1/8; the averaging 1/2 is the leading 0.5), which
	// moves the cell-centered scalar onto the U location.
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;	// presumably 3 * C2 with C2 = 1/24 -- confirm
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	// linear index layout: idx = i * grid.nyz + j * grid.nz + k
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CUinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CUinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				CUinterm[idx] = (T)0.5 * (
					(C1 * (X[idx] + X[idx - grid.nyz]) - _3C2 * (X[idx + grid.nyz] + X[idx - 2 * grid.nyz])) *
					U_diffusion[idx] 
					+
					U[idx] * 
					(C1 * (X_diffusion[idx] + X_diffusion[idx - grid.nyz]) - _3C2 * (X_diffusion[idx + grid.nyz] + X_diffusion[idx - 2 * grid.nyz]))
					);
			}
		}
	}
}
+
template< typename T >
void nse::cv_dissipation_x4(T* _RESTRICT CVinterm,
	const T* _RESTRICT const X, const T* _RESTRICT const V,
	const T* _RESTRICT const X_diffusion,
	const T* _RESTRICT const V_diffusion,
	const wstGrid3d< T >& grid)
{
	// scalar-V dissipation cross-term over the grid interior:
	//   CVinterm = 1/2 * ( <X>_y4 * V_diff  +  V * <X_diff>_y4 )
	// <.>_y4 is a 4th-order staggered average in y (stride grid.nz,
	// weights 9/8 and -1/8; the averaging 1/2 is the leading 0.5), which
	// moves the cell-centered scalar onto the V location.
	// Mirrors cu_dissipation_x4 with y-stride instead of x-stride.
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;	// presumably 3 * C2 with C2 = 1/24 -- confirm
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CVinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CVinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				CVinterm[idx] = (T)0.5 * (
					(C1 * (X[idx] + X[idx - grid.nz]) - _3C2 * (X[idx + grid.nz] + X[idx - 2 * grid.nz])) *
					V_diffusion[idx] 
					+
					V[idx] * 
					(C1 * (X_diffusion[idx] + X_diffusion[idx - grid.nz]) - _3C2 * (X_diffusion[idx + grid.nz] + X_diffusion[idx - 2 * grid.nz]))
					);
			}
		}
	}
}
+
+template< typename T >
+void nse::cw_dissipation_x4(T* _RESTRICT CWinterm,
+	const T* _RESTRICT const X, const T* _RESTRICT const W,
+	const T* _RESTRICT const X_diffusion,
+	const T* _RESTRICT const W_diffusion,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CWinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CWinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				CWinterm[idx] = (T)0.5 * (
+					(X[idx] + X[idx - 1]) * W_diffusion[idx] +
+					W[idx] * (X_diffusion[idx] + X_diffusion[idx - 1]));
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * isotropic dissipation operator [ := ] * //
+// ------------------------------------------------------------------------ //
template< typename T >
void nse::u_iso_dissipation_x4(
	T* _RESTRICT Uinterm, const T* _RESTRICT const U,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// isotropic dissipation estimate for U over the grid interior:
	// sums SQUARES of forward and backward 4th-order differences of U in x
	// (stride grid.nyz) and y (stride grid.nz), plus squares of 2-point
	// differences in z, each scaled by an inverse-squared spacing:
	//   dx2ih / dy2ih      -- presumably 0.5 / dx^2, 0.5 / dy^2 ('h' = half) -- confirm
	//   dzp2i[k]/dzm2i[k]  -- presumably per-level 1/dz^2 up/down (stretched z) -- confirm
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;	// standard 4th-order staggered-difference weights
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				// each squared difference is written out twice (no pow, no temp)
				Uinterm[idx] = c_viscosity * (
					(
					(C1 * (U[idx + grid.nyz] - U[idx]) - C2 * (U[idx + 2 * grid.nyz] - U[idx - grid.nyz])) *
					(C1 * (U[idx + grid.nyz] - U[idx]) - C2 * (U[idx + 2 * grid.nyz] - U[idx - grid.nyz]))
					+
					(C1 * (U[idx] - U[idx - grid.nyz]) - C2 * (U[idx + grid.nyz] - U[idx - 2 * grid.nyz])) *
					(C1 * (U[idx] - U[idx - grid.nyz]) - C2 * (U[idx + grid.nyz] - U[idx - 2 * grid.nyz]))
					) * grid.dx2ih +

					(
					(C1 * (U[idx + grid.nz] - U[idx]) - C2 * (U[idx + 2 * grid.nz] - U[idx - grid.nz])) *
					(C1 * (U[idx + grid.nz] - U[idx]) - C2 * (U[idx + 2 * grid.nz] - U[idx - grid.nz]))
					+
					(C1 * (U[idx] - U[idx - grid.nz]) - C2 * (U[idx + grid.nz] - U[idx - 2 * grid.nz])) *
					(C1 * (U[idx] - U[idx - grid.nz]) - C2 * (U[idx + grid.nz] - U[idx - 2 * grid.nz]))
					) * grid.dy2ih +

					(T)0.5*(U[idx + 1] - U[idx]) * (U[idx + 1] - U[idx]) * grid.dzp2i[k] +
					(T)0.5*(U[idx] - U[idx - 1]) * (U[idx] - U[idx - 1]) * grid.dzm2i[k]
					);
			}
		}
	}
}
+
template< typename T >
void nse::v_iso_dissipation_x4(
	T* _RESTRICT Vinterm, const T* _RESTRICT const V,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// isotropic dissipation estimate for V over the grid interior;
	// identical structure to u_iso_dissipation_x4 applied to V:
	// squared 4th-order differences in x (stride grid.nyz) and y (stride
	// grid.nz) scaled by dx2ih/dy2ih, plus squared 2-point z differences
	// scaled by the per-level dzp2i[k]/dzm2i[k] factors (stretched z grid).
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;	// standard 4th-order staggered-difference weights
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vinterm[idx] = c_viscosity * (
					(
					(C1 * (V[idx + grid.nyz] - V[idx]) - C2 * (V[idx + 2 * grid.nyz] - V[idx - grid.nyz])) *
					(C1 * (V[idx + grid.nyz] - V[idx]) - C2 * (V[idx + 2 * grid.nyz] - V[idx - grid.nyz]))
					+
					(C1 * (V[idx] - V[idx - grid.nyz]) - C2 * (V[idx + grid.nyz] - V[idx - 2 * grid.nyz])) *
					(C1 * (V[idx] - V[idx - grid.nyz]) - C2 * (V[idx + grid.nyz] - V[idx - 2 * grid.nyz]))
					) * grid.dx2ih +

					(
					(C1 * (V[idx + grid.nz] - V[idx]) - C2 * (V[idx + 2 * grid.nz] - V[idx - grid.nz])) *
					(C1 * (V[idx + grid.nz] - V[idx]) - C2 * (V[idx + 2 * grid.nz] - V[idx - grid.nz]))
					+
					(C1 * (V[idx] - V[idx - grid.nz]) - C2 * (V[idx + grid.nz] - V[idx - 2 * grid.nz])) *
					(C1 * (V[idx] - V[idx - grid.nz]) - C2 * (V[idx + grid.nz] - V[idx - 2 * grid.nz]))
					) * grid.dy2ih +

					(T)0.5*(V[idx + 1] - V[idx]) * (V[idx + 1] - V[idx]) * grid.dzp2i[k] +
					(T)0.5*(V[idx] - V[idx - 1]) * (V[idx] - V[idx - 1]) * grid.dzm2i[k]
					);
			}
		}
	}
}
+
template< typename T >
void nse::w_iso_dissipation_x4(
	T* _RESTRICT Winterm, const T* _RESTRICT const W,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// isotropic dissipation estimate for W over the grid interior;
	// same x/y structure as u_iso_dissipation_x4, but the z terms use
	// dzm2i[k] for the upward difference and dzp2i[k - 1] for the downward
	// one -- presumably because W lives on w-levels, so the relevant cell
	// spacings are shifted half a level relative to U/V; confirm against
	// the wstGrid3d definitions of dzp2i/dzm2i.
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;	// standard 4th-order staggered-difference weights
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Winterm[idx] = c_viscosity * (
					(
					(C1 * (W[idx + grid.nyz] - W[idx]) - C2 * (W[idx + 2 * grid.nyz] - W[idx - grid.nyz])) *
					(C1 * (W[idx + grid.nyz] - W[idx]) - C2 * (W[idx + 2 * grid.nyz] - W[idx - grid.nyz]))
					+
					(C1 * (W[idx] - W[idx - grid.nyz]) - C2 * (W[idx + grid.nyz] - W[idx - 2 * grid.nyz])) *
					(C1 * (W[idx] - W[idx - grid.nyz]) - C2 * (W[idx + grid.nyz] - W[idx - 2 * grid.nyz]))
					) * grid.dx2ih +

					(
					(C1 * (W[idx + grid.nz] - W[idx]) - C2 * (W[idx + 2 * grid.nz] - W[idx - grid.nz])) *
					(C1 * (W[idx + grid.nz] - W[idx]) - C2 * (W[idx + 2 * grid.nz] - W[idx - grid.nz]))
					+
					(C1 * (W[idx] - W[idx - grid.nz]) - C2 * (W[idx + grid.nz] - W[idx - 2 * grid.nz])) *
					(C1 * (W[idx] - W[idx - grid.nz]) - C2 * (W[idx + grid.nz] - W[idx - 2 * grid.nz]))
					) * grid.dy2ih +

					// note the staggered z factors (vs dzp2i[k]/dzm2i[k] in the U/V variants)
					(T)0.5*(W[idx + 1] - W[idx]) * (W[idx + 1] - W[idx]) * grid.dzm2i[k] +
					(T)0.5*(W[idx] - W[idx - 1]) * (W[idx] - W[idx - 1]) * grid.dzp2i[k - 1]
					);
			}
		}
	}
}
+
template< typename T >
void nse::c_iso_dissipation_x4(
	T* _RESTRICT Xinterm, const T* _RESTRICT const X,
	const T c_diffusivity, const wstGrid3d< T >& grid)
{
	// isotropic dissipation estimate for the cell-centered scalar X over the
	// grid interior; same structure as u_iso_dissipation_x4 with the
	// diffusivity in place of the viscosity: squared 4th-order differences
	// in x/y plus squared 2-point differences in z.
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;	// standard 4th-order staggered-difference weights
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Xinterm[idx] = c_diffusivity * (
					(
					(C1 * (X[idx + grid.nyz] - X[idx]) - C2 * (X[idx + 2 * grid.nyz] - X[idx - grid.nyz])) *
					(C1 * (X[idx + grid.nyz] - X[idx]) - C2 * (X[idx + 2 * grid.nyz] - X[idx - grid.nyz]))
					+
					(C1 * (X[idx] - X[idx - grid.nyz]) - C2 * (X[idx + grid.nyz] - X[idx - 2 * grid.nyz])) *
					(C1 * (X[idx] - X[idx - grid.nyz]) - C2 * (X[idx + grid.nyz] - X[idx - 2 * grid.nyz]))
					) * grid.dx2ih +

					(
					(C1 * (X[idx + grid.nz] - X[idx]) - C2 * (X[idx + 2 * grid.nz] - X[idx - grid.nz])) *
					(C1 * (X[idx + grid.nz] - X[idx]) - C2 * (X[idx + 2 * grid.nz] - X[idx - grid.nz]))
					+
					(C1 * (X[idx] - X[idx - grid.nz]) - C2 * (X[idx + grid.nz] - X[idx - 2 * grid.nz])) *
					(C1 * (X[idx] - X[idx - grid.nz]) - C2 * (X[idx + grid.nz] - X[idx - 2 * grid.nz]))
					) * grid.dy2ih +

					(T)0.5*(X[idx + 1] - X[idx]) * (X[idx + 1] - X[idx]) * grid.dzp2i[k] +
					(T)0.5*(X[idx] - X[idx - 1]) * (X[idx] - X[idx - 1]) * grid.dzm2i[k]
					);
			}
		}
	}
}
+
template< typename T >
void nse::uv_iso_dissipation_components_x4(
	// NOTE(review): original node annotations read [W]/[UVW]/[U] -- they look
	// copy-pasted from uw_iso_dissipation_components_x4; corrected below to
	// match the consumer uv_iso_dissipation_x4 -- confirm.
	T* _RESTRICT UVinterm_x,			// node: [V]
	T* _RESTRICT UVinterm_y,			// node: [U]
	T* _RESTRICT UVinterm_z,			// node: [UVW]

	const T* _RESTRICT const U, const T* _RESTRICT const V,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// products of x-, y- and z-derivatives of U and V, interpolated to
	// common nodes with nested 4th-order staggered averages/differences;
	// consumed by uv_iso_dissipation_x4.
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0,
		_3C2 = (T) 1.0 / (T) 8.0;	// = 3 * C2
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UVinterm_x, UVinterm_y, UVinterm_z ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UVinterm_x, UVinterm_y, UVinterm_z )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			// NOTE(review): inclusive upper bound (k <= nz - gcz), unlike the
			// sibling uw/vw variants -- presumably to produce the extra z-level
			// read by uv_iso_dissipation_x4 via UVinterm_z[idx + 1]; confirm.
			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

				// (dU/dx) * (dV/dx), x-strides grid.nyz, scaled by dx2iq
				// (presumably 0.25 / dx^2, 'q' = quarter -- confirm)
				UVinterm_x[idx] = (T)2.0 * c_viscosity * (
					(
					C1 * (
					C1 * ((U[idx + grid.nyz] - U[idx]) + (U[idx + grid.nyz - grid.nz] - U[idx - grid.nz])) -
					C2 * ((U[idx + 2 * grid.nyz] - U[idx - grid.nyz]) + (U[idx + 2 * grid.nyz - grid.nz] - U[idx - grid.nyz - grid.nz])))
					- _3C2 * (
					C1 * ((U[idx + grid.nyz + grid.nz] - U[idx + grid.nz]) + (U[idx + grid.nyz - 2 * grid.nz] - U[idx - 2 * grid.nz])) -
					C2 * ((U[idx + 2 * grid.nyz + grid.nz] - U[idx - grid.nyz + grid.nz]) + (U[idx + 2 * grid.nyz - 2 * grid.nz] - U[idx - grid.nyz - 2 * grid.nz])))
					) *
					(
					C1 * (
					C1 * ((V[idx + grid.nyz] - V[idx]) + (V[idx] - V[idx - grid.nyz])) -
					C2 * ((V[idx + 2 * grid.nyz] - V[idx - grid.nyz]) + (V[idx + grid.nyz] - V[idx - 2 * grid.nyz])))
					- _3C2 * (
					C1 * ((V[idx + 2 * grid.nyz] - V[idx + grid.nyz]) + (V[idx - grid.nyz] - V[idx - 2 * grid.nyz])) -
					C2 * ((V[idx + 3 * grid.nyz] - V[idx]) + (V[idx] - V[idx - 3 * grid.nyz])))
					) * grid.dx2iq);

				// (dU/dy) * (dV/dy), y-strides grid.nz, scaled by dy2iq
				UVinterm_y[idx] = (T)2.0 * c_viscosity * (
					(
					C1 * (
					C1 * ((U[idx + grid.nz] - U[idx]) + (U[idx] - U[idx - grid.nz])) -
					C2 * ((U[idx + 2 * grid.nz] - U[idx - grid.nz]) + (U[idx + grid.nz] - U[idx - 2 * grid.nz])))
					- _3C2 * (
					C1 * ((U[idx + 2 * grid.nz] - U[idx + grid.nz]) + (U[idx - grid.nz] - U[idx - 2 * grid.nz])) -
					C2 * ((U[idx + 3 * grid.nz] - U[idx]) + (U[idx] - U[idx - 3 * grid.nz])))
					) *
					(
					C1 * (
					C1 * ((V[idx + grid.nz] - V[idx]) + (V[idx - grid.nyz + grid.nz] - V[idx - grid.nyz])) -
					C2 * ((V[idx + 2 * grid.nz] - V[idx - grid.nz]) + (V[idx - grid.nyz + 2 * grid.nz] - V[idx - grid.nyz - grid.nz])))
					- _3C2 * (
					C1 * ((V[idx + grid.nyz + grid.nz] - V[idx + grid.nyz]) + (V[idx - 2 * grid.nyz + grid.nz] - V[idx - 2 * grid.nyz])) -
					C2 * ((V[idx + grid.nyz + 2 * grid.nz] - V[idx + grid.nyz - grid.nz]) + (V[idx - 2 * grid.nyz + 2 * grid.nz] - V[idx - 2 * grid.nyz - grid.nz])))
					) * grid.dy2iq);

				// (dU/dz) * (dV/dz): 2-point z differences (stride 1), 4th-order
				// averaged horizontally; scaled by 1/dz_minus^2 at this w-level
				UVinterm_z[idx] = (T)2.0 * c_viscosity * (
					(C1 * ((U[idx] - U[idx - 1]) + (U[idx - grid.nz] - U[idx - grid.nz - 1])) 
					-_3C2 * ((U[idx + grid.nz] - U[idx + grid.nz - 1]) + (U[idx - 2 * grid.nz] - U[idx - 2 * grid.nz - 1]))) *
					
					(C1 * ((V[idx] - V[idx - 1]) + (V[idx - grid.nyz] - V[idx - grid.nyz - 1]))
					- _3C2 * ((V[idx + grid.nyz] - V[idx + grid.nyz - 1]) + (V[idx - 2 * grid.nyz] - V[idx - 2 * grid.nyz - 1])))
					) * grid.dzmi[k] * grid.dzmi[k];
			} 
		}
	}
}
+
template< typename T >
void nse::uw_iso_dissipation_components_x4(
	T* _RESTRICT UWinterm_x,			// node: [W]
	T* _RESTRICT UWinterm_y,			// node: [UVW]
	T* _RESTRICT UWinterm_z,			// node: [U]

	const T* _RESTRICT const U, const T* _RESTRICT const W,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// products of x-, y- and z-derivatives of U and W, interpolated to
	// common nodes with nested 4th-order staggered averages/differences;
	// consumed by uw_iso_dissipation_x4.
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0,
		_3C2 = (T) 1.0 / (T) 8.0;	// = 3 * C2
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UWinterm_x, UWinterm_y, UWinterm_z ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UWinterm_x, UWinterm_y, UWinterm_z )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				// (dU/dx) * (dW/dx): the U factor is a 2-point z average
				// (idx, idx - 1) of x-differences; scaled by dx2iq
				// (presumably 0.25 / dx^2, 'q' = quarter -- confirm)
				UWinterm_x[idx] = (T)2.0 * c_viscosity * (
					(
					C1 * ((U[idx + grid.nyz] - U[idx]) + (U[idx + grid.nyz - 1] - U[idx - 1])) - 
					C2 * ((U[idx + 2 * grid.nyz] - U[idx - grid.nyz]) + (U[idx + 2 * grid.nyz - 1] - U[idx - grid.nyz - 1]))
					) *
					(
					C1 * (
					C1 * ((W[idx + grid.nyz] - W[idx]) + (W[idx] - W[idx - grid.nyz])) -
					C2 * ((W[idx + 2 * grid.nyz] - W[idx - grid.nyz]) + (W[idx + grid.nyz] - W[idx - 2 * grid.nyz])))
					- _3C2 * (
					C1 * ((W[idx + 2 * grid.nyz] - W[idx + grid.nyz]) + (W[idx - grid.nyz] - W[idx - 2 * grid.nyz])) -
					C2 * ((W[idx + 3 * grid.nyz] - W[idx]) + (W[idx] - W[idx - 3 * grid.nyz])))
					) * grid.dx2iq);

				// (dU/dy) * (dW/dy), y-strides grid.nz, scaled by dy2iq
				UWinterm_y[idx] = (T)2.0 * c_viscosity * (
					(
					C1 * ((U[idx] - U[idx - grid.nz]) + (U[idx - 1] - U[idx - grid.nz - 1])) -
					C2 * ((U[idx + grid.nz] - U[idx - 2 * grid.nz]) + (U[idx + grid.nz - 1] - U[idx - 2 * grid.nz - 1]))
					) *
					(
					C1 * (
					C1 * ((W[idx] - W[idx - grid.nz]) + (W[idx - grid.nyz] - W[idx - grid.nyz - grid.nz])) - 
					C2 * ((W[idx + grid.nz] - W[idx - 2 * grid.nz]) + (W[idx - grid.nyz + grid.nz] - W[idx - grid.nyz - 2 * grid.nz])))
					- _3C2 * (
					C1 * ((W[idx + grid.nyz] - W[idx + grid.nyz - grid.nz]) + (W[idx - 2 * grid.nyz] - W[idx - 2 * grid.nyz - grid.nz])) -
					C2 * ((W[idx + grid.nyz + grid.nz] - W[idx + grid.nyz - 2 * grid.nz]) + (W[idx - 2 * grid.nyz + grid.nz] - W[idx - 2 * grid.nyz - 2 * grid.nz])))
					) * grid.dy2iq);

				// (dU/dz) * (dW/dz): U uses per-level up/down inverse spacings
				// (dzpi/dzmi), W a 4th-order x average of z differences; the
				// trailing dzih[k] is presumably 0.5 / dz at this level -- confirm
				UWinterm_z[idx] = (T)2.0 * c_viscosity * (
					((U[idx + 1] - U[idx]) * grid.dzpi[k] + (U[idx] - U[idx - 1]) * grid.dzmi[k]) *
					(
					C1 * ((W[idx + 1] - W[idx]) + (W[idx - grid.nyz + 1] - W[idx - grid.nyz])) -
					_3C2 * ((W[idx + grid.nyz + 1] - W[idx + grid.nyz]) + (W[idx - 2 * grid.nyz + 1] - W[idx - 2 * grid.nyz]))
					) * grid.dzih[k]);
			}
		}
	}
}
+
template< typename T >
void nse::vw_iso_dissipation_components_x4(
	T* _RESTRICT VWinterm_x,			// node: [UVW]
	T* _RESTRICT VWinterm_y,			// node: [W]
	T* _RESTRICT VWinterm_z,			// node: [V]

	const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	// products of x-, y- and z-derivatives of V and W, interpolated to
	// common nodes with nested 4th-order staggered averages/differences;
	// consumed by vw_iso_dissipation_x4. Mirror image (x <-> y) of
	// uw_iso_dissipation_components_x4.
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0,
		_3C2 = (T) 1.0 / (T) 8.0;	// = 3 * C2
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VWinterm_x, VWinterm_y, VWinterm_z ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VWinterm_x, VWinterm_y, VWinterm_z )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				// (dV/dx) * (dW/dx), x-strides grid.nyz, scaled by dx2iq
				// (presumably 0.25 / dx^2, 'q' = quarter -- confirm)
				VWinterm_x[idx] = (T)2.0 * c_viscosity * (
					(
					C1 * ((V[idx] - V[idx - grid.nyz]) + (V[idx - 1] - V[idx - grid.nyz - 1])) -
					C2 * ((V[idx + grid.nyz] - V[idx - 2 * grid.nyz]) + (V[idx + grid.nyz - 1] - V[idx - 2 * grid.nyz - 1]))
					) *
					(
					C1 * (
					C1 * ((W[idx] - W[idx - grid.nyz]) + (W[idx - grid.nz] - W[idx - grid.nyz - grid.nz])) -
					C2 * ((W[idx + grid.nyz] - W[idx - 2 * grid.nyz]) + (W[idx + grid.nyz - grid.nz] - W[idx - 2 * grid.nyz - grid.nz])))
					- _3C2 * (
					C1 * ((W[idx + grid.nz] - W[idx - grid.nyz + grid.nz]) + (W[idx - 2 * grid.nz] - W[idx - grid.nyz - 2 * grid.nz])) -
					C2 * ((W[idx + grid.nyz + grid.nz] - W[idx - 2 * grid.nyz + grid.nz]) + (W[idx + grid.nyz - 2 * grid.nz] - W[idx - 2 * grid.nyz - 2 * grid.nz])))
					) * grid.dx2iq);

				// (dV/dy) * (dW/dy), y-strides grid.nz, scaled by dy2iq
				VWinterm_y[idx] = (T)2.0 * c_viscosity * (
					(
					C1 * ((V[idx + grid.nz] - V[idx]) + (V[idx + grid.nz - 1] - V[idx - 1])) -
					C2 * ((V[idx + 2 * grid.nz] - V[idx - grid.nz]) + (V[idx + 2 * grid.nz - 1] - V[idx - grid.nz - 1]))
					) *
					(
					C1 * (
					C1 * ((W[idx + grid.nz] - W[idx]) + (W[idx] - W[idx - grid.nz])) -
					C2 * ((W[idx + 2 * grid.nz] - W[idx - grid.nz]) + (W[idx + grid.nz] - W[idx - 2 * grid.nz])))
					- _3C2 * (
					C1 * ((W[idx + 2 * grid.nz] - W[idx + grid.nz]) + (W[idx - grid.nz] - W[idx - 2 * grid.nz])) -
					C2 * ((W[idx + 3 * grid.nz] - W[idx]) + (W[idx] - W[idx - 3 * grid.nz])))
					) * grid.dy2iq);

				// (dV/dz) * (dW/dz): V uses per-level up/down inverse spacings
				// (dzpi/dzmi), W a 4th-order y average of z differences, scaled
				// by dzih[k] (presumably 0.5 / dz at this level -- confirm)
				VWinterm_z[idx] = (T)2.0 * c_viscosity * (
					((V[idx + 1] - V[idx]) * grid.dzpi[k] + (V[idx] - V[idx - 1]) * grid.dzmi[k]) *
					(
					C1 * ((W[idx + 1] - W[idx]) + (W[idx - grid.nz + 1] - W[idx - grid.nz])) -
					_3C2 * ((W[idx + grid.nz + 1] - W[idx + grid.nz]) + (W[idx - 2 * grid.nz + 1] - W[idx - 2 * grid.nz]))
					) * grid.dzih[k]);
			}
		}
	}
}
+
template< typename T >
void nse::uv_iso_dissipation_x4(
	T* _RESTRICT UVinterm,					// node: [UV]

	const T* _RESTRICT const UVinterm_x,	// node: [V]
	const T* _RESTRICT const UVinterm_y,	// node: [U]
	const T* _RESTRICT const UVinterm_z,	// node: [UVW]
	const wstGrid3d< T >& grid)
{
	// gathers the three components produced by
	// uv_iso_dissipation_components_x4 onto the [UV] node:
	// 4th-order staggered averages in x (stride grid.nyz) and y (stride
	// grid.nz), and a 2-point z average of the w-level component using
	// idx and idx + 1 (the inclusive k-bound in the producer supplies the
	// extra top level).
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;	// = 3 * (1/24)
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UVinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UVinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				UVinterm[idx] =
					(T)0.5 * (C1 * (UVinterm_x[idx] + UVinterm_x[idx - grid.nyz]) - _3C2 * (UVinterm_x[idx + grid.nyz] + UVinterm_x[idx - 2 * grid.nyz])) +
					(T)0.5 * (C1 * (UVinterm_y[idx] + UVinterm_y[idx - grid.nz]) - _3C2 * (UVinterm_y[idx + grid.nz] + UVinterm_y[idx - 2 * grid.nz])) +
					(T)0.5 * (UVinterm_z[idx] + UVinterm_z[idx + 1]);
			}
		}
	}
}
+
template< typename T >
void nse::uw_iso_dissipation_x4(
	T* _RESTRICT UWinterm,					// node: [UW]

	const T* _RESTRICT const UWinterm_x,	// node: [W]
	const T* _RESTRICT const UWinterm_y,	// node: [UVW]
	const T* _RESTRICT const UWinterm_z,	// node: [U]
	const wstGrid3d< T >& grid)
{
	// gathers the three components produced by
	// uw_iso_dissipation_components_x4 onto the [UW] node:
	// 4th-order staggered averages in x and y, 2-point average in z.
	// note the orientation differences vs the UV variant: the y average is
	// taken over idx and idx + grid.nz, and z over idx and idx - 1 --
	// presumably due to the [UVW]/[U] node staggering; confirm.
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;	// = 3 * (1/24)
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UWinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UWinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				UWinterm[idx] =
					(T)0.5 * (C1 * (UWinterm_x[idx] + UWinterm_x[idx - grid.nyz]) - _3C2 * (UWinterm_x[idx + grid.nyz] + UWinterm_x[idx - 2 * grid.nyz])) +
					(T)0.5 * (C1 * (UWinterm_y[idx] + UWinterm_y[idx + grid.nz]) - _3C2 * (UWinterm_y[idx - grid.nz] + UWinterm_y[idx + 2 * grid.nz])) +
					(T)0.5 * (UWinterm_z[idx] + UWinterm_z[idx - 1]);
			}
		}
	}
}
+
template< typename T >
void nse::vw_iso_dissipation_x4(
	T* _RESTRICT VWinterm,					// node: [VW]

	const T* _RESTRICT const VWinterm_x,	// node: [UVW]
	const T* _RESTRICT const VWinterm_y,	// node: [W]
	const T* _RESTRICT const VWinterm_z,	// node: [V]
	const wstGrid3d< T >& grid)
{
	// gathers the three components produced by
	// vw_iso_dissipation_components_x4 onto the [VW] node:
	// 4th-order staggered averages in x and y, 2-point average in z.
	// the x average is taken over idx and idx + grid.nyz, y over idx and
	// idx - grid.nz, z over idx and idx - 1 -- mirror of the UW variant,
	// presumably fixed by the node staggering; confirm.
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;	// = 3 * (1/24)
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VWinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VWinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				VWinterm[idx] =
					(T)0.5 * (C1 * (VWinterm_x[idx] + VWinterm_x[idx + grid.nyz]) - _3C2 * (VWinterm_x[idx - grid.nyz] + VWinterm_x[idx + 2 * grid.nyz])) +
					(T)0.5 * (C1 * (VWinterm_y[idx] + VWinterm_y[idx - grid.nz]) - _3C2 * (VWinterm_y[idx + grid.nz] + VWinterm_y[idx - 2 * grid.nz])) +
					(T)0.5 * (VWinterm_z[idx] + VWinterm_z[idx - 1]);
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * divergence * //
+// ------------------------------------------------------------------------ //
+template< typename T >
+void nse::divergence_x4(
+	T* _RESTRICT Div,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const wstGrid3d< T >& grid)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Div ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Div )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Div[idx] =
+					(C1 * (U[idx + grid.nyz] - U[idx]) - C2 * (U[idx + 2 * grid.nyz] - U[idx - grid.nyz])) * grid.dxi +
+					(C1 * (V[idx + grid.nz] - V[idx]) - C2 * (V[idx + 2 * grid.nz] - V[idx - grid.nz])) * grid.dyi +
+
+					(W[idx + 1] - W[idx]) * grid.dzi[k];
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * gradient * //
+// ------------------------------------------------------------------------ //
+template< typename T >
+void nse::u_sub_gradient_x4(
+	T* _RESTRICT Uinterm, const T* _RESTRICT const X,
+	const T c_gradient, const wstGrid3d< T >& grid)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+	const T c_grad_x = c_gradient * grid.dxi;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Uinterm[idx] -= c_grad_x * (
+					C1 * (X[idx] - X[idx - grid.nyz]) - C2 * (X[idx + grid.nyz] - X[idx - 2 * grid.nyz]));
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::v_sub_gradient_x4(
+	T* _RESTRICT Vinterm, const T* _RESTRICT const X,
+	const T c_gradient, const wstGrid3d< T >& grid)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+	const T c_grad_y = c_gradient * grid.dyi;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Vinterm[idx] -= c_grad_y * (
+					C1 * (X[idx] - X[idx - grid.nz]) - C2 * (X[idx + grid.nz] - X[idx - 2 * grid.nz]));
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::w_sub_gradient_x4(
+	T* _RESTRICT Winterm, const T* _RESTRICT const X,
+	const T c_gradient, const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+	const T c_grad_z = (T)2.0 * c_gradient;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Winterm[idx] -= c_grad_z *
+					(X[idx] - X[idx - 1]) * grid.dzmi[k];
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * poisson equation rhs * //
+// ------------------------------------------------------------------------ //
+template< typename T >
+void nse::poisson_rhs_x4(
+	T* _RESTRICT Rhs,
+	const T* _RESTRICT const Div,
+	const T* _RESTRICT const Uinterm, const T* _RESTRICT const Vinterm, const T* _RESTRICT const Winterm,
+	const wstGrid3d< T >& grid, const T dt)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	const T idt = (T) 1.0 / dt;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Rhs ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Rhs )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Rhs[idx] = Div[idx] * idt +
+
+					(C1 * (Uinterm[idx + grid.nyz] - Uinterm[idx]) - C2 * (Uinterm[idx + 2 * grid.nyz] - Uinterm[idx - grid.nyz])) * grid.dxi +
+					(C1 * (Vinterm[idx + grid.nz] - Vinterm[idx]) - C2 * (Vinterm[idx + 2 * grid.nz] - Vinterm[idx - grid.nz])) * grid.dyi +
+					(Winterm[idx + 1] - Winterm[idx]) * grid.dzi[k];
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * velocity projection * //
+// ------------------------------------------------------------------------ //
+template< typename T >
+void nse::u_projection_x4(
+	T* _RESTRICT U, const T* _RESTRICT const Uinterm, const T* _RESTRICT const Phi,
+	const wstGrid3d< T >& grid, const T dt)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( U ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( U )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				U[idx] += dt * (Uinterm[idx] -
+					(C1 * (Phi[idx] - Phi[idx - grid.nyz]) -
+					C2 * (Phi[idx + grid.nyz] - Phi[idx - 2 * grid.nyz])) * grid.dxi);
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::v_projection_x4(
+	T* _RESTRICT V, const T* _RESTRICT const Vinterm, const T* _RESTRICT const Phi,
+	const wstGrid3d< T >& grid, const T dt)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( V ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( V )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				V[idx] += dt * (Vinterm[idx] -
+					(C1 * (Phi[idx] - Phi[idx - grid.nz]) -
+					C2 * (Phi[idx + grid.nz] - Phi[idx - 2 * grid.nz])) * grid.dyi);
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::w_projection_x4(
+	T* _RESTRICT W, const T* _RESTRICT const Winterm, const T* _RESTRICT const Phi,
+	const wstGrid3d< T >& grid, const T dt)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( W ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				W[idx] += dt * (Winterm[idx] -
+					(Phi[idx] - Phi[idx - 1]) * (T) 2.0 * grid.dzmi[k]);
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * buoyancy * //
+// ------------------------------------------------------------------------ //
+template< typename T >
+void nse::u_buoyancy_x4(
+	T* _RESTRICT Uinterm, const T* _RESTRICT const X,
+	const T c_gravity_x, const wstGrid3d< T >& grid)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 8.0;
+	const T c_gx = (T) 0.5 * c_gravity_x;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Uinterm[idx] += c_gx * (
+					C1 * (X[idx] + X[idx - grid.nyz]) -
+					C2 * (X[idx + grid.nyz] + X[idx - 2 * grid.nyz]));
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::v_buoyancy_x4(
+	T* _RESTRICT Vinterm, const T* _RESTRICT const X,
+	const T c_gravity_y, const wstGrid3d< T >& grid)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 8.0;
+	const T c_gy = (T) 0.5 * c_gravity_y;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Vinterm[idx] += c_gy * (
+					C1 * (X[idx] + X[idx - grid.nz]) -
+					C2 * (X[idx + grid.nz] + X[idx - 2 * grid.nz]));
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::w_buoyancy_x4(
+	T* _RESTRICT Winterm, const T* _RESTRICT const X,
+	const T c_gravity_z, const wstGrid3d< T >& grid)
+{
+	const T c_gz = (T)0.5 * c_gravity_z;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Winterm[idx] += c_gz *
+					// * linear interpolation -> //
+					//(T)2.0 * (X[idx] * grid.dz[k - 1] + X[idx - 1] * grid.dz[k]) * grid.dzmi[k];
+
+					// * averaging in computational space -> //
+					(X[idx] + X[idx - 1]);
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * coriolis * //
+// ------------------------------------------------------------------------ //
// Add Coriolis accelerations to the U-momentum intermediate field at U-nodes:
//   Uinterm += f_z * <V> - f_y * <W>
// <V>, <W> are interpolations of V and W to the U-node built from nested
// 4-point group averages with weights C1 = 9/8 (near) and C2 = 1/8 (far).
// NOTE(review): parameters U and c_coriolis_x are accepted but unused here --
// presumably kept for a uniform signature across u/v/w variants; confirm.
template< typename T >
void nse::u_coriolis_x4(
	T* _RESTRICT Uinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
	const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index reused across the k-loop (1D OpenMP variant)
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				// NOTE(review): in each C2 (far) group below, the last term
				// enters with '-' while the matching C1 (near) group sums all
				// four points with '+'. The asymmetry recurs in all three
				// coriolis variants and looks like a transcription slip in the
				// 4-point average -- confirm against the scheme derivation.
				Uinterm[idx] += c_coriolis_z * (
					C1 * (
					C1 * (V[idx] + V[idx - grid.nyz]
					+ V[idx + grid.nz] + V[idx - grid.nyz + grid.nz]) -
					C2 * (V[idx + grid.nyz] + V[idx - 2 * grid.nyz]
					+ V[idx + grid.nyz + grid.nz] - V[idx - 2 * grid.nyz + grid.nz])
					) -
					C2 * (
					C1 * (V[idx - grid.nz] + V[idx - grid.nyz - grid.nz]
					+ V[idx + 2 * grid.nz] + V[idx - grid.nyz + 2 * grid.nz]) -
					C2 * (V[idx + grid.nyz - grid.nz] + V[idx - 2 * grid.nyz - grid.nz]
					+ V[idx + grid.nyz + 2 * grid.nz] - V[idx - 2 * grid.nyz + 2 * grid.nz])
					))
					-
					c_coriolis_y * (
					C1 * (
					C1 * (W[idx] + W[idx - grid.nyz]
					+ W[idx + 1] + W[idx - grid.nyz + 1]) -
					C2 * (W[idx + grid.nyz] + W[idx - 2 * grid.nyz]
					+ W[idx + grid.nyz + 1] - W[idx - 2 * grid.nyz + 1])
					) -
					C2 * (
					C1 * (W[idx - 1] + W[idx - grid.nyz - 1]
					+ W[idx + 2] + W[idx - grid.nyz + 2]) -
					C2 * (W[idx + grid.nyz - 1] + W[idx - 2 * grid.nyz - 1]
					+ W[idx + grid.nyz + 2] - W[idx - 2 * grid.nyz + 2])
					));
			}
		}
	}
}
+
// Add Coriolis accelerations to the V-momentum intermediate field at V-nodes:
//   Vinterm -= f_z * <U> + f_x * <W>
// <U>, <W> are interpolations of U and W to the V-node built from nested
// 4-point group averages with weights C1 = 9/8 (near) and C2 = 1/8 (far).
// NOTE(review): parameters V and c_coriolis_y are accepted but unused here --
// presumably kept for a uniform signature across u/v/w variants; confirm.
template< typename T >
void nse::v_coriolis_x4(
	T* _RESTRICT Vinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
	const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index reused across the k-loop (1D OpenMP variant)
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				// NOTE(review): as in u_coriolis_x4, the last term of each C2
				// (far) group enters with '-' while the near groups sum all
				// four points with '+'; possible transcription slip -- confirm.
				Vinterm[idx] -= c_coriolis_z * (
					C1 * (
					C1 * (U[idx] + U[idx - grid.nz]
					+ U[idx + grid.nyz] + U[idx - grid.nz + grid.nyz]) -
					C2 * (U[idx + grid.nz] + U[idx - 2 * grid.nz]
					+ U[idx + grid.nz + grid.nyz] - U[idx - 2 * grid.nz + grid.nyz])
					) -
					C2 * (
					C1 * (U[idx - grid.nyz] + U[idx - grid.nz - grid.nyz]
					+ U[idx + 2 * grid.nyz] + U[idx - grid.nz + 2 * grid.nyz]) -
					C2 * (U[idx + grid.nz - grid.nyz] + U[idx - 2 * grid.nz - grid.nyz]
					+ U[idx + grid.nz + 2 * grid.nyz] - U[idx - 2 * grid.nz + 2 * grid.nyz])
					))
					-
					c_coriolis_x * (
					C1 * (
					C1 * (W[idx] + W[idx - grid.nz]
					+ W[idx + 1] + W[idx - grid.nz + 1]) -
					C2 * (W[idx + grid.nz] + W[idx - 2 * grid.nz]
					+ W[idx + grid.nz + 1] - W[idx - 2 * grid.nz + 1])
					) -
					C2 * (
					C1 * (W[idx - 1] + W[idx - grid.nz - 1]
					+ W[idx + 2] + W[idx - grid.nz + 2]) -
					C2 * (W[idx + grid.nz - 1] + W[idx - 2 * grid.nz - 1]
					+ W[idx + grid.nz + 2] - W[idx - 2 * grid.nz + 2])
					));
			}
		}
	}
}
+
// Add Coriolis accelerations to the W-momentum intermediate field at W-nodes:
//   Winterm += f_y * <U> - f_x * <V>
// <U>, <V> are interpolations of U and V to the W-node built from nested
// 4-point group averages with weights C1 = 9/8 (near) and C2 = 1/8 (far).
// NOTE(review): parameters W and c_coriolis_z are accepted but unused here --
// presumably kept for a uniform signature across u/v/w variants; confirm.
template< typename T >
void nse::w_coriolis_x4(
	T* _RESTRICT Winterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
	const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index reused across the k-loop (1D OpenMP variant)
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				// NOTE(review): as in u_coriolis_x4, the last term of each C2
				// (far) group enters with '-' while the near groups sum all
				// four points with '+'; possible transcription slip -- confirm.
				Winterm[idx] += c_coriolis_y * (
					C1 * (
					C1 * (U[idx] + U[idx - 1]
					+ U[idx + grid.nyz] + U[idx - 1 + grid.nyz]) -
					C2 * (U[idx + 1] + U[idx - 2]
					+ U[idx + 1 + grid.nyz] - U[idx - 2 + grid.nyz])
					) -
					C2 * (
					C1 * (U[idx - grid.nyz] + U[idx - 1 - grid.nyz]
					+ U[idx + 2 * grid.nyz] + U[idx - 1 + 2 * grid.nyz]) -
					C2 * (U[idx + 1 - grid.nyz] + U[idx - 2 - grid.nyz]
					+ U[idx + 1 + 2 * grid.nyz] - U[idx - 2 + 2 * grid.nyz])
					))
					-
					c_coriolis_x * (
					C1 * (
					C1 * (V[idx] + V[idx - 1]
					+ V[idx + grid.nz] + V[idx - 1 + grid.nz]) -
					C2 * (V[idx + 1] + V[idx - 2]
					+ V[idx + 1 + grid.nz] - V[idx - 2 + grid.nz])
					) -
					C2 * (
					C1 * (V[idx - grid.nz] + V[idx - 1 - grid.nz]
					+ V[idx + 2 * grid.nz] + V[idx - 1 + 2 * grid.nz]) -
					C2 * (V[idx + 1 - grid.nz] + V[idx - 2 - grid.nz]
					+ V[idx + 1 + 2 * grid.nz] - V[idx - 2 + 2 * grid.nz])
					));
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * kinetic energy * //
+// ------------------------------------------------------------------------ //
// Global kinetic energy over the interior domain, reduced across all MPI ranks.
// Per cell, squared velocity components are brought to the cell center:
// U^2 and V^2 via the 4-point combination with weights C1 = 9/8, C2 = 1/8;
// W^2 as the plain two-face sum. Each cell is weighted by its height dz[k];
// the uniform dx*dy area and the 0.25 factor are applied once at the end
// (presumably 1/2 from the two-face averaging times 1/2 from the KE
// definition -- TODO confirm normalization against the 2nd-order variant).
template< typename T >
T nse::kinetic_energy_x4(
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index reused across the k-loop (1D OpenMP variant)
#endif
	T ke_sum = (T)0;	// thread-reduced partial sum (OpenMP reduction below)

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) reduction( + : ke_sum ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) reduction( + : ke_sum )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
				ke_sum +=
					(
					C1 * (U[idx] * U[idx] + U[idx + grid.nyz] * U[idx + grid.nyz]) +
					C1 * (V[idx] * V[idx] + V[idx + grid.nz] * V[idx + grid.nz]) +
					(W[idx] * W[idx] + W[idx + 1] * W[idx + 1])
					-
					C2 * (U[idx - grid.nyz] * U[idx - grid.nyz] + U[idx + 2 * grid.nyz] * U[idx + 2 * grid.nyz]) -
					C2 * (V[idx - grid.nz] * V[idx - grid.nz] + V[idx + 2 * grid.nz] * V[idx + 2 * grid.nz])
					) * grid.dz[k];
			}
		}
	}

	// sum partial results over all ranks of the grid communicator
	mpi_allreduce(&ke_sum, MPI_SUM, grid.mpi_com.comm);
	return (T) 0.25 * ke_sum * grid.dx * grid.dy;
}
+// ------------------------------------------------------------------------ //
+
+// * vorticity * //
+// ------------------------------------------------------------------------ //
// Vorticity components (curl of velocity) on the staggered grid:
//   Vorticity_x = dW/dy - dV/dz
//   Vorticity_y = dU/dz - dW/dx
//   Vorticity_z = dV/dx - dU/dy
// Horizontal (x/y) derivatives use the 4th-order staggered stencil
// (C1 = 9/8, C2 = 1/24); vertical derivatives are 2nd-order on the
// stretched grid (2 * dzmi[k] is the inverse center-to-center spacing).
template< typename T >
void nse::vorticity_x4(T* _RESTRICT Vorticity_x, T* _RESTRICT Vorticity_y, T* _RESTRICT Vorticity_z,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index reused across the k-loop (1D OpenMP variant)
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) \
	shared(Vorticity_x, Vorticity_y, Vorticity_z) collapse(2)
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) \
	shared(Vorticity_x, Vorticity_y, Vorticity_z)
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				// w_x = dW/dy - dV/dz
				Vorticity_x[idx] =
					(
					C1 * (W[idx] - W[idx - grid.nz]) -
					C2 * (W[idx + grid.nz] - W[idx - 2 * grid.nz])
					) * grid.dyi -
					(V[idx] - V[idx - 1]) * (T)2.0 * grid.dzmi[k];

				// w_y = dU/dz - dW/dx
				Vorticity_y[idx] =
					(U[idx] - U[idx - 1]) * (T)2.0 * grid.dzmi[k] -
					(
					C1 * (W[idx] - W[idx - grid.nyz]) -
					C2 * (W[idx + grid.nyz] - W[idx - 2 * grid.nyz])
					) * grid.dxi;

				// w_z = dV/dx - dU/dy
				Vorticity_z[idx] =
					(
					C1 * (V[idx] - V[idx - grid.nyz]) -
					C2 * (V[idx + grid.nyz] - V[idx - 2 * grid.nyz])
					) * grid.dxi -
					(
					C1 * (U[idx] - U[idx - grid.nz]) -
					C2 * (U[idx + grid.nz] - U[idx - 2 * grid.nz])
					) * grid.dyi;
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * special field products * //
+// ------------------------------------------------------------------------ //
template< typename T >	// = W * (dU/dz) [node: UW]
// Advection-like product W * dU/dz evaluated at UW-nodes:
// W is 4th-order interpolated in x (weights C1 = 9/8, _3C2 = 1/8 = 3 * 1/24),
// dU/dz is a 2nd-order difference scaled by grid.dzih[k] (presumably the
// half inverse spacing, folding the 0.5 of the W average -- confirm).
// NOTE(review): for any node other than nodeUW the function silently leaves
// Xinterm untouched -- callers must pass the matching node type.
void nse::uw_advection_x4(T* _RESTRICT Xinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index reused across the k-loop (1D OpenMP variant)
#endif

	if (node == nodeUW) {	// 0.5 * (W[i(+1)jk] + W[i-1(-2)jk]) * (U[ijk] - U[ijk-1]) / dz
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					Xinterm[idx] =
						(C1 * (W[idx - grid.nyz] + W[idx]) -
						_3C2 * (W[idx - 2 * grid.nyz] + W[idx + grid.nyz])) *
						(U[idx] - U[idx - 1]) * grid.dzih[k];

					// approximation is based on ADV. scheme
				}
			}
		}
		return;
	}
}
+
template< typename T >	// = W * (dV/dz) [node: VW]
// Advection-like product W * dV/dz evaluated at VW-nodes:
// W is 4th-order interpolated in y (weights C1 = 9/8, _3C2 = 1/8 = 3 * 1/24),
// dV/dz is a 2nd-order difference scaled by grid.dzih[k] (presumably the
// half inverse spacing, folding the 0.5 of the W average -- confirm).
// NOTE(review): for any node other than nodeVW the function silently leaves
// Xinterm untouched -- callers must pass the matching node type.
void nse::vw_advection_x4(T* _RESTRICT Xinterm,
	const T* _RESTRICT const V, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index reused across the k-loop (1D OpenMP variant)
#endif

	if (node == nodeVW) {	// 0.5 * (W[ij(+1)k] + W[ij-1(-2)k]) * (V[ijk] - V[ijk-1]) / dz
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					Xinterm[idx] =
						(C1 * (W[idx - grid.nz] + W[idx]) -
						_3C2 * (W[idx - 2 * grid.nz] + W[idx + grid.nz])) *
						(V[idx] - V[idx - 1]) * grid.dzih[k];

					// approximation is based on ADV. scheme
				}
			}
		}
		return;
	}
}
+// ------------------------------------------------------------------------ //
+
+// * scalar-pressure gradient * //
+// ------------------------------------------------------------------------ //
+template< typename T >	// [ C*dP/dx ] [-> node: U]
+void nse::c_u_pressure_gradient_x4(T* _RESTRICT C_dPdx,
+	const T* _RESTRICT const X, const T* _RESTRICT const Pressure,
+	const wstGrid3d< T >& grid)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0,
+		_3C2 = (T) 1.0 / (T) 8.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( C_dPdx ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( C_dPdx )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				C_dPdx[idx] = (C1 * (X[idx] + X[idx - grid.nyz]) -
+					_3C2 * (X[idx + grid.nyz] + X[idx - 2 * grid.nyz])) *
+					((C1 * (Pressure[idx] - Pressure[idx - grid.nyz]) -
+					C2 * (Pressure[idx + grid.nyz] - Pressure[idx - 2 * grid.nyz])) * grid.dxih);
+			}
+		}
+	}
+}
+
+template< typename T >	// [ C*dP/dy ] [-> node: V]
+void nse::c_v_pressure_gradient_x4(T* _RESTRICT C_dPdy,
+	const T* _RESTRICT const X, const T* _RESTRICT const Pressure,
+	const wstGrid3d< T >& grid)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0,
+		_3C2 = (T) 1.0 / (T) 8.0;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( C_dPdy ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( C_dPdy )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				C_dPdy[idx] = (C1 * (X[idx] + X[idx - grid.nz]) -
+					_3C2 * (X[idx + grid.nz] + X[idx - 2 * grid.nz])) *
+					((C1 * (Pressure[idx] - Pressure[idx - grid.nz]) -
+					C2 * (Pressure[idx + grid.nz] - Pressure[idx - 2 * grid.nz])) * grid.dyih);
+			}
+		}
+	}
+}
+
+template< typename T >	// [ C*dP/dz ] [-> node: W]
+void nse::c_w_pressure_gradient_x4(T* _RESTRICT C_dPdz,
+	const T* _RESTRICT const X, const T* _RESTRICT const Pressure,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( C_dPdz ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( C_dPdz )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				C_dPdz[idx] = (X[idx] + X[idx - 1]) *
+					((Pressure[idx] - Pressure[idx - 1]) * grid.dzmi[k]);
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * pressure-strain tensor * //
+// ------------------------------------------------------------------------ //
+template< typename T >	// [ P*(dU/dx), P*(dV/dy), P*(dW/dz) ] [-> node: C]
+void nse::pressure_strain_diag_x4(T* _RESTRICT PU, T* _RESTRICT PV, T* _RESTRICT PW,
+	const T* _RESTRICT const Pressure,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const wstGrid3d< T >& grid)
+{
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0;
+
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	// Pressure[ijk] * U-Div[ijk]
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( PU, PV, PW ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( PU, PV, PW )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				PU[idx] = Pressure[idx] * (
+					(C1 * (U[idx + grid.nyz] - U[idx]) - C2 * (U[idx + 2 * grid.nyz] - U[idx - grid.nyz])) * grid.dxi);
+				PV[idx] = Pressure[idx] * (
+					(C1 * (V[idx + grid.nz] - V[idx]) - C2 * (V[idx + 2 * grid.nz] - V[idx - grid.nz])) * grid.dyi);
+				PW[idx] = Pressure[idx] * (
+					(W[idx + 1] - W[idx]) * grid.dzi[k]);
+			}
+		}
+	}
+}
+
template< typename T >	// = P * 2 * S[u,v] = P * (du/dy + dv/dx) [node: UV]
// Off-diagonal pressure-strain component at UV-nodes:
//   PS_UV = <P> * (du/dy + dv/dx)
// <P> is the pressure interpolated to the UV-node via nested 4-point group
// averages (C1 = 9/8 near, _3C2 = 1/8 far); derivatives use the 4th-order
// staggered stencil (C1, C2 = 1/24) with quarter-scaled inverse spacings
// (dxiq, dyiq), which fold the averaging normalization.
// Note the inclusive '<=' bounds: the last UV-face in -x and -y is included.
void nse::pressure_strain_uv_x4(T* _RESTRICT PS_UV,
	const T* _RESTRICT const Pressure,
	const T* _RESTRICT const U, const T* _RESTRICT const V,
	const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index reused across the k-loop (1D OpenMP variant)
#endif

	// -- including last node in [-x,-y] axis
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( PS_UV ) collapse( 2 )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( PS_UV )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				PS_UV[idx] = (
					C1 * (C1 * (
					Pressure[idx] + Pressure[idx - grid.nyz] +
					Pressure[idx - grid.nz] + Pressure[idx - grid.nyz - grid.nz]) -
					_3C2 * (
					Pressure[idx + grid.nyz] + Pressure[idx - 2 * grid.nyz] +
					Pressure[idx + grid.nyz - grid.nz] + Pressure[idx - 2 * grid.nyz - grid.nz])) -
					_3C2 * (C1 * (
					Pressure[idx + grid.nz] + Pressure[idx - grid.nyz + grid.nz] +
					Pressure[idx - 2 * grid.nz] + Pressure[idx - grid.nyz - 2 * grid.nz]) -
					_3C2 * (
					Pressure[idx + grid.nyz + grid.nz] + Pressure[idx - 2 * grid.nyz + grid.nz] +
					Pressure[idx + grid.nyz - 2 * grid.nz] + Pressure[idx - 2 * grid.nyz - 2 * grid.nz]))
					) *
					(
					// uv: p*du/dy //
					(C1 * (U[idx] - U[idx - grid.nz]) - C2 * (U[idx + grid.nz] - U[idx - 2 * grid.nz])) * grid.dyiq +
					// vu: p*dv/dx //
					(C1 * (V[idx] - V[idx - grid.nyz]) - C2 * (V[idx + grid.nyz] - V[idx - 2 * grid.nyz])) * grid.dxiq
					);
			}
		}
	}
}
+
template< typename T >	// = P * 2 * S[u,w] = P * (du/dz + dw/dx) [node: UW]
// Off-diagonal pressure-strain component at UW-nodes:
//   PS_UW = <P> * (du/dz + dw/dx)
// <P> is the pressure 4-point-averaged to the UW-node (C1 = 9/8 in x,
// plain pair in z); du/dz is 2nd-order (dzmih[k]), dw/dx is the 4th-order
// staggered stencil (C1, C2 = 1/24) with quarter-scaled dxiq.
// Note the inclusive '<=' bounds: the last UW-face in -x and -z is included.
void nse::pressure_strain_uw_x4(T* _RESTRICT PS_UW,
	const T* _RESTRICT const Pressure,
	const T* _RESTRICT const U, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index reused across the k-loop (1D OpenMP variant)
#endif

	// -- including last node in [-x,-z] axis
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( PS_UW ) collapse( 2 )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( PS_UW )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

				PS_UW[idx] = (
					C1 * (
					Pressure[idx] + Pressure[idx - grid.nyz] +
					Pressure[idx - 1] + Pressure[idx - grid.nyz - 1]) -
					_3C2 * (
					Pressure[idx + grid.nyz] + Pressure[idx - 2 * grid.nyz] +
					Pressure[idx + grid.nyz - 1] + Pressure[idx - 2 * grid.nyz - 1])
					) *
					(
					// uw: p*du/dz //
					(U[idx] - U[idx - 1]) * grid.dzmih[k] +
					// wu: p*dw/dx //
					(C1 * (W[idx] - W[idx - grid.nyz]) - C2 * (W[idx + grid.nyz] - W[idx - 2 * grid.nyz])) * grid.dxiq
					);
			}
		}
	}
}
+
template< typename T >	// = P * 2 * S[v,w] = P * (dv/dz + dw/dy) [node: VW]
// Off-diagonal pressure-strain component at VW-nodes:
//   PS_VW = <P> * (dv/dz + dw/dy)
// <P> is the pressure 4-point-averaged to the VW-node (C1 = 9/8 in y,
// plain pair in z); dv/dz is 2nd-order (dzmih[k]), dw/dy is the 4th-order
// staggered stencil (C1, C2 = 1/24) with quarter-scaled dyiq.
// Note the inclusive '<=' bounds: the last VW-face in -y and -z is included.
void nse::pressure_strain_vw_x4(T* _RESTRICT PS_VW,
	const T* _RESTRICT const Pressure,
	const T* _RESTRICT const V, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	const T C1 = (T) 9.0 / (T) 8.0,
		C2 = (T) 1.0 / (T) 24.0,
		_3C2 = (T) 1.0 / (T) 8.0;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// row-start index reused across the k-loop (1D OpenMP variant)
#endif

	// -- including last node in [-y,-z] axis
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( PS_VW ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( PS_VW )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

				PS_VW[idx] = (
					C1 * (
					Pressure[idx] + Pressure[idx - grid.nz] +
					Pressure[idx - 1] + Pressure[idx - grid.nz - 1]) -
					_3C2 * (
					Pressure[idx + grid.nz] + Pressure[idx - 2 * grid.nz] +
					Pressure[idx + grid.nz - 1] + Pressure[idx - 2 * grid.nz - 1])
					) *
					(
					// vw: p*dv/dz //
					(V[idx] - V[idx - 1]) * grid.dzmih[k] +
					// wv: p*dw/dy //
					(C1 * (W[idx] - W[idx - grid.nz]) - C2 * (W[idx + grid.nz] - W[idx - 2 * grid.nz])) * grid.dyiq
					);
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+
+// ------------------------------------------------------------------------ //
+// ------------------------------------------------------------------------ //
+
+
+// * explicit instantiation (float, double): advection * //
+template void nse::u_advection_div_x4(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::u_advection_div_x4(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+
+template void nse::v_advection_div_x4(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::v_advection_div_x4(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+
+template void nse::w_advection_div_x4(float* _RESTRICT Winterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::w_advection_div_x4(double* _RESTRICT Winterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+
+template void nse::u_advection_skew_x4(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::u_advection_skew_x4(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+
+template void nse::v_advection_skew_x4(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::v_advection_skew_x4(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+
+template void nse::w_advection_skew_x4(float* _RESTRICT Winterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::w_advection_skew_x4(double* _RESTRICT Winterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): scalar advection * //
+template void nse::c_advection_div_x4(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float* _RESTRICT const X,
+	const wstGrid3d< float >& grid);
+template void nse::c_advection_div_x4(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double* _RESTRICT const X,
+	const wstGrid3d< double >& grid);
+
+template void nse::c_advection_skew_x4(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float* _RESTRICT const X,
+	const wstGrid3d< float >& grid);
+template void nse::c_advection_skew_x4(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double* _RESTRICT const X,
+	const wstGrid3d< double >& grid);
+
+template void nse::c_advection_div_vline_x4(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float X0, const float XH,
+	const wstGrid3d< float >& grid);
+template void nse::c_advection_div_vline_x4(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double X0, const double XH,
+	const wstGrid3d< double >& grid);
+
+template void nse::c_advection_skew_vline_x4(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float X0, const float XH,
+	const wstGrid3d< float >& grid);
+template void nse::c_advection_skew_vline_x4(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double X0, const double XH,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): diffusion * //
+template void nse::u_add_diffusion_x4(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const U,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::u_add_diffusion_x4(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const U,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::v_add_diffusion_x4(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const V,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::v_add_diffusion_x4(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const V,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::w_add_diffusion_x4(float* _RESTRICT Winterm,
+	const float* _RESTRICT const W,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::w_add_diffusion_x4(double* _RESTRICT Winterm,
+	const double* _RESTRICT const W,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::c_add_diffusion_x4(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const X,
+	const float c_diffusivity, const wstGrid3d< float >& grid);
+template void nse::c_add_diffusion_x4(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const X,
+	const double c_diffusivity, const wstGrid3d< double >& grid);
+
+
+template void nse::u_set_diffusion_x4(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const U,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::u_set_diffusion_x4(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const U,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::v_set_diffusion_x4(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const V,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::v_set_diffusion_x4(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const V,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::w_set_diffusion_x4(float* _RESTRICT Winterm,
+	const float* _RESTRICT const W,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::w_set_diffusion_x4(double* _RESTRICT Winterm,
+	const double* _RESTRICT const W,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::c_set_diffusion_x4(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const X,
+	const float c_diffusivity, const wstGrid3d< float >& grid);
+template void nse::c_set_diffusion_x4(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const X,
+	const double c_diffusivity, const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): dissipation operator * //
+template void nse::u_dissipation_x4(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const U,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::u_dissipation_x4(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const U,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::v_dissipation_x4(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const V,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::v_dissipation_x4(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const V,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::w_dissipation_x4(float* _RESTRICT Winterm,
+	const float* _RESTRICT const W,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::w_dissipation_x4(double* _RESTRICT Winterm,
+	const double* _RESTRICT const W,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::c_dissipation_x4(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const X,
+	const float c_diffusivity, const wstGrid3d< float >& grid);
+template void nse::c_dissipation_x4(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const X,
+	const double c_diffusivity, const wstGrid3d< double >& grid);
+
+template void nse::uw_dissipation_x4(float* _RESTRICT UWinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const float* _RESTRICT const U_diffusion,
+	const float* _RESTRICT const W_diffusion,
+	const wstGrid3d< float >& grid);
+template void nse::uw_dissipation_x4(double* _RESTRICT UWinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const double* _RESTRICT const U_diffusion,
+	const double* _RESTRICT const W_diffusion,
+	const wstGrid3d< double >& grid);
+
+template void nse::vw_dissipation_x4(float* _RESTRICT VWinterm,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float* _RESTRICT const V_diffusion,
+	const float* _RESTRICT const W_diffusion,
+	const wstGrid3d< float >& grid);
+template void nse::vw_dissipation_x4(double* _RESTRICT VWinterm,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double* _RESTRICT const V_diffusion,
+	const double* _RESTRICT const W_diffusion,
+	const wstGrid3d< double >& grid);
+
+template void nse::uv_dissipation_x4(float* _RESTRICT UVinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V,
+	const float* _RESTRICT const U_diffusion,
+	const float* _RESTRICT const V_diffusion,
+	const wstGrid3d< float >& grid);
+template void nse::uv_dissipation_x4(double* _RESTRICT UVinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V,
+	const double* _RESTRICT const U_diffusion,
+	const double* _RESTRICT const V_diffusion,
+	const wstGrid3d< double >& grid);
+
+template void nse::cu_dissipation_x4(float* _RESTRICT CUinterm,
+	const float* _RESTRICT const X, const float* _RESTRICT const U,
+	const float* _RESTRICT const X_diffusion,
+	const float* _RESTRICT const U_diffusion,
+	const wstGrid3d< float >& grid);
+template void nse::cu_dissipation_x4(double* _RESTRICT CUinterm,
+	const double* _RESTRICT const X, const double* _RESTRICT const U,
+	const double* _RESTRICT const X_diffusion,
+	const double* _RESTRICT const U_diffusion,
+	const wstGrid3d< double >& grid);
+
+template void nse::cv_dissipation_x4(float* _RESTRICT CVinterm,
+	const float* _RESTRICT const X, const float* _RESTRICT const V,
+	const float* _RESTRICT const X_diffusion,
+	const float* _RESTRICT const V_diffusion,
+	const wstGrid3d< float >& grid);
+template void nse::cv_dissipation_x4(double* _RESTRICT CVinterm,
+	const double* _RESTRICT const X, const double* _RESTRICT const V,
+	const double* _RESTRICT const X_diffusion,
+	const double* _RESTRICT const V_diffusion,
+	const wstGrid3d< double >& grid);
+
+template void nse::cw_dissipation_x4(float* _RESTRICT CWinterm,
+	const float* _RESTRICT const X, const float* _RESTRICT const W,
+	const float* _RESTRICT const X_diffusion,
+	const float* _RESTRICT const W_diffusion,
+	const wstGrid3d< float >& grid);
+template void nse::cw_dissipation_x4(double* _RESTRICT CWinterm,
+	const double* _RESTRICT const X, const double* _RESTRICT const W,
+	const double* _RESTRICT const X_diffusion,
+	const double* _RESTRICT const W_diffusion,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): isotropic dissipation operator * //
+template void nse::u_iso_dissipation_x4(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const U,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::u_iso_dissipation_x4(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const U,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::v_iso_dissipation_x4(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const V,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::v_iso_dissipation_x4(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const V,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::w_iso_dissipation_x4(float* _RESTRICT Winterm,
+	const float* _RESTRICT const W,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::w_iso_dissipation_x4(double* _RESTRICT Winterm,
+	const double* _RESTRICT const W,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::c_iso_dissipation_x4(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const X,
+	const float c_diffusivity, const wstGrid3d< float >& grid);
+template void nse::c_iso_dissipation_x4(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const X,
+	const double c_diffusivity, const wstGrid3d< double >& grid);
+
+template void nse::uv_iso_dissipation_components_x4(
+	float* _RESTRICT UVinterm_x, float* _RESTRICT UVinterm_y, float* _RESTRICT UVinterm_z,
+	const float* _RESTRICT const U, const float* _RESTRICT const V,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::uv_iso_dissipation_components_x4(
+	double* _RESTRICT UVinterm_x, double* _RESTRICT UVinterm_y, double* _RESTRICT UVinterm_z,
+	const double* _RESTRICT const U, const double* _RESTRICT const V,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::uw_iso_dissipation_components_x4(
+	float* _RESTRICT UWinterm_x, float* _RESTRICT UWinterm_y, float* _RESTRICT UWinterm_z,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::uw_iso_dissipation_components_x4(
+	double* _RESTRICT UWinterm_x, double* _RESTRICT UWinterm_y, double* _RESTRICT UWinterm_z,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::vw_iso_dissipation_components_x4(
+	float* _RESTRICT VWinterm_x, float* _RESTRICT VWinterm_y, float* _RESTRICT VWinterm_z,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::vw_iso_dissipation_components_x4(
+	double* _RESTRICT VWinterm_x, double* _RESTRICT VWinterm_y, double* _RESTRICT VWinterm_z,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::uv_iso_dissipation_x4(float* _RESTRICT UVinterm,
+	const float* _RESTRICT const UVinterm_x,
+	const float* _RESTRICT const UVinterm_y,
+	const float* _RESTRICT const UVinterm_z,
+	const wstGrid3d< float >& grid);
+template void nse::uv_iso_dissipation_x4(double* _RESTRICT UVinterm,
+	const double* _RESTRICT const UVinterm_x,
+	const double* _RESTRICT const UVinterm_y,
+	const double* _RESTRICT const UVinterm_z,
+	const wstGrid3d< double >& grid);
+
+template void nse::uw_iso_dissipation_x4(float* _RESTRICT UWinterm,
+	const float* _RESTRICT const UWinterm_x,
+	const float* _RESTRICT const UWinterm_y,
+	const float* _RESTRICT const UWinterm_z,
+	const wstGrid3d< float >& grid);
+template void nse::uw_iso_dissipation_x4(double* _RESTRICT UWinterm,
+	const double* _RESTRICT const UWinterm_x,
+	const double* _RESTRICT const UWinterm_y,
+	const double* _RESTRICT const UWinterm_z,
+	const wstGrid3d< double >& grid);
+
+template void nse::vw_iso_dissipation_x4(float* _RESTRICT VWinterm,
+	const float* _RESTRICT const VWinterm_x,
+	const float* _RESTRICT const VWinterm_y,
+	const float* _RESTRICT const VWinterm_z,
+	const wstGrid3d< float >& grid);
+template void nse::vw_iso_dissipation_x4(double* _RESTRICT VWinterm,
+	const double* _RESTRICT const VWinterm_x,
+	const double* _RESTRICT const VWinterm_y,
+	const double* _RESTRICT const VWinterm_z,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): divergence * //
+template void nse::divergence_x4(float* _RESTRICT Div,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::divergence_x4(double* _RESTRICT Div,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): gradient * //
+template void nse::u_sub_gradient_x4(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const X,
+	const float c_gradient, const wstGrid3d< float >& grid);
+template void nse::u_sub_gradient_x4(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const X,
+	const double c_gradient, const wstGrid3d< double >& grid);
+
+template void nse::v_sub_gradient_x4(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const X,
+	const float c_gradient, const wstGrid3d< float >& grid);
+template void nse::v_sub_gradient_x4(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const X,
+	const double c_gradient, const wstGrid3d< double >& grid);
+
+template void nse::w_sub_gradient_x4(float* _RESTRICT Winterm,
+	const float* _RESTRICT const X,
+	const float c_gradient, const wstGrid3d< float >& grid);
+template void nse::w_sub_gradient_x4(double* _RESTRICT Winterm,
+	const double* _RESTRICT const X,
+	const double c_gradient, const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): poisson eq. rhs * //
+template void nse::poisson_rhs_x4(float* _RESTRICT Rhs,
+	const float* _RESTRICT const Div,
+	const float* _RESTRICT const Uinterm, const float* _RESTRICT const Vinterm, const float* _RESTRICT const Winterm,
+	const wstGrid3d< float >& grid, const float dt);
+template void nse::poisson_rhs_x4(double* _RESTRICT Rhs,
+	const double* _RESTRICT const Div,
+	const double* _RESTRICT const Uinterm, const double* _RESTRICT const Vinterm, const double* _RESTRICT const Winterm,
+	const wstGrid3d< double >& grid, const double dt);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): projection * //
+template void nse::u_projection_x4(float* _RESTRICT U,
+	const float* _RESTRICT const Uinterm, const float* _RESTRICT const Phi,
+	const wstGrid3d< float >& grid, const float dt);
+template void nse::u_projection_x4(double* _RESTRICT U,
+	const double* _RESTRICT const Uinterm, const double* _RESTRICT const Phi,
+	const wstGrid3d< double >& grid, const double dt);
+
+template void nse::v_projection_x4(float* _RESTRICT V,
+	const float* _RESTRICT const Vinterm, const float* _RESTRICT const Phi,
+	const wstGrid3d< float >& grid, const float dt);
+template void nse::v_projection_x4(double* _RESTRICT V,
+	const double* _RESTRICT const Vinterm, const double* _RESTRICT const Phi,
+	const wstGrid3d< double >& grid, const double dt);
+
+template void nse::w_projection_x4(float* _RESTRICT W,
+	const float* _RESTRICT const Winterm, const float* _RESTRICT const Phi,
+	const wstGrid3d< float >& grid, const float dt);
+template void nse::w_projection_x4(double* _RESTRICT W,
+	const double* _RESTRICT const Winterm, const double* _RESTRICT const Phi,
+	const wstGrid3d< double >& grid, const double dt);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): buoyancy * //
+template void nse::u_buoyancy_x4(float* _RESTRICT Uinterm, const float* _RESTRICT const X,
+	const float c_gravity_x, const wstGrid3d< float >& grid);
+template void nse::u_buoyancy_x4(double* _RESTRICT Uinterm, const double* _RESTRICT const X,
+	const double c_gravity_x, const wstGrid3d< double >& grid);
+
+template void nse::v_buoyancy_x4(float* _RESTRICT Vinterm, const float* _RESTRICT const X,
+	const float c_gravity_y, const wstGrid3d< float >& grid);
+template void nse::v_buoyancy_x4(double* _RESTRICT Vinterm, const double* _RESTRICT const X,
+	const double c_gravity_y, const wstGrid3d< double >& grid);
+
+template void nse::w_buoyancy_x4(float* _RESTRICT Winterm, const float* _RESTRICT const X,
+	const float c_gravity_z, const wstGrid3d< float >& grid);
+template void nse::w_buoyancy_x4(double* _RESTRICT Winterm, const double* _RESTRICT const X,
+	const double c_gravity_z, const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): coriolis * //
+template void nse::u_coriolis_x4(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float c_coriolis_x, const float c_coriolis_y, const float c_coriolis_z,
+	const wstGrid3d< float >& grid);
+template void nse::u_coriolis_x4(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double c_coriolis_x, const double c_coriolis_y, const double c_coriolis_z,
+	const wstGrid3d< double >& grid);
+
+template void nse::v_coriolis_x4(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float c_coriolis_x, const float c_coriolis_y, const float c_coriolis_z,
+	const wstGrid3d< float >& grid);
+template void nse::v_coriolis_x4(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double c_coriolis_x, const double c_coriolis_y, const double c_coriolis_z,
+	const wstGrid3d< double >& grid);
+
+template void nse::w_coriolis_x4(float* _RESTRICT Winterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float c_coriolis_x, const float c_coriolis_y, const float c_coriolis_z,
+	const wstGrid3d< float >& grid);
+template void nse::w_coriolis_x4(double* _RESTRICT Winterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double c_coriolis_x, const double c_coriolis_y, const double c_coriolis_z,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): kinetic energy * //
+template float nse::kinetic_energy_x4(
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template double nse::kinetic_energy_x4(
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): vorticity * //
+template void nse::vorticity_x4(
+	float* _RESTRICT Vorticity_x, float* _RESTRICT Vorticity_y, float* _RESTRICT Vorticity_z,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::vorticity_x4(
+	double* _RESTRICT Vorticity_x, double* _RESTRICT Vorticity_y, double* _RESTRICT Vorticity_z,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): special field products * //
+template void nse::uw_advection_x4(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const wstGrid3d< float >& grid);
+template void nse::uw_advection_x4(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const wstGrid3d< double >& grid);
+
+template void nse::vw_advection_x4(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const wstGrid3d< float >& grid);
+template void nse::vw_advection_x4(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiation (float, double): scalar-pressure gradient * //
+template void nse::c_u_pressure_gradient_x4(float* _RESTRICT C_dPdx,
+	const float* _RESTRICT const X, const float* _RESTRICT const Pressure,
+	const wstGrid3d< float >& grid);
+template void nse::c_u_pressure_gradient_x4(double* _RESTRICT C_dPdx,
+	const double* _RESTRICT const X, const double* _RESTRICT const Pressure,
+	const wstGrid3d< double >& grid);
+
+template void nse::c_v_pressure_gradient_x4(float* _RESTRICT C_dPdy,
+	const float* _RESTRICT const X, const float* _RESTRICT const Pressure,
+	const wstGrid3d< float >& grid);
+template void nse::c_v_pressure_gradient_x4(double* _RESTRICT C_dPdy,
+	const double* _RESTRICT const X, const double* _RESTRICT const Pressure,
+	const wstGrid3d< double >& grid);
+
+template void nse::c_w_pressure_gradient_x4(float* _RESTRICT C_dPdz,
+	const float* _RESTRICT const X, const float* _RESTRICT const Pressure,
+	const wstGrid3d< float >& grid);
+template void nse::c_w_pressure_gradient_x4(double* _RESTRICT C_dPdz,
+	const double* _RESTRICT const X, const double* _RESTRICT const Pressure,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------- //
+
+// * explicit instantiation (float, double): pressure-strain tensor * //
+template void nse::pressure_strain_diag_x4(float* _RESTRICT PU, float* _RESTRICT PV, float* _RESTRICT PW,
+	const float* _RESTRICT const Pressure,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::pressure_strain_diag_x4(double* _RESTRICT PU, double* _RESTRICT PV, double* _RESTRICT PW,
+	const double* _RESTRICT const Pressure,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+
+template void nse::pressure_strain_uv_x4(float* _RESTRICT PS_UV,
+	const float* _RESTRICT const Pressure,
+	const float* _RESTRICT const U, const float* _RESTRICT const V,
+	const wstGrid3d< float >& grid);
+template void nse::pressure_strain_uv_x4(double* _RESTRICT PS_UV,
+	const double* _RESTRICT const Pressure,
+	const double* _RESTRICT const U, const double* _RESTRICT const V,
+	const wstGrid3d< double >& grid);
+
+template void nse::pressure_strain_uw_x4(float* _RESTRICT PS_UW,
+	const float* _RESTRICT const Pressure,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::pressure_strain_uw_x4(double* _RESTRICT PS_UW,
+	const double* _RESTRICT const Pressure,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+
+template void nse::pressure_strain_vw_x4(float* _RESTRICT PS_VW,
+	const float* _RESTRICT const Pressure,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::pressure_strain_vw_x4(double* _RESTRICT PS_VW,
+	const double* _RESTRICT const Pressure,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
diff --git a/nse3d-x4.h b/nse3d-x4.h
new file mode 100644
index 0000000000000000000000000000000000000000..51d45c5a01a1735b20e3cdaf1cc623ffb29f4a61
--- /dev/null
+++ b/nse3d-x4.h
@@ -0,0 +1,382 @@
+#pragma once
+
+// [nse3d-x4.h(cpp)]: 3D Navier-Stokes module -X4
+//
+// -------------------------------------------------------------------------------------------- //
+
+
+#include "nse-sys.h"
+#include "wstgrid3d.h"
+
+
+namespace nse
+{
+	// Notation used in the section banners below:
+	//   [ := ]        the operator overwrites its output array;
+	//   [ += ]/[ -= ] the operator accumulates into / subtracts from it.
+	// The "_x4" suffix marks the higher-order variants of the base nse3d
+	// operators (presumably 4th-order stencils -- confirm against nse3d-x4.cpp).
+	// Node tags in comments ([U], [V], [W], [C], [UV], [UVW], ...) denote the
+	// staggered-grid location of the result; see wstgrid3d.h for the layout.
+
+	// * velocity advection [ := - ] * //
+	// Each routine writes minus the advection term of one momentum component.
+	template< typename T >
+	void u_advection_div_x4(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_advection_div_x4(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_advection_div_x4(T* _RESTRICT Winterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+
+	// Skew-symmetric form (energy-conserving alternative to the divergence form).
+	template< typename T >
+	void u_advection_skew_x4(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_advection_skew_x4(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_advection_skew_x4(T* _RESTRICT Winterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * scalar advection [ := - ] * //
+	template< typename T >
+	void c_advection_div_x4(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T* _RESTRICT const X,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void c_advection_skew_x4(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T* _RESTRICT const X,
+		const wstGrid3d< T >& grid);
+
+	// * correction for exclusion of vertical profile: + W * //
+	// X0/XH are the scalar values at the bottom/top of the domain; the linear
+	// background profile between them is advected analytically.
+	template< typename T >
+	void c_advection_div_vline_x4(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T X0, const T XH, const wstGrid3d< T >& grid);
+	template< typename T >
+	void c_advection_skew_vline_x4(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T X0, const T XH, const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * diffusion [ +=, := ] * //
+	// "add" variants accumulate (+=) into the output; "set" variants overwrite (:=).
+	template< typename T >
+	void u_add_diffusion_x4(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_add_diffusion_x4(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const V,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_add_diffusion_x4(T* _RESTRICT Winterm,
+		const T* _RESTRICT const W,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void c_add_diffusion_x4(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const X,
+		const T c_diffusivity, const wstGrid3d< T >& grid);
+
+
+	template< typename T >
+	void u_set_diffusion_x4(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_set_diffusion_x4(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const V,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_set_diffusion_x4(T* _RESTRICT Winterm,
+		const T* _RESTRICT const W,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void c_set_diffusion_x4(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const X,
+		const T c_diffusivity, const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * dissipation operator [ := Ui * nu * div(grad(Ui)) ] * //
+	template< typename T >
+	void u_dissipation_x4(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_dissipation_x4(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const V,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_dissipation_x4(T* _RESTRICT Winterm,
+		const T* _RESTRICT const W,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void c_dissipation_x4(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const X,
+		const T c_diffusivity, const wstGrid3d< T >& grid);
+
+	// Cross-component dissipation terms; the precomputed *_diffusion arrays are
+	// the diffusion operators of the respective fields (see *_set_diffusion_x4).
+	template< typename T >
+	void uw_dissipation_x4(T* _RESTRICT UWinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const T* _RESTRICT const U_diffusion,
+		const T* _RESTRICT const W_diffusion,
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void vw_dissipation_x4(T* _RESTRICT VWinterm,
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T* _RESTRICT const V_diffusion,
+		const T* _RESTRICT const W_diffusion,
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void uv_dissipation_x4(T* _RESTRICT UVinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V,
+		const T* _RESTRICT const U_diffusion,
+		const T* _RESTRICT const V_diffusion,
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void cu_dissipation_x4(T* _RESTRICT CUinterm,
+		const T* _RESTRICT const X, const T* _RESTRICT const U,
+		const T* _RESTRICT const X_diffusion,
+		const T* _RESTRICT const U_diffusion,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void cv_dissipation_x4(T* _RESTRICT CVinterm,
+		const T* _RESTRICT const X, const T* _RESTRICT const V,
+		const T* _RESTRICT const X_diffusion,
+		const T* _RESTRICT const V_diffusion,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void cw_dissipation_x4(T* _RESTRICT CWinterm,
+		const T* _RESTRICT const X, const T* _RESTRICT const W,
+		const T* _RESTRICT const X_diffusion,
+		const T* _RESTRICT const W_diffusion,
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * isotropic dissipation operator [ := grad(Ui)*grad(Ui) ] * //
+	template< typename T >
+	void u_iso_dissipation_x4(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_iso_dissipation_x4(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const V,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_iso_dissipation_x4(T* _RESTRICT Winterm,
+		const T* _RESTRICT const W,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void c_iso_dissipation_x4(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const X,
+		const T c_diffusivity, const wstGrid3d< T >& grid);
+
+	// := 2 * nu * grad(Ui)*grad(Uj)
+	// The *_components routines produce per-direction contributions at the
+	// noted staggered nodes; the matching *_iso_dissipation_x4 routines below
+	// then combine them at the result node.
+	template< typename T >
+	void uv_iso_dissipation_components_x4(
+		T* _RESTRICT UVinterm_x,			// node: [V]
+		T* _RESTRICT UVinterm_y,			// node: [U]
+		T* _RESTRICT UVinterm_z,			// node: [UVW]
+
+		const T* _RESTRICT const U, const T* _RESTRICT const V,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void uw_iso_dissipation_components_x4(
+		T* _RESTRICT UWinterm_x,			// node: [W]
+		T* _RESTRICT UWinterm_y,			// node: [UVW]
+		T* _RESTRICT UWinterm_z,			// node: [U]
+
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void vw_iso_dissipation_components_x4(
+		T* _RESTRICT VWinterm_x,			// node: [UVW]
+		T* _RESTRICT VWinterm_y,			// node: [W]
+		T* _RESTRICT VWinterm_z,			// node: [V]
+
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+
+
+	template< typename T >
+	void uv_iso_dissipation_x4(T* _RESTRICT UVinterm,	// node: [UV]
+		const T* _RESTRICT const UVinterm_x,			// node: [V]
+		const T* _RESTRICT const UVinterm_y,			// node: [U]
+		const T* _RESTRICT const UVinterm_z,			// node: [UVW]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void uw_iso_dissipation_x4(T* _RESTRICT UWinterm,	// node: [UW]
+		const T* _RESTRICT const UWinterm_x,			// node: [W]
+		const T* _RESTRICT const UWinterm_y,			// node: [UVW]
+		const T* _RESTRICT const UWinterm_z,			// node: [U]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void vw_iso_dissipation_x4(T* _RESTRICT VWinterm,	// node: [VW]
+		const T* _RESTRICT const VWinterm_x,			// node: [UVW]
+		const T* _RESTRICT const VWinterm_y,			// node: [W]
+		const T* _RESTRICT const VWinterm_z,			// node: [V]
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * divergence [ := + ] * //
+	template< typename T >
+	void divergence_x4(T* _RESTRICT Div,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * gradient [ -=, += ] * //
+	// Subtracts c_gradient * grad(X) component-wise from the velocity RHS.
+	template< typename T >
+	void u_sub_gradient_x4(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const X,
+		const T c_gradient, const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_sub_gradient_x4(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const X,
+		const T c_gradient, const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_sub_gradient_x4(T* _RESTRICT Winterm,
+		const T* _RESTRICT const X,
+		const T c_gradient, const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * poisson equation rhs [ := ] * //
+	template< typename T >
+	void poisson_rhs_x4(T* _RESTRICT Rhs,
+		const T* _RESTRICT const Div,
+		const T* _RESTRICT const Uinterm, const T* _RESTRICT const Vinterm, const T* _RESTRICT const Winterm,
+		const wstGrid3d< T >& grid, const T dt);
+	// -------------------------------------------------------------------- //
+
+	// * velocity projection * //
+	// Projects intermediate velocities onto the divergence-free space using
+	// the pressure-correction field Phi.
+	template< typename T >
+	void u_projection_x4(T* _RESTRICT U,
+		const T* _RESTRICT const Uinterm, const T* _RESTRICT const Phi,
+		const wstGrid3d< T >& grid, const T dt);
+	template< typename T >
+	void v_projection_x4(T* _RESTRICT V,
+		const T* _RESTRICT const Vinterm, const T* _RESTRICT const Phi,
+		const wstGrid3d< T >& grid, const T dt);
+	template< typename T >
+	void w_projection_x4(T* _RESTRICT W,
+		const T* _RESTRICT const Winterm, const T* _RESTRICT const Phi,
+		const wstGrid3d< T >& grid, const T dt);
+	// -------------------------------------------------------------------- //
+
+
+	// * buoyancy  [ -= ] * //
+	template< typename T >
+	void u_buoyancy_x4(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const X,
+		const T c_gravity_x, const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_buoyancy_x4(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const X,
+		const T c_gravity_y, const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_buoyancy_x4(T* _RESTRICT Winterm,
+		const T* _RESTRICT const X,
+		const T c_gravity_z, const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * coriolis [ -= ] * //
+	template< typename T >
+	void u_coriolis_x4(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_coriolis_x4(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_coriolis_x4(T* _RESTRICT Winterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * kinetic energy [ := + ] * //
+	// Returns a scalar reduction over the grid.
+	template< typename T >
+	T kinetic_energy_x4(
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * vorticity * //
+	template< typename T >	// -> [VW, UW, UV] nodes
+	void vorticity_x4(T* _RESTRICT Vorticity_x, T* _RESTRICT Vorticity_y, T* _RESTRICT Vorticity_z,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * special field products * //
+	template< typename T >	// = W * (dU/dz) [node: UW]
+	void uw_advection_x4(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const wstGrid3d< T >& grid);
+	template< typename T >	// = W * (dV/dz) [node: VW]
+	void vw_advection_x4(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const wstGrid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+
+
+	// * scalar-pressure gradient * //
+	template< typename T >	// [ C*dP/dx ] [-> node: U]
+	void c_u_pressure_gradient_x4(T* _RESTRICT C_dPdx,
+		const T* _RESTRICT const X, const T* _RESTRICT const Pressure,
+		const wstGrid3d< T >& grid);
+	template< typename T >	// [ C*dP/dy ] [-> node: V]
+	void c_v_pressure_gradient_x4(T* _RESTRICT C_dPdy,
+		const T* _RESTRICT const X, const T* _RESTRICT const Pressure,
+		const wstGrid3d< T >& grid);
+	template< typename T >	// [ C*dP/dz ] [-> node: W]
+	void c_w_pressure_gradient_x4(T* _RESTRICT C_dPdz,
+		const T* _RESTRICT const X, const T* _RESTRICT const Pressure,
+		const wstGrid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+
+
+	// * pressure-strain tensor * //
+	template< typename T >	// [ P*(dU/dx), P*(dV/dy), P*(dW/dz) ] [-> node: C]
+	void pressure_strain_diag_x4(T* _RESTRICT PU, T* _RESTRICT PV, T* _RESTRICT PW,
+		const T* _RESTRICT const Pressure,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+
+	template< typename T >	// = P * 2 * S[u,v] = P * (du/dy + dv/dx) [node: UV]
+	void pressure_strain_uv_x4(T* _RESTRICT PS_UV,
+		const T* _RESTRICT const Pressure, const T* _RESTRICT const U, const T* _RESTRICT const V,
+		const wstGrid3d< T >& grid);
+	template< typename T >	// = P * 2 * S[u,w] = P * (du/dz + dw/dx) [node: UW]
+	void pressure_strain_uw_x4(T* _RESTRICT PS_UW,
+		const T* _RESTRICT const Pressure, const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	template< typename T >	// = P * 2 * S[v,w] = P * (dv/dz + dw/dy) [node: VW]
+	void pressure_strain_vw_x4(T* _RESTRICT PS_VW,
+		const T* _RESTRICT const Pressure, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+}
diff --git a/nse3d.cpp b/nse3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d5b8653b950026de14aa4ac0ac0bb6b0af43cecd
--- /dev/null
+++ b/nse3d.cpp
@@ -0,0 +1,4262 @@
+#include "nse3d.h"
+
+#include <math.h>
+
+using namespace nse::nse_const3d;
+
+
+// * advection * //
+// ------------------------------------------------------------------------ //
+// := -d(U*Uj)/dxj : divergence (conservative) form of the U-momentum advection
+// term on the staggered grid; overwrites Uinterm at interior U-nodes only
+// (ghost layers gcx/gcy/gcz excluded). dxiq/dyiq/dziq[k] are quarter inverse
+// spacings (presumably 1/(4*dx) etc. -- confirm in wstgrid3d.h); the x-term
+// (a-c)*(a+c+2b) is algebraically ((a+b)^2-(b+c)^2), i.e. a flux difference.
+// The two loop layouts are alternative OpenMP strategies: with
+// USE_OPENMP_2D_CYCLE the (i,j) loops are collapsed; otherwise the j-row base
+// index shidx is carried incrementally to avoid recomputing idx.
+template< typename T >
+void nse::u_advection_div(	// := - [d(U*Uj)/dxj]
+	T* _RESTRICT Uinterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				// x: d(UU)/dx; y: d(UV)/dy with V averaged to the U-column;
+				// z: d(UW)/dz with W averaged to the U-column.
+				Uinterm[idx] = -(
+					(U[idx + grid.nyz] - U[idx - grid.nyz]) *
+					(U[idx + grid.nyz] + U[idx - grid.nyz] + U[idx] + U[idx]) * grid.dxiq
+					+
+					(
+					(U[idx] + U[idx + grid.nz]) * (V[idx + grid.nz] + V[idx - grid.nyz + grid.nz]) -
+					(U[idx] + U[idx - grid.nz]) * (V[idx] + V[idx - grid.nyz])
+					) * grid.dyiq
+					+
+					(
+					(U[idx] + U[idx + 1]) * (W[idx + 1] + W[idx - grid.nyz + 1]) -
+					(U[idx] + U[idx - 1]) * (W[idx] + W[idx - grid.nyz])
+					) * grid.dziq[k]);
+			}
+		}
+	}
+}
+
+// := -d(V*Uj)/dxj : divergence (conservative) form of the V-momentum advection
+// term; overwrites Vinterm at interior V-nodes. Mirrors u_advection_div with
+// the roles of the x- and y-directions exchanged. Loop scaffolding: see the
+// USE_OPENMP_2D_CYCLE note on u_advection_div.
+template< typename T >
+void nse::v_advection_div(	// := - [d(V*Uj)/dxj]
+	T* _RESTRICT Vinterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				// x: d(VU)/dx with U averaged to the V-column; y: d(VV)/dy;
+				// z: d(VW)/dz with W averaged to the V-column.
+				Vinterm[idx] = -(
+					(
+					(V[idx] + V[idx + grid.nyz]) * (U[idx + grid.nyz] + U[idx + grid.nyz - grid.nz]) -
+					(V[idx] + V[idx - grid.nyz]) * (U[idx] + U[idx - grid.nz])
+					) * grid.dxiq
+					+
+					(V[idx + grid.nz] - V[idx - grid.nz]) *
+					(V[idx + grid.nz] + V[idx - grid.nz] + V[idx] + V[idx]) * grid.dyiq
+					+
+					(
+					(V[idx] + V[idx + 1]) * (W[idx + 1] + W[idx - grid.nz + 1]) -
+					(V[idx] + V[idx - 1]) * (W[idx] + W[idx - grid.nz])
+					) * grid.dziq[k]);
+			}
+		}
+	}
+}
+
+// := -d(W*Uj)/dxj : divergence (conservative) form of the W-momentum advection
+// term; overwrites Winterm at interior W-nodes. The vertical derivative uses
+// grid.dzmih[k] (the W-node vertical metric, presumably the half-inverse of
+// the momentum-cell height -- confirm in wstgrid3d.h) instead of dziq[k],
+// since W lives on cell faces. Loop scaffolding: see u_advection_div.
+template< typename T >
+void nse::w_advection_div(	// := - [d(W*Uj)/dxj]
+	T* _RESTRICT Winterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				// x: d(WU)/dx; y: d(WV)/dy (U, V averaged to W-levels via
+				// the "- 1" vertical shift); z: d(WW)/dz.
+				Winterm[idx] = -(
+					(
+					(W[idx] + W[idx + grid.nyz]) * (U[idx + grid.nyz] + U[idx + grid.nyz - 1]) -
+					(W[idx] + W[idx - grid.nyz]) * (U[idx] + U[idx - 1])
+					) * grid.dxiq
+					+
+					(
+					(W[idx] + W[idx + grid.nz]) * (V[idx + grid.nz] + V[idx + grid.nz - 1]) -
+					(W[idx] + W[idx - grid.nz]) * (V[idx] + V[idx - 1])
+					) * grid.dyiq
+					+
+					(W[idx + 1] - W[idx - 1]) *
+					(W[idx + 1] + W[idx - 1] + W[idx] + W[idx]) * grid.dzmih[k]);
+			}
+		}
+	}
+}
+
+// := -d(U*Uj)/dxj, skew-symmetric form of the U-momentum advection term;
+// overwrites Uinterm at interior U-nodes. Differs from u_advection_div in
+// that neighbor velocities multiply the velocity sums directly (skew form,
+// typically used for discrete energy conservation). Loop scaffolding: see
+// the USE_OPENMP_2D_CYCLE note on u_advection_div.
+template< typename T >
+void nse::u_advection_skew(	// := - [d(U*Uj)/dxj]
+	T* _RESTRICT Uinterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Uinterm[idx] = -(
+					(
+					U[idx + grid.nyz] * (U[idx] + U[idx + grid.nyz]) -
+					U[idx - grid.nyz] * (U[idx] + U[idx - grid.nyz])
+					) * grid.dxiq
+					+
+					(
+					U[idx + grid.nz] * (V[idx + grid.nz] + V[idx - grid.nyz + grid.nz]) -
+					U[idx - grid.nz] * (V[idx] + V[idx - grid.nyz])
+					) * grid.dyiq
+					+
+					(
+					U[idx + 1] * (W[idx + 1] + W[idx - grid.nyz + 1]) -
+					U[idx - 1] * (W[idx] + W[idx - grid.nyz])
+					) * grid.dziq[k]);
+			}
+		}
+	}
+}
+
+// := -d(V*Uj)/dxj, skew-symmetric form of the V-momentum advection term;
+// overwrites Vinterm at interior V-nodes. Mirrors u_advection_skew with x/y
+// roles exchanged. Loop scaffolding: see u_advection_div.
+template< typename T >
+void nse::v_advection_skew(	// := - [d(V*Uj)/dxj]
+	T* _RESTRICT Vinterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Vinterm[idx] = -(
+					(
+					V[idx + grid.nyz] * (U[idx + grid.nyz] + U[idx + grid.nyz - grid.nz]) -
+					V[idx - grid.nyz] * (U[idx] + U[idx - grid.nz])
+					) * grid.dxiq
+					+
+					(
+					V[idx + grid.nz] * (V[idx] + V[idx + grid.nz]) -
+					V[idx - grid.nz] * (V[idx] + V[idx - grid.nz])
+					) * grid.dyiq
+					+
+					(
+					V[idx + 1] * (W[idx + 1] + W[idx - grid.nz + 1]) -
+					V[idx - 1] * (W[idx] + W[idx - grid.nz])
+					) * grid.dziq[k]);
+			}
+		}
+	}
+}
+
+// := -d(W*Uj)/dxj, skew-symmetric form of the W-momentum advection term;
+// overwrites Winterm at interior W-nodes. Uses the W-node vertical metric
+// grid.dzmih[k] like w_advection_div. Loop scaffolding: see u_advection_div.
+template< typename T >
+void nse::w_advection_skew(	// := - [d(W*Uj)/dxj]
+	T* _RESTRICT Winterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Winterm[idx] = -(
+					(
+					W[idx + grid.nyz] * (U[idx + grid.nyz] + U[idx + grid.nyz - 1]) -
+					W[idx - grid.nyz] * (U[idx] + U[idx - 1])
+					) * grid.dxiq
+					+
+					(
+					W[idx + grid.nz] * (V[idx + grid.nz] + V[idx + grid.nz - 1]) -
+					W[idx - grid.nz] * (V[idx] + V[idx - 1])
+					) * grid.dyiq
+					+
+					(
+					W[idx + 1] * (W[idx] + W[idx + 1]) -
+					W[idx - 1] * (W[idx] + W[idx - 1])
+					) * grid.dzmih[k]);
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * scalar advection * //
+// ------------------------------------------------------------------------ //
+// := -d(Uj*X)/dxj : divergence (conservative) form of scalar advection;
+// overwrites Xinterm at interior cell-center (C) nodes. Face fluxes are
+// formed as velocity times the two-point scalar average, scaled by the
+// half-inverse spacings dxih/dyih/dzih[k]. Loop scaffolding: see the
+// USE_OPENMP_2D_CYCLE note on u_advection_div.
+template< typename T >
+void nse::c_advection_div(
+	T* _RESTRICT Xinterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const T* _RESTRICT const X,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Xinterm[idx] = -(
+					(
+					U[idx + grid.nyz] * (X[idx + grid.nyz] + X[idx]) -
+					U[idx] * (X[idx] + X[idx - grid.nyz])
+					) * grid.dxih
+					+
+					(
+					V[idx + grid.nz] * (X[idx + grid.nz] + X[idx]) -
+					V[idx] * (X[idx] + X[idx - grid.nz])
+					) * grid.dyih
+					+
+					(
+					W[idx + 1] * (X[idx + 1] + X[idx]) -
+					W[idx] * (X[idx] + X[idx - 1])
+					) * grid.dzih[k]);
+			}
+		}
+	}
+}
+
+// := -Uj*dX/dxj : advective (non-conservative) form of scalar advection;
+// overwrites Xinterm at interior C-nodes. Each direction averages the two
+// one-sided velocity-weighted differences (hence the half-inverse spacings).
+// Loop scaffolding: see the USE_OPENMP_2D_CYCLE note on u_advection_div.
+template< typename T >
+void nse::c_advection_adv(
+	T* _RESTRICT Xinterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const T* _RESTRICT const X,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Xinterm[idx] = -(
+					(U[idx + grid.nyz] * (X[idx + grid.nyz] - X[idx]) + U[idx] * (X[idx] - X[idx - grid.nyz])) * grid.dxih +
+					(V[idx + grid.nz] * (X[idx + grid.nz] - X[idx]) + V[idx] * (X[idx] - X[idx - grid.nz])) * grid.dyih +
+					(W[idx + 1] * (X[idx + 1] - X[idx]) + W[idx] * (X[idx] - X[idx - 1])) * grid.dzih[k]);
+			}
+		}
+	}
+}
+
+// Skew-symmetric form of scalar advection (average of the divergence and
+// advective forms: the X[idx] contributions cancel, leaving neighbor-only
+// flux differences). Overwrites Xinterm at interior C-nodes. Loop
+// scaffolding: see the USE_OPENMP_2D_CYCLE note on u_advection_div.
+template< typename T >
+void nse::c_advection_skew(
+	T* _RESTRICT Xinterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const T* _RESTRICT const X,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Xinterm[idx] = -(
+					(U[idx + grid.nyz] * X[idx + grid.nyz] - U[idx] * X[idx - grid.nyz]) * grid.dxih +
+					(V[idx + grid.nz] * X[idx + grid.nz] - V[idx] * X[idx - grid.nz]) * grid.dyih +
+					(W[idx + 1] * X[idx + 1] - W[idx] * X[idx - 1]) * grid.dzih[k]);
+			}
+		}
+	}
+}
+
+// Divergence-form advection correction for a scalar whose linear vertical
+// background profile X(z) = X0 + Ch*z (Ch = (XH - X0)/domain height) has been
+// excluded from the prognostic field. Accumulates (-=) into Xinterm at
+// interior C-nodes: (i) the background value times the local velocity
+// divergence -- kept for discrete consistency even though div(u) ~ 0 -- and
+// (ii) the W*dX/dz term from the background gradient, with W interpolated to
+// the cell center on the nonuniform dz spacing.
+// Loop scaffolding: see the USE_OPENMP_2D_CYCLE note on u_advection_div.
+template< typename T >
+void nse::c_advection_div_vline(
+	T* _RESTRICT Xinterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const T X0, const T XH,
+	const wstGrid3d< T >& grid)
+{
+	const T Ch = (XH - X0) / grid.mpi_height;	// background vertical gradient
+	T div_k;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx, div_k ) shared( Xinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx, div_k ) shared( Xinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				// adding divergence operator for consistency
+				div_k = (X0 + Ch * grid.pz[k]) * (
+					(U[idx + grid.nyz] - U[idx]) * grid.dxi +
+					(V[idx + grid.nz] - V[idx]) * grid.dyi +
+					(W[idx + 1] - W[idx]) * grid.dzi[k]);
+
+				// (DIV-x4) difference in divergence only
+				Xinterm[idx] -= div_k + Ch * (
+					W[idx + 1] * (grid.dz[k] + grid.dz[k + 1]) +
+					W[idx] * (grid.dz[k] + grid.dz[k - 1])) * grid.dziq[k];
+			}
+		}
+	}
+}
+
+// Skew-symmetric-form counterpart of c_advection_div_vline: same background
+// profile correction, but the consistency divergence term carries the 0.5
+// factor of the skew form (skew = half of div + adv, and the adv form has no
+// divergence contribution). The W*dX/dz background term is identical to the
+// div-form version. Accumulates (-=) into Xinterm at interior C-nodes.
+// Loop scaffolding: see the USE_OPENMP_2D_CYCLE note on u_advection_div.
+template< typename T >
+void nse::c_advection_skew_vline(
+	T* _RESTRICT Xinterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const T X0, const T XH,
+	const wstGrid3d< T >& grid)
+{
+	const T Ch = (XH - X0) / grid.mpi_height;	// background vertical gradient
+	T div_k;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx, div_k ) shared( Xinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx, div_k ) shared( Xinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				// adding divergence operator for consistency
+				div_k = (T) 0.5 * (X0 + Ch * grid.pz[k]) * (
+					(U[idx + grid.nyz] - U[idx]) * grid.dxi +
+					(V[idx + grid.nz] - V[idx]) * grid.dyi +
+					(W[idx + 1] - W[idx]) * grid.dzi[k]);
+
+				// (SKEW-x4) difference in divergence only
+				Xinterm[idx] -= div_k + Ch * (
+					W[idx + 1] * (grid.dz[k] + grid.dz[k + 1]) +
+					W[idx] * (grid.dz[k] + grid.dz[k - 1])) * grid.dziq[k];
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * diffusion * //
+// ------------------------------------------------------------------------ //
+// += nu * laplace(U) : accumulates the viscous diffusion of U into Uinterm at
+// interior U-nodes. Horizontal second derivatives use uniform dx2i/dy2i
+// (presumably 1/dx^2, 1/dy^2 -- confirm in wstgrid3d.h); the vertical term
+// uses the nonuniform coefficients dzp2i[k]/dzm2i[k] for the upper/lower
+// one-sided gradients. Loop scaffolding: see the USE_OPENMP_2D_CYCLE note
+// on u_advection_div.
+template< typename T >
+void nse::u_add_diffusion(
+	T* _RESTRICT Uinterm,
+	const T* _RESTRICT const U,
+	const T c_viscosity, const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Uinterm[idx] += c_viscosity * (
+					(U[idx + grid.nyz] - U[idx] - U[idx] + U[idx - grid.nyz]) * grid.dx2i +
+					(U[idx + grid.nz] - U[idx] - U[idx] + U[idx - grid.nz]) * grid.dy2i +
+
+					((U[idx + 1] - U[idx]) * grid.dzp2i[k]
+					- (U[idx] - U[idx - 1]) * grid.dzm2i[k]));
+			}
+		}
+	}
+}
+
// Accumulates (+=) c_viscosity * Laplacian(V) into Vinterm over interior
// cells; stencil and loop layouts are identical to u_add_diffusion, applied
// to the V component (same cell-centered z metrics dzp2i/dzm2i).
template< typename T >
void nse::v_add_diffusion(
	T* _RESTRICT Vinterm,
	const T* _RESTRICT const V,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vinterm[idx] += c_viscosity * (
					(V[idx + grid.nyz] - V[idx] - V[idx] + V[idx - grid.nyz]) * grid.dx2i +
					(V[idx + grid.nz] - V[idx] - V[idx] + V[idx - grid.nz]) * grid.dy2i +

					((V[idx + 1] - V[idx]) * grid.dzp2i[k]
					- (V[idx] - V[idx - 1]) * grid.dzm2i[k]));
			}
		}
	}
}
+
// Accumulates (+=) c_viscosity * Laplacian(W) into Winterm over interior cells.
// NOTE(review): the z metrics are shifted relative to the u/v kernels
// (dzm2i[k] on the upper difference, dzp2i[k-1] on the lower) — presumably
// because W is staggered in z; confirm against the grid metric definitions.
template< typename T >
void nse::w_add_diffusion(
	T* _RESTRICT Winterm,
	const T* _RESTRICT const W,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Winterm[idx] += c_viscosity * (
					(W[idx + grid.nyz] - W[idx] - W[idx] + W[idx - grid.nyz]) * grid.dx2i +
					(W[idx + grid.nz] - W[idx] - W[idx] + W[idx - grid.nz]) * grid.dy2i +

					((W[idx + 1] - W[idx]) * grid.dzm2i[k]
					- (W[idx] - W[idx - 1]) * grid.dzp2i[k - 1]));
			}
		}
	}
}
+
// Accumulates (+=) c_diffusivity * Laplacian(X) into Xinterm for a
// cell-centered scalar field X; same stencil/metrics and OpenMP layouts
// as u_add_diffusion.
template< typename T >
void nse::c_add_diffusion(
	T* _RESTRICT Xinterm,
	const T* _RESTRICT const X,
	const T c_diffusivity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Xinterm[idx] += c_diffusivity * (
					(X[idx + grid.nyz] - X[idx] - X[idx] + X[idx - grid.nyz]) * grid.dx2i +
					(X[idx + grid.nz] - X[idx] - X[idx] + X[idx - grid.nz]) * grid.dy2i +

					((X[idx + 1] - X[idx]) * grid.dzp2i[k]
					- (X[idx] - X[idx - 1]) * grid.dzm2i[k]));
			}
		}
	}
}
+
+
// Overwrites (=) Uinterm with c_viscosity * Laplacian(U) over interior cells.
// Identical stencil to u_add_diffusion; only the assignment differs
// (set instead of accumulate). Ghost layers of Uinterm are left untouched.
template< typename T >
void nse::u_set_diffusion(
	T* _RESTRICT Uinterm,
	const T* _RESTRICT const U,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Uinterm[idx] = c_viscosity * (
					(U[idx + grid.nyz] - U[idx] - U[idx] + U[idx - grid.nyz]) * grid.dx2i +
					(U[idx + grid.nz] - U[idx] - U[idx] + U[idx - grid.nz]) * grid.dy2i +

					((U[idx + 1] - U[idx]) * grid.dzp2i[k]
					- (U[idx] - U[idx - 1]) * grid.dzm2i[k]));
			}
		}
	}
}
+
// Overwrites (=) Vinterm with c_viscosity * Laplacian(V) over interior cells.
// Set-variant of v_add_diffusion; same stencil and OpenMP loop layouts.
template< typename T >
void nse::v_set_diffusion(
	T* _RESTRICT Vinterm,
	const T* _RESTRICT const V,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vinterm[idx] = c_viscosity * (
					(V[idx + grid.nyz] - V[idx] - V[idx] + V[idx - grid.nyz]) * grid.dx2i +
					(V[idx + grid.nz] - V[idx] - V[idx] + V[idx - grid.nz]) * grid.dy2i +

					((V[idx + 1] - V[idx]) * grid.dzp2i[k]
					- (V[idx] - V[idx - 1]) * grid.dzm2i[k]));
			}
		}
	}
}
+
// Overwrites (=) Winterm with c_viscosity * Laplacian(W) over interior cells.
// Set-variant of w_add_diffusion; z metrics are shifted (dzm2i[k] /
// dzp2i[k-1]) exactly as in w_add_diffusion — presumably because W is
// staggered in z; confirm against the grid metric definitions.
template< typename T >
void nse::w_set_diffusion(
	T* _RESTRICT Winterm,
	const T* _RESTRICT const W,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Winterm[idx] = c_viscosity * (
					(W[idx + grid.nyz] - W[idx] - W[idx] + W[idx - grid.nyz]) * grid.dx2i +
					(W[idx + grid.nz] - W[idx] - W[idx] + W[idx - grid.nz]) * grid.dy2i +

					((W[idx + 1] - W[idx]) * grid.dzm2i[k]
					- (W[idx] - W[idx - 1]) * grid.dzp2i[k - 1]));
			}
		}
	}
}
+
// Overwrites (=) Xinterm with c_diffusivity * Laplacian(X) for a
// cell-centered scalar; set-variant of c_add_diffusion.
template< typename T >
void nse::c_set_diffusion(
	T* _RESTRICT Xinterm,
	const T* _RESTRICT const X,
	const T c_diffusivity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Xinterm[idx] = c_diffusivity * (
					(X[idx + grid.nyz] - X[idx] - X[idx] + X[idx - grid.nyz]) * grid.dx2i +
					(X[idx + grid.nz] - X[idx] - X[idx] + X[idx - grid.nz]) * grid.dy2i +

					((X[idx + 1] - X[idx]) * grid.dzp2i[k]
					- (X[idx] - X[idx - 1]) * grid.dzm2i[k]));
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * dissipation operator [ := ] * //
+// ------------------------------------------------------------------------ //
// Overwrites (=) Uinterm with c_viscosity * U * Laplacian(U): the pointwise
// product of the field with its discrete Laplacian (dissipation-operator
// form, as the section header says: [ := ]). Interior cells only.
template< typename T >
void nse::u_dissipation(
	T* _RESTRICT Uinterm,
	const T* _RESTRICT const U,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Uinterm[idx] = c_viscosity * (
					U[idx] * (
					(U[idx + grid.nyz] - U[idx] - U[idx] + U[idx - grid.nyz]) * grid.dx2i +
					(U[idx + grid.nz] - U[idx] - U[idx] + U[idx - grid.nz]) * grid.dy2i +

					((U[idx + 1] - U[idx]) * grid.dzp2i[k]
					- (U[idx] - U[idx - 1]) * grid.dzm2i[k])));
			}
		}
	}
}
+
// Overwrites (=) Vinterm with c_viscosity * V * Laplacian(V); same form and
// loop layouts as u_dissipation, applied to the V component.
template< typename T >
void nse::v_dissipation(
	T* _RESTRICT Vinterm,
	const T* _RESTRICT const V,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vinterm[idx] = c_viscosity * (
					V[idx] * (
					(V[idx + grid.nyz] - V[idx] - V[idx] + V[idx - grid.nyz]) * grid.dx2i +
					(V[idx + grid.nz] - V[idx] - V[idx] + V[idx - grid.nz]) * grid.dy2i +

					((V[idx + 1] - V[idx]) * grid.dzp2i[k]
					- (V[idx] - V[idx - 1]) * grid.dzm2i[k])));
			}
		}
	}
}
+
// Overwrites (=) Winterm with c_viscosity * W * Laplacian(W); z metrics are
// shifted (dzm2i[k] / dzp2i[k-1]) consistently with the other w_* kernels —
// presumably because W is staggered in z; confirm against grid definitions.
template< typename T >
void nse::w_dissipation(
	T* _RESTRICT Winterm,
	const T* _RESTRICT const W,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Winterm[idx] = c_viscosity * (
					W[idx] * (
					(W[idx + grid.nyz] - W[idx] - W[idx] + W[idx - grid.nyz]) * grid.dx2i +
					(W[idx + grid.nz] - W[idx] - W[idx] + W[idx - grid.nz]) * grid.dy2i +

					((W[idx + 1] - W[idx]) * grid.dzm2i[k]
					- (W[idx] - W[idx - 1]) * grid.dzp2i[k - 1])));
			}
		}
	}
}
+
// Overwrites (=) Xinterm with c_diffusivity * X * Laplacian(X) for a
// cell-centered scalar; scalar counterpart of u_dissipation.
template< typename T >
void nse::c_dissipation(
	T* _RESTRICT Xinterm, const T* _RESTRICT const X,
	const T c_diffusivity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Xinterm[idx] = c_diffusivity * (
					X[idx] * (
					(X[idx + grid.nyz] - X[idx] - X[idx] + X[idx - grid.nyz]) * grid.dx2i +
						(X[idx + grid.nz] - X[idx] - X[idx] + X[idx - grid.nz]) * grid.dy2i +

						((X[idx + 1] - X[idx]) * grid.dzp2i[k]
							- (X[idx] - X[idx - 1]) * grid.dzm2i[k])));
			}
		}
	}
}
+
+
+template< typename T >
+void nse::uw_dissipation(T* _RESTRICT UWinterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const W,
+	const T* _RESTRICT const U_diffusion,
+	const T* _RESTRICT const W_diffusion,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( UWinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UWinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				UWinterm[idx] = (T)0.25 * (
+					(W[idx] + W[idx - grid.nyz]) * (U_diffusion[idx] + U_diffusion[idx - 1]) +
+					(U[idx] + U[idx - 1]) * (W_diffusion[idx] + W_diffusion[idx - grid.nyz]));
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::vw_dissipation(T* _RESTRICT VWinterm,
+	const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const T* _RESTRICT const V_diffusion,
+	const T* _RESTRICT const W_diffusion,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( VWinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VWinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				VWinterm[idx] = (T)0.25 * (
+					(W[idx] + W[idx - grid.nz]) * (V_diffusion[idx] + V_diffusion[idx - 1]) +
+					(V[idx] + V[idx - 1]) * (W_diffusion[idx] + W_diffusion[idx - grid.nz]));
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::uv_dissipation(T* _RESTRICT UVinterm,
+	const T* _RESTRICT const U, const T* _RESTRICT const V,
+	const T* _RESTRICT const U_diffusion,
+	const T* _RESTRICT const V_diffusion,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( UVinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UVinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				UVinterm[idx] = (T)0.25 * (
+					(U[idx] + U[idx - grid.nz]) * (V_diffusion[idx] + V_diffusion[idx - grid.nyz]) +
+					(V[idx] + V[idx - grid.nyz]) * (U_diffusion[idx] + U_diffusion[idx - grid.nz]));
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::cu_dissipation(T* _RESTRICT CUinterm,
+	const T* _RESTRICT const X, const T* _RESTRICT const U,
+	const T* _RESTRICT const X_diffusion,
+	const T* _RESTRICT const U_diffusion,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CUinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CUinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				CUinterm[idx] = (T)0.5 * (
+					(X[idx] + X[idx - grid.nyz]) * U_diffusion[idx] +
+					U[idx] * (X_diffusion[idx] + X_diffusion[idx - grid.nyz]));
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::cv_dissipation(T* _RESTRICT CVinterm,
+	const T* _RESTRICT const X, const T* _RESTRICT const V,
+	const T* _RESTRICT const X_diffusion,
+	const T* _RESTRICT const V_diffusion,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CVinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CVinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				CVinterm[idx] = (T)0.5 * (
+					(X[idx] + X[idx - grid.nz]) * V_diffusion[idx] +
+					V[idx] * (X_diffusion[idx] + X_diffusion[idx - grid.nz]));
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::cw_dissipation(T* _RESTRICT CWinterm,
+	const T* _RESTRICT const X, const T* _RESTRICT const W,
+	const T* _RESTRICT const X_diffusion,
+	const T* _RESTRICT const W_diffusion,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CWinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CWinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				CWinterm[idx] = (T)0.5 * (
+					(X[idx] + X[idx - 1]) * W_diffusion[idx] +
+					W[idx] * (X_diffusion[idx] + X_diffusion[idx - 1]));
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * isotropic dissipation operator [ := ] * //
+// ------------------------------------------------------------------------ //
// Overwrites (=) Uinterm with c_viscosity times the sum of squared one-sided
// differences of U in each direction, each direction averaged over its two
// faces. dx2ih/dy2ih and the explicit (T)0.5 factors on the z terms supply
// the face averaging (dx2ih is presumably the halved x metric — confirm
// against the grid definition). Interior cells only.
template< typename T >
void nse::u_iso_dissipation(
	T* _RESTRICT Uinterm,
	const T* _RESTRICT const U,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Uinterm[idx] = c_viscosity * (
					(
					(U[idx + grid.nyz] - U[idx]) * (U[idx + grid.nyz] - U[idx]) +
					(U[idx] - U[idx - grid.nyz]) * (U[idx] - U[idx - grid.nyz])
					) * grid.dx2ih +
					(
					(U[idx + grid.nz] - U[idx]) * (U[idx + grid.nz] - U[idx]) +
					(U[idx] - U[idx - grid.nz]) * (U[idx] - U[idx - grid.nz])
					) * grid.dy2ih +

					(T)0.5*(U[idx + 1] - U[idx]) * (U[idx + 1] - U[idx]) * grid.dzp2i[k] +
					(T)0.5*(U[idx] - U[idx - 1]) * (U[idx] - U[idx - 1]) * grid.dzm2i[k]
					);
			}
		}
	}
}
+
// Overwrites (=) Vinterm with the isotropic dissipation estimate for V:
// face-averaged squared one-sided differences in each direction, scaled by
// c_viscosity. Same structure and metrics as u_iso_dissipation.
template< typename T >
void nse::v_iso_dissipation(
	T* _RESTRICT Vinterm,
	const T* _RESTRICT const V,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vinterm[idx] = c_viscosity * (
					(
					(V[idx + grid.nyz] - V[idx]) * (V[idx + grid.nyz] - V[idx]) +
					(V[idx] - V[idx - grid.nyz]) * (V[idx] - V[idx - grid.nyz])
					) * grid.dx2ih +
					(
					(V[idx + grid.nz] - V[idx]) * (V[idx + grid.nz] - V[idx]) +
					(V[idx] - V[idx - grid.nz]) * (V[idx] - V[idx - grid.nz])
					) * grid.dy2ih +

					(T)0.5*(V[idx + 1] - V[idx]) * (V[idx + 1] - V[idx]) * grid.dzp2i[k] +
					(T)0.5*(V[idx] - V[idx - 1]) * (V[idx] - V[idx - 1]) * grid.dzm2i[k]
					);
			}
		}
	}
}
+
// Overwrites (=) Winterm with the isotropic dissipation estimate for W.
// As in the other w_* kernels, the z metrics are shifted (dzm2i[k] on the
// upper difference, dzp2i[k-1] on the lower) — presumably because W is
// staggered in z; confirm against the grid metric definitions.
template< typename T >
void nse::w_iso_dissipation(
	T* _RESTRICT Winterm,
	const T* _RESTRICT const W,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Winterm[idx] = c_viscosity * (
					(
					(W[idx + grid.nyz] - W[idx]) * (W[idx + grid.nyz] - W[idx]) +
					(W[idx] - W[idx - grid.nyz]) * (W[idx] - W[idx - grid.nyz])
					) * grid.dx2ih +
					(
					(W[idx + grid.nz] - W[idx]) * (W[idx + grid.nz] - W[idx]) +
					(W[idx] - W[idx - grid.nz]) * (W[idx] - W[idx - grid.nz])
					) * grid.dy2ih +

					(T)0.5*(W[idx + 1] - W[idx]) * (W[idx + 1] - W[idx]) * grid.dzm2i[k] +
					(T)0.5*(W[idx] - W[idx - 1]) * (W[idx] - W[idx - 1]) * grid.dzp2i[k - 1]
					);
			}
		}
	}
}
+
// Overwrites (=) Xinterm with the isotropic dissipation estimate for a
// cell-centered scalar X, scaled by c_diffusivity; same structure as
// u_iso_dissipation.
template< typename T >
void nse::c_iso_dissipation(
	T* _RESTRICT Xinterm,
	const T* _RESTRICT const X,
	const T c_diffusivity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Xinterm[idx] = c_diffusivity * (
					(
					(X[idx + grid.nyz] - X[idx]) * (X[idx + grid.nyz] - X[idx]) +
					(X[idx] - X[idx - grid.nyz]) * (X[idx] - X[idx - grid.nyz])
					) * grid.dx2ih +
					(
					(X[idx + grid.nz] - X[idx]) * (X[idx + grid.nz] - X[idx]) +
					(X[idx] - X[idx - grid.nz]) * (X[idx] - X[idx - grid.nz])
					) * grid.dy2ih +

					(T)0.5*(X[idx + 1] - X[idx]) * (X[idx + 1] - X[idx]) * grid.dzp2i[k] +
					(T)0.5*(X[idx] - X[idx - 1]) * (X[idx] - X[idx - 1]) * grid.dzm2i[k]
					);
			}
		}
	}
}
+
+
// Computes the three directional contributions to the (U,V) isotropic
// dissipation at the staggered nodes annotated per output below; these are
// later averaged back to the UV node by uv_iso_dissipation.
// NOTE(review): the k loop bound uses `<=` (one extra z layer) unlike the
// sibling uw/vw component kernels; this appears intentional because
// uv_iso_dissipation reads UVinterm_z[idx + 1] — confirm before changing.
template< typename T >
void nse::uv_iso_dissipation_components(
	T* _RESTRICT UVinterm_x,			// node: [V]
	T* _RESTRICT UVinterm_y,			// node: [U]
	T* _RESTRICT UVinterm_z,			// node: [UVW]

	const T* _RESTRICT const U, const T* _RESTRICT const V,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UVinterm_x, UVinterm_y, UVinterm_z ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UVinterm_x, UVinterm_y, UVinterm_z )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

				// x-contribution: products of two-point-summed x differences of U and V
				UVinterm_x[idx] = (T)2.0 * c_viscosity * (
					((U[idx + grid.nyz] - U[idx]) + (U[idx + grid.nyz - grid.nz] - U[idx - grid.nz])) *
					((V[idx + grid.nyz] - V[idx]) + (V[idx] - V[idx - grid.nyz])) * grid.dx2iq);

				// y-contribution
				UVinterm_y[idx] = (T)2.0 * c_viscosity * (
					((U[idx + grid.nz] - U[idx]) + (U[idx] - U[idx - grid.nz])) *
					((V[idx + grid.nz] - V[idx]) + (V[idx - grid.nyz + grid.nz] - V[idx - grid.nyz])) * grid.dy2iq);

				// z-contribution, with the k-dependent lower-difference metric squared
				UVinterm_z[idx] = (T)2.0 * c_viscosity * (
					((U[idx] - U[idx - 1]) + (U[idx - grid.nz] - U[idx - grid.nz - 1])) *
					((V[idx] - V[idx - 1]) + (V[idx - grid.nyz] - V[idx - grid.nyz - 1]))
					) * grid.dzmi[k] * grid.dzmi[k];
			}
		}
	}
}
+
// Computes the three directional contributions to the (U,W) isotropic
// dissipation at the staggered nodes annotated per output below; consumed
// by uw_iso_dissipation, which averages them back to the UW node.
template< typename T >
void nse::uw_iso_dissipation_components(
	T* _RESTRICT UWinterm_x,			// node: [W]
	T* _RESTRICT UWinterm_y,			// node: [UVW]
	T* _RESTRICT UWinterm_z,			// node: [U]

	const T* _RESTRICT const U, const T* _RESTRICT const W,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UWinterm_x, UWinterm_y, UWinterm_z ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UWinterm_x, UWinterm_y, UWinterm_z )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				UWinterm_x[idx] = (T)2.0 * c_viscosity * (
					((U[idx + grid.nyz] - U[idx]) + (U[idx + grid.nyz - 1] - U[idx - 1])) *
					((W[idx + grid.nyz] - W[idx]) + (W[idx] - W[idx - grid.nyz])) * grid.dx2iq);

				UWinterm_y[idx] = (T)2.0 * c_viscosity * (
					((U[idx] - U[idx - grid.nz]) + (U[idx - 1] - U[idx - grid.nz - 1])) *
					((W[idx] - W[idx - grid.nz]) + (W[idx - grid.nyz] - W[idx - grid.nyz - grid.nz])) * grid.dy2iq);

				// z-contribution mixes one-sided differences of U weighted by
				// dzpi/dzmi with a two-point sum of W differences, scaled by dzih
				UWinterm_z[idx] = (T)2.0 * c_viscosity * (
					((U[idx + 1] - U[idx]) * grid.dzpi[k] + (U[idx] - U[idx - 1]) * grid.dzmi[k]) *
					((W[idx + 1] - W[idx]) + (W[idx - grid.nyz + 1] - W[idx - grid.nyz])) * grid.dzih[k]);
			}
		}
	}
}
+
+
// Computes the three directional contributions to the (V,W) isotropic
// dissipation at the staggered nodes annotated per output below; the
// y/z structure mirrors uw_iso_dissipation_components with x<->y swapped
// (offsets grid.nz instead of grid.nyz for the horizontal neighbor).
template< typename T >
void nse::vw_iso_dissipation_components(
	T* _RESTRICT VWinterm_x,			// node: [UVW]
	T* _RESTRICT VWinterm_y,			// node: [W]
	T* _RESTRICT VWinterm_z,			// node: [V]

	const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T c_viscosity, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VWinterm_x, VWinterm_y, VWinterm_z ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VWinterm_x, VWinterm_y, VWinterm_z )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				VWinterm_x[idx] = (T)2.0 * c_viscosity * (
					((V[idx] - V[idx - grid.nyz]) + (V[idx - 1] - V[idx - grid.nyz - 1])) *
					((W[idx] - W[idx - grid.nyz]) + (W[idx - grid.nz] - W[idx - grid.nyz - grid.nz])) * grid.dx2iq);

				VWinterm_y[idx] = (T)2.0 * c_viscosity * (
					((V[idx + grid.nz] - V[idx]) + (V[idx + grid.nz - 1] - V[idx - 1])) *
					((W[idx + grid.nz] - W[idx]) + (W[idx] - W[idx - grid.nz])) * grid.dy2iq);

				VWinterm_z[idx] = (T)2.0 * c_viscosity * (
					((V[idx + 1] - V[idx]) * grid.dzpi[k] + (V[idx] - V[idx - 1]) * grid.dzmi[k]) *
					((W[idx + 1] - W[idx]) + (W[idx - grid.nz + 1] - W[idx - grid.nz])) * grid.dzih[k]);
			}
		}
	}
}
+
// Assemble the UV component of isotropic dissipation at [UV] nodes by
// averaging the three pre-computed directional terms from their staggered
// locations: x-term at [V] nodes (average i, i-1), y-term at [U] nodes
// (average j, j-1), z-term at [UVW] nodes (average k, k+1).
// Only interior cells are written; ghost layers (gcx/gcy/gcz) are skipped.
template< typename T >
void nse::uv_iso_dissipation(
	T* _RESTRICT UVinterm,					// node: [UV]

	const T* _RESTRICT const UVinterm_x,	// node: [V]
	const T* _RESTRICT const UVinterm_y,	// node: [U]
	const T* _RESTRICT const UVinterm_z,	// node: [UVW]
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UVinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UVinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// shidx tracks the row start for (i,j); advancing by grid.nz per j
		// avoids recomputing the full 3D index in the inner loop
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				UVinterm[idx] =
					(T)0.5 * (UVinterm_x[idx] + UVinterm_x[idx - grid.nyz]) +
					(T)0.5 * (UVinterm_y[idx] + UVinterm_y[idx - grid.nz]) +
					(T)0.5 * (UVinterm_z[idx] + UVinterm_z[idx + 1]);

			}
		}
	}
}
+
// Assemble the UW component of isotropic dissipation at [UW] nodes:
// x-term at [W] nodes (average i, i-1), y-term at [UVW] nodes
// (average j, j+1), z-term at [U] nodes (average k, k-1).
template< typename T >
void nse::uw_iso_dissipation(
	T* _RESTRICT UWinterm,					// node: [UW]

	const T* _RESTRICT const UWinterm_x,	// node: [W]
	const T* _RESTRICT const UWinterm_y,	// node: [UVW]
	const T* _RESTRICT const UWinterm_z,	// node: [U]
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UWinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UWinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				UWinterm[idx] = 
					(T)0.5 * (UWinterm_x[idx] + UWinterm_x[idx - grid.nyz]) +
					(T)0.5 * (UWinterm_y[idx] + UWinterm_y[idx + grid.nz]) +
					(T)0.5 * (UWinterm_z[idx] + UWinterm_z[idx - 1]);

			}
		}
	}
}
+
// Assemble the VW component of isotropic dissipation at [VW] nodes:
// x-term at [UVW] nodes (average i, i+1), y-term at [W] nodes
// (average j, j-1), z-term at [V] nodes (average k, k-1).
template< typename T >
void nse::vw_iso_dissipation(
	T* _RESTRICT VWinterm,					// node: [VW]

	const T* _RESTRICT const VWinterm_x,	// node: [UVW]
	const T* _RESTRICT const VWinterm_y,	// node: [W]
	const T* _RESTRICT const VWinterm_z,	// node: [V]
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VWinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VWinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				VWinterm[idx] =
					(T)0.5 * (VWinterm_x[idx] + VWinterm_x[idx + grid.nyz]) +
					(T)0.5 * (VWinterm_y[idx] + VWinterm_y[idx - grid.nz]) +
					(T)0.5 * (VWinterm_z[idx] + VWinterm_z[idx - 1]);

			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * divergence * //
+// ------------------------------------------------------------------------ //
+template< typename T >
+void nse::divergence(
+	T* _RESTRICT Div,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Div ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Div )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Div[idx] =
+					(U[idx + grid.nyz] - U[idx]) * grid.dxi
+					+ (V[idx + grid.nz] - V[idx]) * grid.dyi
+					+ (W[idx + 1] - W[idx]) * grid.dzi[k];
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * gradient * //
+// ------------------------------------------------------------------------ //
+template< typename T >
+void nse::u_sub_gradient(
+	T* _RESTRICT Uinterm,
+	const T* _RESTRICT const X,
+	const T c_gradient, const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+	const T c_grad_x = c_gradient * grid.dxi;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Uinterm[idx] -=
+					(X[idx] - X[idx - grid.nyz]) * c_grad_x;
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::v_sub_gradient(
+	T* _RESTRICT Vinterm,
+	const T* _RESTRICT const X,
+	const T c_gradient, const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+	const T c_grad_y = c_gradient * grid.dyi;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Vinterm[idx] -=
+					(X[idx] - X[idx - grid.nz]) * c_grad_y;
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::w_sub_gradient(
+	T* _RESTRICT Winterm,
+	const T* _RESTRICT const X,
+	const T c_gradient, const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+	const T c_grad_z = (T) 2.0 * c_gradient;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Winterm[idx] -= c_grad_z *
+					(X[idx] - X[idx - 1]) * grid.dzmi[k];
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * poisson equation rhs * //
+// ------------------------------------------------------------------------ //
// Pressure-Poisson right-hand side from a precomputed divergence field:
//   Rhs = Div/dt + div(U*,V*,W*-interm), forward staggered differences.
// The Div/dt term drives the projection toward a divergence-free field.
template< typename T >
void nse::poisson_rhs(
	T* _RESTRICT Rhs,
	const T* _RESTRICT const Div,
	const T* _RESTRICT const Uinterm, const T* _RESTRICT const Vinterm, const T* _RESTRICT const Winterm,
	const wstGrid3d< T >& grid, const T dt)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	// hoist the division out of the loop: multiply by 1/dt per cell
	const T dtinverse = (T) 1.0 / dt;

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Rhs ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Rhs )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Rhs[idx] = Div[idx] * dtinverse +

					(Uinterm[idx + grid.nyz] - Uinterm[idx]) * grid.dxi
					+ (Vinterm[idx + grid.nz] - Vinterm[idx]) * grid.dyi
					+ (Winterm[idx + 1] - Winterm[idx]) * grid.dzi[k];
			}
		}
	}
}
+
// Overload: same Poisson RHS, but computes the velocity divergence inline
// from (U,V,W) instead of taking a precomputed Div array.
template< typename T >
void nse::poisson_rhs(
	T* _RESTRICT Rhs,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T* _RESTRICT const Uinterm, const T* _RESTRICT const Vinterm, const T* _RESTRICT const Winterm,
	const wstGrid3d< T >& grid, const T dt)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif
	T divergence;

	// hoist the division out of the loop: multiply by 1/dt per cell
	const T dtinverse = (T) 1.0 / dt;

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx, divergence ) shared( Rhs ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx, divergence ) shared( Rhs )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				// cell-centered divergence of the current velocity field
				divergence =
					(U[idx + grid.nyz] - U[idx]) * grid.dxi
					+ (V[idx + grid.nz] - V[idx]) * grid.dyi
					+ (W[idx + 1] - W[idx]) * grid.dzi[k];

				Rhs[idx] = divergence * dtinverse +

					(Uinterm[idx + grid.nyz] - Uinterm[idx]) * grid.dxi
					+ (Vinterm[idx + grid.nz] - Vinterm[idx]) * grid.dyi
					+ (Winterm[idx + 1] - Winterm[idx]) * grid.dzi[k];
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * velocity projection * //
+// ------------------------------------------------------------------------ //
+template< typename T >
+void nse::u_projection(
+	T* _RESTRICT U,
+	const T* _RESTRICT const Uinterm, const T* _RESTRICT const Phi,
+	const wstGrid3d< T >& grid, const T dt)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( U ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( U )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				U[idx] += dt * (Uinterm[idx] -
+					(Phi[idx] - Phi[idx - grid.nyz]) * grid.dxi);
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::v_projection(
+	T* _RESTRICT V,
+	const T* _RESTRICT const Vinterm, const T* _RESTRICT const Phi,
+	const wstGrid3d< T >& grid, const T dt)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( V ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( V )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				V[idx] += dt * (Vinterm[idx] -
+					(Phi[idx] - Phi[idx - grid.nz]) * grid.dyi);
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::w_projection(
+	T* _RESTRICT W,
+	const T* _RESTRICT const Winterm, const T* _RESTRICT const Phi,
+	const wstGrid3d< T >& grid, const T dt)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( W ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( W )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				W[idx] += dt * (Winterm[idx] -
+					(Phi[idx] - Phi[idx - 1]) * (T) 2.0 * grid.dzmi[k]);
+			}
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * heat dissipation * //
+// ------------------------------------------------------------------------ //
// Viscous heating term at cell centers:
//   Xinterm = c_dissipation * ( (dU/dx)^2 + (dV/dy)^2 + (dW/dz)^2
//                               + Suv^2 + Suw^2 + Svw^2 )
// where the S** "deviators" are the cross (off-diagonal) velocity-gradient
// sums, each built from 4-point centered averages of the staggered fields.
// The *iq / *2id grid factors are assumed to carry the 1/4 averaging and
// squared-spacing normalization -- TODO confirm against wstGrid3d.
template< typename T >
void nse::heat_dissipation(
	T* _RESTRICT Xinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T c_dissipation, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif
	T uv_deviator, uw_deviator, vw_deviator;

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx) \
	private(uv_deviator, uw_deviator, vw_deviator) shared(Xinterm) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx) \
	private(uv_deviator, uw_deviator, vw_deviator) shared(Xinterm)
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				// ~ dU/dy + dV/dx, averaged to the cell center
				uv_deviator =
					(U[idx + grid.nz] + U[idx + grid.nyz + grid.nz]
					- U[idx - grid.nz] - U[idx + grid.nyz - grid.nz]) * grid.dyiq

					+ (V[idx + grid.nyz] + V[idx + grid.nyz + grid.nz]
					- V[idx - grid.nyz] - V[idx - grid.nyz + grid.nz]) * grid.dxiq;

				// ~ dU/dz + dW/dx, averaged to the cell center
				uw_deviator =
					(U[idx + 1] + U[idx + grid.nyz + 1]
					- U[idx - 1] - U[idx + grid.nyz - 1]) * grid.dziq[k]

					+ (W[idx + grid.nyz] + W[idx + grid.nyz + 1]
					- W[idx - grid.nyz] - W[idx - grid.nyz + 1]) * grid.dxiq;

				// ~ dV/dz + dW/dy, averaged to the cell center
				vw_deviator =
					(V[idx + 1] + V[idx + grid.nz + 1]
					- V[idx - 1] - V[idx + grid.nz - 1]) * grid.dziq[k]

					+ (W[idx + grid.nz] + W[idx + grid.nz + 1]
					- W[idx - grid.nz] - W[idx - grid.nz + 1]) * grid.dyiq;

				// diagonal (squared normal-gradient) terms plus the squared
				// deviators, all scaled by the dissipation coefficient
				Xinterm[idx] = c_dissipation * (
					(U[idx + grid.nyz] - U[idx]) *
					(U[idx + grid.nyz] - U[idx]) * grid.dx2id

					+ (V[idx + grid.nz] - V[idx]) *
					(V[idx + grid.nz] - V[idx]) * grid.dy2id

					+ (W[idx + 1] - W[idx]) *
					(W[idx + 1] - W[idx]) * grid.dz2id[k]

					+ uv_deviator * uv_deviator
					+ uw_deviator * uw_deviator
					+ vw_deviator * vw_deviator);
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * buoyancy * //
+// ------------------------------------------------------------------------ //
+template< typename T >
+void nse::u_buoyancy(
+	T* _RESTRICT Uinterm,
+	const T* _RESTRICT const X,
+	const T c_gravity_x, const wstGrid3d< T >& grid)
+{
+	const T c_gx = (T) 0.5 * c_gravity_x;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Uinterm[idx] += c_gx * (X[idx] + X[idx - grid.nyz]);
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::v_buoyancy(
+	T* _RESTRICT Vinterm,
+	const T* _RESTRICT const X,
+	const T c_gravity_y, const wstGrid3d< T >& grid)
+{
+	const T c_gy = (T) 0.5 * c_gravity_y;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				Vinterm[idx] += c_gy * (X[idx] + X[idx - grid.nz]);
+			}
+		}
+	}
+}
+
// Add the z-buoyancy force to Winterm at W nodes. The vertical average is
// taken in computational space (plain two-point mean); the physically-
// weighted linear interpolation is kept below, commented out, as the
// deliberate alternative discretization.
template< typename T >
void nse::w_buoyancy(
	T* _RESTRICT Winterm,
	const T* _RESTRICT const X,
	const T c_gravity_z, const wstGrid3d< T >& grid)
{
	// 0.5 folds the two-point average into the gravity coefficient
	const T c_gz = (T)0.5 * c_gravity_z;
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Winterm[idx] += c_gz *
					// * linear interpolation -> //
					//(T)2.0 * (X[idx] * grid.dz[k - 1] + X[idx - 1] * grid.dz[k]) * grid.dzmi[k];

					// * averaging in computational space -> //
					(X[idx] + X[idx - 1]);
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * coriolis * //
+// ------------------------------------------------------------------------ //
// Accumulate the Coriolis force on U:  Uinterm += f_z * <V> - f_y * <W>,
// where <V> and <W> are 4-point averages of the staggered fields onto the
// U node. c_coriolis_x is unused here (it does not enter the u-equation).
template< typename T >
void nse::u_coriolis(
	T* _RESTRICT Uinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Uinterm[idx] += (T)0.25 * c_coriolis_z * (
					V[idx] + V[idx + grid.nz] +
					V[idx - grid.nyz] + V[idx - grid.nyz + grid.nz])
					-
					(T)0.25 * c_coriolis_y * (
					W[idx] + W[idx + 1] +
					W[idx - grid.nyz] + W[idx - grid.nyz + 1]);
			}
		}
	}
}
+
// Accumulate the Coriolis force on V:  Vinterm += -f_z * <U> + f_x * <W>
// (written as a single subtraction), with 4-point averages onto the V node.
// c_coriolis_y is unused here (it does not enter the v-equation).
template< typename T >
void nse::v_coriolis(
	T* _RESTRICT Vinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vinterm[idx] -= (T)0.25 * c_coriolis_z * (
					U[idx] + U[idx + grid.nyz] +
					U[idx - grid.nz] + U[idx + grid.nyz - grid.nz])
					-
					(T)0.25 * c_coriolis_x * (
					W[idx] + W[idx + 1] +
					W[idx - grid.nz] + W[idx - grid.nz + 1]);
			}
		}
	}
}
+
// Accumulate the Coriolis force on W:  Winterm += f_y * <U> - f_x * <V>,
// with 4-point averages onto the W node. c_coriolis_z is unused here
// (it does not enter the w-equation).
template< typename T >
void nse::w_coriolis(
	T* _RESTRICT Winterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Winterm[idx] += (T)0.25 * c_coriolis_y * (
					U[idx] + U[idx + grid.nyz] +
					U[idx - 1] + U[idx + grid.nyz - 1])
					-
					(T)0.25 * c_coriolis_x * (
					V[idx] + V[idx + grid.nz] +
					V[idx - 1] + V[idx + grid.nz - 1]);
			}
		}
	}
}
+
// Geostrophically-balanced Coriolis force on U with constant parameter f:
//   Uinterm += f * (<V> - V_geo)
// where <V> is the 4-point average of V onto the U node and V_geo is the
// prescribed geostrophic wind (its contribution balances the mean pressure
// gradient).
template< typename T >
void nse::u_geo_coriolis(T* _RESTRICT Uinterm,
	const T* _RESTRICT const V, const T V_geo, const T f, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Uinterm[idx] += (T)0.25 * f * (
					V[idx] + V[idx + grid.nz] +
					V[idx - grid.nyz] + V[idx - grid.nyz + grid.nz]);

				Uinterm[idx] -= f * V_geo;
			}
		}
	}
}
+
// Geostrophically-balanced Coriolis force on V with constant parameter f:
//   Vinterm -= f * (<U> - U_geo)
// where <U> is the 4-point average of U onto the V node and U_geo is the
// prescribed geostrophic wind.
template< typename T >
void nse::v_geo_coriolis(T* _RESTRICT Vinterm,
	const T* _RESTRICT const U, const T U_geo, const T f, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vinterm[idx] -= (T)0.25 * f * (
					U[idx] + U[idx + grid.nyz] +
					U[idx - grid.nz] + U[idx + grid.nyz - grid.nz]);

				Vinterm[idx] += f * U_geo;
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
// * coriolis (set / overwrite variants) * //
+// ------------------------------------------------------------------------ //
// Same stencil as u_coriolis, but OVERWRITES Uinterm ('=' instead of '+='):
//   Uinterm = f_z * <V> - f_y * <W>.
// c_coriolis_x is unused here.
template< typename T >
void nse::u_set_coriolis(
	T* _RESTRICT Uinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Uinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Uinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Uinterm[idx] = (T)0.25 * c_coriolis_z * (
					V[idx] + V[idx + grid.nz] +
					V[idx - grid.nyz] + V[idx - grid.nyz + grid.nz])
					-
					(T)0.25 * c_coriolis_y * (
					W[idx] + W[idx + 1] +
					W[idx - grid.nyz] + W[idx - grid.nyz + 1]);
			}
		}
	}
}
+
// Same stencil as v_coriolis, but OVERWRITES Vinterm ('=' instead of '-='):
//   Vinterm = -f_z * <U> + f_x * <W>.
// c_coriolis_y is unused here.
template< typename T >
void nse::v_set_coriolis(
	T* _RESTRICT Vinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Vinterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Vinterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vinterm[idx] = - (T)0.25 * c_coriolis_z * (
					U[idx] + U[idx + grid.nyz] +
					U[idx - grid.nz] + U[idx + grid.nyz - grid.nz])
					+
					(T)0.25 * c_coriolis_x * (
					W[idx] + W[idx + 1] +
					W[idx - grid.nz] + W[idx - grid.nz + 1]);
			}
		}
	}
}
+
// Same stencil as w_coriolis, but OVERWRITES Winterm ('=' instead of '+='):
//   Winterm = f_y * <U> - f_x * <V>.
// c_coriolis_z is unused here.
template< typename T >
void nse::w_set_coriolis(
	T* _RESTRICT Winterm,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Winterm ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Winterm )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Winterm[idx] = (T)0.25 * c_coriolis_y * (
					U[idx] + U[idx + grid.nyz] +
					U[idx - 1] + U[idx + grid.nyz - 1])
					-
					(T)0.25 * c_coriolis_x * (
					V[idx] + V[idx + grid.nz] +
					V[idx - 1] + V[idx + grid.nz - 1]);
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * kinetic energy * //
+// ------------------------------------------------------------------------ //
// Total kinetic energy over the interior domain, reduced across MPI ranks.
// Per cell, each velocity component contributes the sum of its two face
// values squared, weighted by the local dz; the final factor
// 0.25 = 0.5 (two-face average per component) * 0.5 (kinetic-energy 1/2),
// with the uniform dx*dy applied once outside the sum.
template< typename T >
T nse::kinetic_energy(
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif
	T ke_sum = (T)0;

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) reduction( + : ke_sum ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) reduction( + : ke_sum )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
				ke_sum +=
					(
					(U[idx] * U[idx] + U[idx + grid.nyz] * U[idx + grid.nyz]) +
					(V[idx] * V[idx] + V[idx + grid.nz] * V[idx + grid.nz]) +
					(W[idx] * W[idx] + W[idx + 1] * W[idx + 1])
					) * grid.dz[k];
			}
		}
	}

	// sum partial integrals over all MPI ranks (collective: all ranks must call)
	mpi_allreduce(&ke_sum, MPI_SUM, grid.mpi_com.comm);
	return (T) 0.25 * ke_sum * grid.dx * grid.dy;
}
+// ------------------------------------------------------------------------ //
+
+// * vorticity * //
+// ------------------------------------------------------------------------ //
// Vorticity components by backward staggered differences:
//   wx = dW/dy - dV/dz,  wy = dU/dz - dW/dx,  wz = dV/dx - dU/dy.
// Vertical derivatives use the stretched-grid factor 2*dzmi[k]; all three
// outputs are written for interior cells only.
template< typename T >
void nse::vorticity(T* _RESTRICT Vorticity_x, T* _RESTRICT Vorticity_y, T* _RESTRICT Vorticity_z,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) \
	shared( Vorticity_x, Vorticity_y, Vorticity_z ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) \
	shared( Vorticity_x, Vorticity_y, Vorticity_z )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		// row-start index, advanced incrementally per j
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				Vorticity_x[idx] =
					(W[idx] - W[idx - grid.nz]) * grid.dyi -
					(V[idx] - V[idx - 1]) * (T)2.0 * grid.dzmi[k];

				Vorticity_y[idx] =
					(U[idx] - U[idx - 1]) * (T)2.0 * grid.dzmi[k] -
					(W[idx] - W[idx - grid.nyz]) * grid.dxi;

				Vorticity_z[idx] =
					(V[idx] - V[idx - grid.nyz]) * grid.dxi -
					(U[idx] - U[idx - grid.nz]) * grid.dyi;
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * special field products * //
+// ------------------------------------------------------------------------ //
// Advective product W * (dU/dz) at the UW node:
//   Xinterm = 0.5 * (W[ijk] + W[i-1jk]) * (U[ijk] - U[ijk-1]) / dz
// W is averaged in x to the UW node; the z-derivative of U is a backward
// difference. NOTE(review): the 0.5 factor is presumably folded into
// grid.dzih[k] -- confirm against the wstGrid3d definition.
// Only node == nodeUW is handled; for any other node the function returns
// WITHOUT touching Xinterm (callers must pass the correct node type).
template< typename T >	// = W * (dU/dz) [node: UW]
void nse::uw_advection(T* _RESTRICT Xinterm,
	const T* _RESTRICT const U, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	if (node == nodeUW) {	// 0.5 * (W[ijk] + W[i-1jk]) * (U[ijk] - U[ijk-1]) / dz
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					Xinterm[idx] = (W[idx - grid.nyz] + W[idx]) *
						(U[idx] - U[idx - 1]) * grid.dzih[k];

					// approximation is based on ADV. scheme
				}
			}
		}
		return;
	}
}
+
// Advective product W * (dV/dz) at the VW node:
//   Xinterm = 0.5 * (W[ijk] + W[ij-1k]) * (V[ijk] - V[ijk-1]) / dz
// W is averaged in y to the VW node; the z-derivative of V is a backward
// difference (0.5 presumably folded into grid.dzih[k], as in uw_advection).
// Only node == nodeVW is handled; any other node leaves Xinterm untouched.
template< typename T >	// = W * (dV/dz) [node: VW]
void nse::vw_advection(T* _RESTRICT Xinterm,
	const T* _RESTRICT const V, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	if (node == nodeVW) {	// 0.5 * (W[ijk] + W[ij-1k]) * (V[ijk] - V[ijk-1]) / dz
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					Xinterm[idx] = (W[idx - grid.nz] + W[idx]) *
						(V[idx] - V[idx - 1]) * grid.dzih[k];

					// approximation is based on ADV. scheme
				}
			}
		}
		return;
	}
}
+
// Advective product W * (dC/dz) at the W node:
//   Xinterm = W[ijk] * (C[ijk] - C[ijk-1]) / dz
// Backward z-difference of the cell-centered scalar C, scaled by grid.dzi[k].
// Only node == nodeW is handled; any other node leaves Xinterm untouched.
template< typename T >	// = W * (dC/dz) [node: W]
void nse::cw_advection(T* _RESTRICT Xinterm,
	const T* _RESTRICT const C, const T* _RESTRICT const W,
	const nse_const3d::nodeType node, const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	if (node == nodeW) {	// W[ijk] * (C[ijk] - C[ijk-1]) / dz
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( Xinterm ) collapse( 2 )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
			{
				idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( Xinterm )
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
#endif
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					Xinterm[idx] = W[idx] *
						(C[idx] - C[idx - 1]) * grid.dzi[k];

					// approximation is based on ADV. scheme
				}
			}
		}
		return;
	}
}
+// ------------------------------------------------------------------------ //
+
+// * scalar-pressure gradient * //
+// ------------------------------------------------------------------------ //
// Scalar-weighted pressure gradient C * (dP/dx) at the U node:
//   C_dPdx = 0.5 * (X[ijk] + X[i-1jk]) * (P[ijk] - P[i-1jk]) / dx
// X is averaged in x to the U node; NOTE(review): the 0.5 of the average is
// presumably folded into grid.dxih -- confirm against the wstGrid3d definition.
template< typename T >	// [ C*dP/dx ] [-> node: U]
void nse::c_u_pressure_gradient(T* _RESTRICT C_dPdx,
	const T* _RESTRICT const X, const T* _RESTRICT const Pressure,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( C_dPdx ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( C_dPdx )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				C_dPdx[idx] = (X[idx] + X[idx - grid.nyz]) *
					((Pressure[idx] - Pressure[idx - grid.nyz]) * grid.dxih);
			}
		}
	}
}
+
// Scalar-weighted pressure gradient C * (dP/dy) at the V node:
//   C_dPdy = 0.5 * (X[ijk] + X[ij-1k]) * (P[ijk] - P[ij-1k]) / dy
// y-direction analogue of c_u_pressure_gradient (0.5 presumably in grid.dyih).
template< typename T >	// [ C*dP/dy ] [-> node: V]
void nse::c_v_pressure_gradient(T* _RESTRICT C_dPdy,
	const T* _RESTRICT const X, const T* _RESTRICT const Pressure,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( C_dPdy ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( C_dPdy )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				C_dPdy[idx] = (X[idx] + X[idx - grid.nz]) *
					((Pressure[idx] - Pressure[idx - grid.nz]) * grid.dyih);
			}
		}
	}
}
+
// Scalar-weighted pressure gradient C * (dP/dz) at the W node:
//   C_dPdz = 0.5 * (X[ijk] + X[ijk-1]) * (P[ijk] - P[ijk-1]) / dzm
// z-direction analogue of c_u_pressure_gradient; grid.dzmi[k] is the
// staggered inverse spacing (presumably including the 0.5 of the average).
template< typename T >	// [ C*dP/dz ] [-> node: W]
void nse::c_w_pressure_gradient(T* _RESTRICT C_dPdz,
	const T* _RESTRICT const X, const T* _RESTRICT const Pressure,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( C_dPdz ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( C_dPdz )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				C_dPdz[idx] = (X[idx] + X[idx - 1]) *
					((Pressure[idx] - Pressure[idx - 1]) * grid.dzmi[k]);
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * pressure-strain tensor * //
+// ------------------------------------------------------------------------ //
// Diagonal pressure-strain products at the cell center (C node):
//   PU = P * dU/dx,  PV = P * dV/dy,  PW = P * dW/dz
// Each derivative is a forward difference of the face velocities across the
// cell, so no interpolation of Pressure is needed.
template< typename T >	// [ P*(dU/dx), P*(dV/dy), P*(dW/dz) ] [-> node: C]
void nse::pressure_strain_diag(T* _RESTRICT PU, T* _RESTRICT PV, T* _RESTRICT PW,
	const T* _RESTRICT const Pressure,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	// Pressure[ijk] * U-Div[ijk]
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( PU, PV, PW ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( PU, PV, PW )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				PU[idx] = Pressure[idx] * (
					(U[idx + grid.nyz] - U[idx]) * grid.dxi);
				PV[idx] = Pressure[idx] * (
					(V[idx + grid.nz] - V[idx]) * grid.dyi);
				PW[idx] = Pressure[idx] * (
					(W[idx + 1] - W[idx]) * grid.dzi[k]);
			}
		}
	}
}
+
// Off-diagonal pressure-strain term at the UV edge:
//   PS_UV = P * 2*S_uv = P * (dU/dy + dV/dx)
// Pressure is interpolated to the UV edge as a 4-point average of the
// surrounding cell centers; the 1/4 is presumably folded into grid.dxiq /
// grid.dyiq (confirm against the wstGrid3d definition).
// The i/j loops use "<=" to include the last UV node in -x and -y.
template< typename T >	// = P * 2 * S[u,v] = P * (du/dy + dv/dx) [node: UV]
void nse::pressure_strain_uv(T* _RESTRICT PS_UV,
	const T* _RESTRICT const Pressure,
	const T* _RESTRICT const U, const T* _RESTRICT const V,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	// -- including last node in [-x,-y] axis
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( PS_UV ) collapse( 2 )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( PS_UV )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				PS_UV[idx] = (
					Pressure[idx] + Pressure[idx - grid.nyz] +
					Pressure[idx - grid.nz] + Pressure[idx - grid.nyz - grid.nz]) *
					(
					// uv: p*du/dy //
					(U[idx] - U[idx - grid.nz]) * grid.dyiq +
					// vu: p*dv/dx //
					(V[idx] - V[idx - grid.nyz]) * grid.dxiq
					);
			}
		}
	}
}
+
// Off-diagonal pressure-strain term at the UW edge:
//   PS_UW = P * 2*S_uw = P * (dU/dz + dW/dx)
// Pressure is a 4-point average of the cells around the UW edge; the 1/4 is
// presumably folded into grid.dzmih[k] / grid.dxiq.
// The i/k loops use "<=" to include the last UW node in -x and -z.
template< typename T >	// = P * 2 * S[u,w] = P * (du/dz + dw/dx) [node: UW]
void nse::pressure_strain_uw(T* _RESTRICT PS_UW,
	const T* _RESTRICT const Pressure,
	const T* _RESTRICT const U, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	// -- including last node in [-x,-z] axis
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( PS_UW ) collapse( 2 )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( PS_UW )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

				PS_UW[idx] = (
					Pressure[idx] + Pressure[idx - grid.nyz] +
					Pressure[idx - 1] + Pressure[idx - grid.nyz - 1]) *
					(
					// uw: p*du/dz //
					(U[idx] - U[idx - 1]) * grid.dzmih[k] +
					// wu: p*dw/dx //
					(W[idx] - W[idx - grid.nyz]) * grid.dxiq
					);
			}
		}
	}
}
+
// Off-diagonal pressure-strain term at the VW edge:
//   PS_VW = P * 2*S_vw = P * (dV/dz + dW/dy)
// y-z analogue of pressure_strain_uw (4-point pressure average; 1/4
// presumably folded into grid.dzmih[k] / grid.dyiq).
// The j/k loops use "<=" to include the last VW node in -y and -z.
template< typename T >	// = P * 2 * S[v,w] = P * (dv/dz + dw/dy) [node: VW]
void nse::pressure_strain_vw(T* _RESTRICT PS_VW,
	const T* _RESTRICT const Pressure,
	const T* _RESTRICT const V, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	// -- including last node in [-y,-z] axis
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( PS_VW ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( PS_VW )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

				PS_VW[idx] = (
					Pressure[idx] + Pressure[idx - grid.nz] +
					Pressure[idx - 1] + Pressure[idx - grid.nz - 1]) *
					(
					// vw: p*dv/dz //
					(V[idx] - V[idx - 1]) * grid.dzmih[k] +
					// wv: p*dw/dy //
					(W[idx] - W[idx - grid.nz]) * grid.dyiq
					);
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * momentum fluxes: u_i * u_j = 2 * K_m * S_ij * //
// Eddy-viscosity momentum flux at the UV edge:
//   UV = - Km * 2*S_uv = - Km * (dU/dy + dV/dx)
// Km (eddy viscosity) is a 4-point average of the surrounding cell centers.
// The i/j loops use "<=" to include the last UV node in -x and -y.
template< typename T >
void nse::uv_momentum_flux(T* _RESTRICT UV,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const Km,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	// -- including last node in [-x,-y] axis
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UV ) collapse( 2 )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UV )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				// Km interpolated to the UV edge (4-point average)
				T Km_ijk = (T)0.25 * (Km[idx] + Km[idx - grid.nyz] + Km[idx - grid.nz] + Km[idx - grid.nyz - grid.nz]);
				UV[idx] = - Km_ijk * (
					(U[idx] - U[idx - grid.nz]) * grid.dyi +
					(V[idx] - V[idx - grid.nyz]) * grid.dxi);
			}
		}
	}
}
+
// Eddy-viscosity momentum flux at the UW edge:
//   UW = - Km * (dU/dz + dW/dx)
// Km is a 4-point average around the edge; the z-derivative of U uses
// (T)2.0 * grid.dzmi[k] (staggered inverse spacing, as elsewhere in this file).
// The i/k loops use "<=" to include the last UW node in -x and -z.
template< typename T >
void nse::uw_momentum_flux(T* _RESTRICT UW,
	const T* _RESTRICT const U, const T* _RESTRICT const W, const T* _RESTRICT const Km,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	// -- including last node in [-x,-z] axis
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( UW ) collapse( 2 )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( UW )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

				T Km_ijk = (T)0.25 * (Km[idx] + Km[idx - grid.nyz] + Km[idx - 1] + Km[idx - grid.nyz - 1]);
				UW[idx] = - Km_ijk * (
					(T)2.0 * (U[idx] - U[idx - 1]) * grid.dzmi[k] +
					(W[idx] - W[idx - grid.nyz]) * grid.dxi);
			}
		}
	}
}
+
// Eddy-viscosity momentum flux at the VW edge:
//   VW = - Km * (dV/dz + dW/dy)
// y-z analogue of uw_momentum_flux (4-point Km average, staggered z-spacing).
// The j/k loops use "<=" to include the last VW node in -y and -z.
template< typename T >
void nse::vw_momentum_flux(T* _RESTRICT VW,
	const T* _RESTRICT const V, const T* _RESTRICT const W, const T* _RESTRICT const Km,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	// -- including last node in [-y,-z] axis
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( VW ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( VW )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

				T Km_ijk = (T)0.25 * (Km[idx] + Km[idx - grid.nz] + Km[idx - 1] + Km[idx - grid.nz - 1]);
				VW[idx] = - Km_ijk * (
					(T)2.0 * (V[idx] - V[idx - 1]) * grid.dzmi[k] +
					(W[idx] - W[idx - grid.nz]) * grid.dyi);
			}
		}
	}
}
+
// Eddy-viscosity momentum flux at the UW edge, with prescribed wall fluxes.
// Same interior formula as the 4-argument overload, but on the bottom wall
// (z-rank 0, k == gcz) and top wall (last z-rank, k == nz-gcz) the value is
// replaced by the prescribed 2D flux arrays, indexed per column as
// pidx = i*ny + j. Note the minus sign is applied to the prescribed fluxes
// as well (they are stored with the opposite sign convention).
template< typename T >
void nse::uw_momentum_flux(T* _RESTRICT UW,
	const T* _RESTRICT const U, const T* _RESTRICT const W, const T* _RESTRICT const Km,
	const T* _RESTRICT const uw_momentum_flux0, const T* _RESTRICT const uw_momentum_fluxH,
	const wstGrid3d< T >& grid)
{
	int i, j, k, pidx, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	// -- including last node in [-x,-z] axis
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, pidx, idx ) shared( UW ) collapse( 2 )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, pidx, shidx, idx ) shared( UW )
	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			pidx = i * grid.ny + j;	// column index into the 2D wall-flux arrays
			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

				// --- top wall
				if ((grid.mpi_com.rank_z == grid.mpi_com.size_z - 1) &&
					(k == grid.nz - grid.gcz))
				{
					UW[idx] = - uw_momentum_fluxH[pidx];
					continue;
				}
				// --- bottom wall
				if ((grid.mpi_com.rank_z == 0) && (k == grid.gcz))
				{
					UW[idx] = - uw_momentum_flux0[pidx];
					continue;
				}

				T Km_ijk = (T)0.25 * (Km[idx] + Km[idx - grid.nyz] + Km[idx - 1] + Km[idx - grid.nyz - 1]);
				UW[idx] = - Km_ijk * (
					(T)2.0 * (U[idx] - U[idx - 1]) * grid.dzmi[k] +
					(W[idx] - W[idx - grid.nyz]) * grid.dxi);
			}
		}
	}
}
+
// Eddy-viscosity momentum flux at the VW edge, with prescribed wall fluxes.
// y-z analogue of the uw_momentum_flux wall overload: interior nodes use the
// Km-based formula; bottom (z-rank 0, k == gcz) and top (last z-rank,
// k == nz-gcz) nodes take the prescribed 2D arrays at pidx = i*ny + j,
// with the sign flipped.
template< typename T >
void nse::vw_momentum_flux(T* _RESTRICT VW,
	const T* _RESTRICT const V, const T* _RESTRICT const W, const T* _RESTRICT const Km,
	const T* _RESTRICT const vw_momentum_flux0, const T* _RESTRICT const vw_momentum_fluxH,
	const wstGrid3d< T >& grid)
{
	int i, j, k, pidx, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

	// -- including last node in [-y,-z] axis
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, pidx, idx ) shared( VW ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, pidx, shidx, idx ) shared( VW )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			pidx = i * grid.ny + j;	// column index into the 2D wall-flux arrays
			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) 
			{
				// --- top wall
				if ((grid.mpi_com.rank_z == grid.mpi_com.size_z - 1) &&
					(k == grid.nz - grid.gcz))
				{
					VW[idx] = - vw_momentum_fluxH[pidx];
					continue;
				}
				// --- bottom wall
				if ((grid.mpi_com.rank_z == 0) && (k == grid.gcz))
				{
					VW[idx] = - vw_momentum_flux0[pidx];
					continue;
				}

				T Km_ijk = (T)0.25 * (Km[idx] + Km[idx - grid.nz] + Km[idx - 1] + Km[idx - grid.nz - 1]);
				VW[idx] = - Km_ijk * (
					(T)2.0 * (V[idx] - V[idx - 1]) * grid.dzmi[k] +
					(W[idx] - W[idx - grid.nz]) * grid.dyi);
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * scalar fluxes: u_i * C = K_h * dC/dxi * //
+template< typename T >
+void nse::cu_flux(T* _RESTRICT CU,
+	const T* _RESTRICT const C, const T* _RESTRICT const Kh,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CU ) collapse( 2 )
+	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CU )
+	for (i = grid.gcx; i <= grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				T Kh_ijk = (T)0.5 * (Kh[idx] + Kh[idx - grid.nyz]);
+				CU[idx] = - Kh_ijk * (C[idx] - C[idx - grid.nyz]) * grid.dxi;
+			}
+		}
+	}
+}
+
+template< typename T >
+void nse::cv_flux(T* _RESTRICT CV,
+	const T* _RESTRICT const C, const T* _RESTRICT const Kh,
+	const wstGrid3d< T >& grid)
+{
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, idx ) shared( CV ) collapse( 2 )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++)
+		{
+			idx = i * grid.nyz + j * grid.nz + grid.gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CV )
+	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
+	{
+		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
+		for (j = grid.gcy; j <= grid.ny - grid.gcy; j++, shidx += grid.nz)
+		{
+			idx = shidx;
+#endif
+			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {
+
+				T Kh_ijk = (T)0.5 * (Kh[idx] + Kh[idx - grid.nz]);
+				CV[idx] = - Kh_ijk * (C[idx] - C[idx - grid.nz]) * grid.dyi;
+			}
+		}
+	}
+}
+
// Turbulent scalar flux in z at the W node:
//   CW = - Kh * dC/dz
// Kh is the mean of the two cell-centered values adjacent to the W face;
// the derivative uses (T)2.0 * grid.dzmi[k] (staggered inverse spacing).
// The k loop uses "<=" to include the last W node in -z.
template< typename T >
void nse::cw_flux(T* _RESTRICT CW,
	const T* _RESTRICT const C, const T* _RESTRICT const Kh,
	const wstGrid3d< T >& grid)
{
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, idx ) shared( CW ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, shidx, idx ) shared( CW )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) {

				T Kh_ijk = (T)0.5 * (Kh[idx] + Kh[idx - 1]);
				CW[idx] = - Kh_ijk * (C[idx] - C[idx - 1]) * (T)2.0 * grid.dzmi[k];
			}
		}
	}
}
+
// Turbulent scalar flux in z at the W node, with prescribed wall fluxes.
// Same interior formula as the 3-argument overload, but on the bottom wall
// (z-rank 0, k == gcz) and top wall (last z-rank, k == nz-gcz) the value is
// replaced by the prescribed 2D arrays flux0/fluxH at pidx = i*ny + j,
// with the sign flipped.
template< typename T >
void nse::cw_flux(T* _RESTRICT CW,
	const T* _RESTRICT const C, const T* _RESTRICT const Kh,
	const T* _RESTRICT const flux0, const T* _RESTRICT const fluxH,
	const wstGrid3d< T >& grid)
{
	int i, j, k, pidx, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private( i, j, k, pidx, idx ) shared( CW ) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#pragma omp parallel for private( i, j, k, pidx, shidx, idx ) shared( CW )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			pidx = i * grid.ny + j;	// column index into the 2D wall-flux arrays
			for (k = grid.gcz; k <= grid.nz - grid.gcz; k++, idx++) 
			{
				// --- top wall
				if ((grid.mpi_com.rank_z == grid.mpi_com.size_z - 1) &&
					(k == grid.nz - grid.gcz))
				{
					CW[idx] = - fluxH[pidx];
					continue;
				}
				// --- bottom wall
				if ((grid.mpi_com.rank_z == 0) && (k == grid.gcz))
				{
					CW[idx] = - flux0[pidx];
					continue;
				}


				T Kh_ijk = (T)0.5 * (Kh[idx] + Kh[idx - 1]);
				CW[idx] = - Kh_ijk * (C[idx] - C[idx - 1]) * (T)2.0 * grid.dzmi[k];
			}
		}
	}
}
+// ------------------------------------------------------------------------ //
+
+// * CFL estimate * //
// Maximum per-direction CFL numbers over interior cells:
//   u_CFL = dt * |0.5*(U[ijk] + U[i+1jk])| / dx  (and analogously for v, w),
// i.e. face velocities averaged to the cell center. Results are reduced with
// MPI_MAX over grid.mpi_com.comm and written to the three output pointers.
// Three reduction strategies, selected at compile time:
//   1) USE_OPENMP20_IN_MINMAX (without USE_AS_OPENMP31): per-thread local max
//      merged under "omp critical" -- works on OpenMP < 3.1;
//   2) USE_OPENMP_2D_CYCLE or USE_AS_OPENMP31: "reduction(max: ...)" clause
//      (requires OpenMP >= 3.1);
//   3) neither macro: the loop runs serially (no pragma) -- presumably the
//      intended fallback for compilers without max-reduction support.
template< typename T >
void nse::get_CFL(T* _RESTRICT u_CFL_max, T* _RESTRICT v_CFL_max, T* _RESTRICT w_CFL_max,
	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
	const wstGrid3d< T >& grid, const T dt)
{
	int i, j, k, shidx, idx;
	T u_cfl_sh = (T)0, v_cfl_sh = (T)0, w_cfl_sh = (T)0;

#if defined(USE_OPENMP20_IN_MINMAX) && !defined(USE_AS_OPENMP31)

	T u_cfl_local, v_cfl_local, w_cfl_local;

#pragma omp parallel private(i, j, k, shidx, idx, u_cfl_local, v_cfl_local, w_cfl_local) shared(u_cfl_sh, v_cfl_sh, w_cfl_sh)
	{
		u_cfl_local = (T)0;
		v_cfl_local = (T)0;
		w_cfl_local = (T)0;

#pragma omp for nowait
		for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
		{
			shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
			for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
			{
				idx = shidx;
				for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

					T u_cfl_p = dt * (T)0.5 * fabs(U[idx] + U[idx + grid.nyz]) / grid.dx;
					T v_cfl_p = dt * (T)0.5 * fabs(V[idx] + V[idx + grid.nz]) / grid.dy;
					T w_cfl_p = dt * (T)0.5 * fabs(W[idx] + W[idx + 1]) / grid.dz[k];

					if (u_cfl_p > u_cfl_local) u_cfl_local = u_cfl_p;
					if (v_cfl_p > v_cfl_local) v_cfl_local = v_cfl_p;
					if (w_cfl_p > w_cfl_local) w_cfl_local = w_cfl_p;
				}
			}
		}

		// merge thread-local maxima (no max-reduction clause on OpenMP 2.0)
#pragma omp critical
		{
			if (u_cfl_local > u_cfl_sh) u_cfl_sh = u_cfl_local;
			if (v_cfl_local > v_cfl_sh) v_cfl_sh = v_cfl_local;
			if (w_cfl_local > w_cfl_sh) w_cfl_sh = w_cfl_local;
		}
	}

#else

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp parallel for private(i, j, k, idx) reduction(max: u_cfl_sh, v_cfl_sh, w_cfl_sh) collapse( 2 )
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++)
		{
			idx = i * grid.nyz + j * grid.nz + grid.gcz;
#else
#ifdef USE_AS_OPENMP31
#pragma omp parallel for private(i, j, k, shidx, idx) reduction(max: u_cfl_sh, v_cfl_sh, w_cfl_sh)
#endif
	for (i = grid.gcx; i < grid.nx - grid.gcx; i++)
	{
		shidx = i * grid.nyz + grid.gcy * grid.nz + grid.gcz;
		for (j = grid.gcy; j < grid.ny - grid.gcy; j++, shidx += grid.nz)
		{
			idx = shidx;
#endif
			for (k = grid.gcz; k < grid.nz - grid.gcz; k++, idx++) {

				T u_cfl_p = dt * (T)0.5 * fabs(U[idx] + U[idx + grid.nyz]) / grid.dx;
				T v_cfl_p = dt * (T)0.5 * fabs(V[idx] + V[idx + grid.nz]) / grid.dy;
				T w_cfl_p = dt * (T)0.5 * fabs(W[idx] + W[idx + 1]) / grid.dz[k];

				if (u_cfl_p > u_cfl_sh) u_cfl_sh = u_cfl_p;
				if (v_cfl_p > v_cfl_sh) v_cfl_sh = v_cfl_p;
				if (w_cfl_p > w_cfl_sh) w_cfl_sh = w_cfl_p;
			}
		}
	}

#endif

	// global maxima across MPI ranks
	mpi_allreduce(&u_cfl_sh, &v_cfl_sh, &w_cfl_sh, MPI_MAX, grid.mpi_com.comm);

	(*u_CFL_max) = u_cfl_sh;
	(*v_CFL_max) = v_cfl_sh;
	(*w_CFL_max) = w_cfl_sh;
}
+// ------------------------------------------------------------------------- //
+
+// ------------------------------------------------------------------------ //
+// ------------------------------------------------------------------------ //
+
+
// * initialize: advection * //
// Explicit template instantiations: force float/double code generation for
// the momentum advection kernels in this translation unit.
template void nse::u_advection_div(float* _RESTRICT Uinterm,
	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
	const wstGrid3d< float >& grid);
template void nse::u_advection_div(double* _RESTRICT Uinterm,
	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
	const wstGrid3d< double >& grid);

template void nse::v_advection_div(float* _RESTRICT Vinterm,
	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
	const wstGrid3d< float >& grid);
template void nse::v_advection_div(double* _RESTRICT Vinterm,
	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
	const wstGrid3d< double >& grid);

template void nse::w_advection_div(float* _RESTRICT Winterm,
	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
	const wstGrid3d< float >& grid);
template void nse::w_advection_div(double* _RESTRICT Winterm,
	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
	const wstGrid3d< double >& grid);

template void nse::u_advection_skew(float* _RESTRICT Uinterm,
	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
	const wstGrid3d< float >& grid);
template void nse::u_advection_skew(double* _RESTRICT Uinterm,
	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
	const wstGrid3d< double >& grid);

template void nse::v_advection_skew(float* _RESTRICT Vinterm,
	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
	const wstGrid3d< float >& grid);
template void nse::v_advection_skew(double* _RESTRICT Vinterm,
	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
	const wstGrid3d< double >& grid);

template void nse::w_advection_skew(float* _RESTRICT Winterm,
	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
	const wstGrid3d< float >& grid);
template void nse::w_advection_skew(double* _RESTRICT Winterm,
	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
// * initialize: scalar advection * //
// Explicit template instantiations: force float/double code generation for
// the scalar advection kernels in this translation unit.
template void nse::c_advection_div(float* _RESTRICT Xinterm,
	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
	const float* _RESTRICT const X,
	const wstGrid3d< float >& grid);
template void nse::c_advection_div(double* _RESTRICT Xinterm,
	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
	const double* _RESTRICT const X,
	const wstGrid3d< double >& grid);

template void nse::c_advection_adv(float* _RESTRICT Xinterm,
	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
	const float* _RESTRICT const X,
	const wstGrid3d< float >& grid);
template void nse::c_advection_adv(double* _RESTRICT Xinterm,
	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
	const double* _RESTRICT const X,
	const wstGrid3d< double >& grid);

template void nse::c_advection_skew(float* _RESTRICT Xinterm,
	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
	const float* _RESTRICT const X,
	const wstGrid3d< float >& grid);
template void nse::c_advection_skew(double* _RESTRICT Xinterm,
	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
	const double* _RESTRICT const X,
	const wstGrid3d< double >& grid);

template void nse::c_advection_div_vline(float* _RESTRICT Xinterm,
	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
	const float X0, const float XH,
	const wstGrid3d< float >& grid);
template void nse::c_advection_div_vline(double* _RESTRICT Xinterm,
	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
	const double X0, const double XH,
	const wstGrid3d< double >& grid);

template void nse::c_advection_skew_vline(float* _RESTRICT Xinterm,
	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
	const float X0, const float XH,
	const wstGrid3d< float >& grid);
template void nse::c_advection_skew_vline(double* _RESTRICT Xinterm,
	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
	const double X0, const double XH,
	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
// * initialize: diffusion * //
// Explicit template instantiations: force float/double code generation for
// the (additive) diffusion kernels in this translation unit.
template void nse::u_add_diffusion(float* _RESTRICT Uinterm,
	const float* _RESTRICT const U,
	const float c_viscosity, const wstGrid3d< float >& grid);
template void nse::u_add_diffusion(double* _RESTRICT Uinterm,
	const double* _RESTRICT const U,
	const double c_viscosity, const wstGrid3d< double >& grid);

template void nse::v_add_diffusion(float* _RESTRICT Vinterm,
	const float* _RESTRICT const V,
	const float c_viscosity, const wstGrid3d< float >& grid);
template void nse::v_add_diffusion(double* _RESTRICT Vinterm,
	const double* _RESTRICT const V,
	const double c_viscosity, const wstGrid3d< double >& grid);

template void nse::w_add_diffusion(float* _RESTRICT Winterm,
	const float* _RESTRICT const W,
	const float c_viscosity, const wstGrid3d< float >& grid);
template void nse::w_add_diffusion(double* _RESTRICT Winterm,
	const double* _RESTRICT const W,
	const double c_viscosity, const wstGrid3d< double >& grid);

template void nse::c_add_diffusion(float* _RESTRICT Xinterm,
	const float* _RESTRICT const X,
	const float c_diffusivity, const wstGrid3d< float >& grid);
template void nse::c_add_diffusion(double* _RESTRICT Xinterm,
	const double* _RESTRICT const X,
	const double c_diffusivity, const wstGrid3d< double >& grid);
+
+
+template void nse::u_set_diffusion(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const U,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::u_set_diffusion(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const U,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::v_set_diffusion(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const V,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::v_set_diffusion(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const V,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::w_set_diffusion(float* _RESTRICT Winterm,
+	const float* _RESTRICT const W,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::w_set_diffusion(double* _RESTRICT Winterm,
+	const double* _RESTRICT const W,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::c_set_diffusion(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const X,
+	const float c_diffusivity, const wstGrid3d< float >& grid);
+template void nse::c_set_diffusion(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const X,
+	const double c_diffusivity, const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: dissipation operator * //
+template void nse::u_dissipation(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const U,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::u_dissipation(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const U,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::v_dissipation(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const V,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::v_dissipation(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const V,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::w_dissipation(float* _RESTRICT Winterm,
+	const float* _RESTRICT const W,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::w_dissipation(double* _RESTRICT Winterm,
+	const double* _RESTRICT const W,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::c_dissipation(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const X,
+	const float c_diffusivity, const wstGrid3d< float >& grid);
+template void nse::c_dissipation(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const X,
+	const double c_diffusivity, const wstGrid3d< double >& grid);
+
+
+template void nse::uw_dissipation(float* _RESTRICT UWinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const float* _RESTRICT const U_diffusion,
+	const float* _RESTRICT const W_diffusion,
+	const wstGrid3d< float >& grid);
+template void nse::uw_dissipation(double* _RESTRICT UWinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const double* _RESTRICT const U_diffusion,
+	const double* _RESTRICT const W_diffusion,
+	const wstGrid3d< double >& grid);
+
+template void nse::vw_dissipation(float* _RESTRICT VWinterm,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float* _RESTRICT const V_diffusion,
+	const float* _RESTRICT const W_diffusion,
+	const wstGrid3d< float >& grid);
+template void nse::vw_dissipation(double* _RESTRICT VWinterm,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double* _RESTRICT const V_diffusion,
+	const double* _RESTRICT const W_diffusion,
+	const wstGrid3d< double >& grid);
+
+template void nse::uv_dissipation(float* _RESTRICT UVinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V,
+	const float* _RESTRICT const U_diffusion,
+	const float* _RESTRICT const V_diffusion,
+	const wstGrid3d< float >& grid);
+template void nse::uv_dissipation(double* _RESTRICT UVinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V,
+	const double* _RESTRICT const U_diffusion,
+	const double* _RESTRICT const V_diffusion,
+	const wstGrid3d< double >& grid);
+
+template void nse::cu_dissipation(float* _RESTRICT CUinterm,
+	const float* _RESTRICT const X, const float* _RESTRICT const U,
+	const float* _RESTRICT const X_diffusion,
+	const float* _RESTRICT const U_diffusion,
+	const wstGrid3d< float >& grid);
+template void nse::cu_dissipation(double* _RESTRICT CUinterm,
+	const double* _RESTRICT const X, const double* _RESTRICT const U,
+	const double* _RESTRICT const X_diffusion,
+	const double* _RESTRICT const U_diffusion,
+	const wstGrid3d< double >& grid);
+
+template void nse::cv_dissipation(float* _RESTRICT CVinterm,
+	const float* _RESTRICT const X, const float* _RESTRICT const V,
+	const float* _RESTRICT const X_diffusion,
+	const float* _RESTRICT const V_diffusion,
+	const wstGrid3d< float >& grid);
+template void nse::cv_dissipation(double* _RESTRICT CVinterm,
+	const double* _RESTRICT const X, const double* _RESTRICT const V,
+	const double* _RESTRICT const X_diffusion,
+	const double* _RESTRICT const V_diffusion,
+	const wstGrid3d< double >& grid);
+
+template void nse::cw_dissipation(float* _RESTRICT CWinterm,
+	const float* _RESTRICT const X, const float* _RESTRICT const W,
+	const float* _RESTRICT const X_diffusion,
+	const float* _RESTRICT const W_diffusion,
+	const wstGrid3d< float >& grid);
+template void nse::cw_dissipation(double* _RESTRICT CWinterm,
+	const double* _RESTRICT const X, const double* _RESTRICT const W,
+	const double* _RESTRICT const X_diffusion,
+	const double* _RESTRICT const W_diffusion, 
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: isotropic dissipation operator * //
+template void nse::u_iso_dissipation(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const U,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::u_iso_dissipation(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const U,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::v_iso_dissipation(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const V,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::v_iso_dissipation(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const V,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::w_iso_dissipation(float* _RESTRICT Winterm,
+	const float* _RESTRICT const W,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::w_iso_dissipation(double* _RESTRICT Winterm,
+	const double* _RESTRICT const W,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::c_iso_dissipation(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const X,
+	const float c_diffusivity, const wstGrid3d< float >& grid);
+template void nse::c_iso_dissipation(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const X,
+	const double c_diffusivity, const wstGrid3d< double >& grid);
+
+template void nse::uv_iso_dissipation_components(
+	float* _RESTRICT UVinterm_x, float* _RESTRICT UVinterm_y, float* _RESTRICT UVinterm_z,
+	const float* _RESTRICT const U, const float* _RESTRICT const V,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::uv_iso_dissipation_components(
+	double* _RESTRICT UVinterm_x, double* _RESTRICT UVinterm_y, double* _RESTRICT UVinterm_z,
+	const double* _RESTRICT const U, const double* _RESTRICT const V,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::uw_iso_dissipation_components(
+	float* _RESTRICT UWinterm_x, float* _RESTRICT UWinterm_y, float* _RESTRICT UWinterm_z,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::uw_iso_dissipation_components(
+	double* _RESTRICT UWinterm_x, double* _RESTRICT UWinterm_y, double* _RESTRICT UWinterm_z,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::vw_iso_dissipation_components(
+	float* _RESTRICT VWinterm_x, float* _RESTRICT VWinterm_y, float* _RESTRICT VWinterm_z,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float c_viscosity, const wstGrid3d< float >& grid);
+template void nse::vw_iso_dissipation_components(
+	double* _RESTRICT VWinterm_x, double* _RESTRICT VWinterm_y, double* _RESTRICT VWinterm_z,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double c_viscosity, const wstGrid3d< double >& grid);
+
+template void nse::uv_iso_dissipation(float* _RESTRICT UVinterm,
+	const float* _RESTRICT const UVinterm_x,
+	const float* _RESTRICT const UVinterm_y,
+	const float* _RESTRICT const UVinterm_z,
+	const wstGrid3d< float >& grid);
+template void nse::uv_iso_dissipation(double* _RESTRICT UVinterm,
+	const double* _RESTRICT const UVinterm_x,
+	const double* _RESTRICT const UVinterm_y,
+	const double* _RESTRICT const UVinterm_z,
+	const wstGrid3d< double >& grid);
+
+template void nse::uw_iso_dissipation(float* _RESTRICT UWinterm,
+	const float* _RESTRICT const UWinterm_x,
+	const float* _RESTRICT const UWinterm_y,
+	const float* _RESTRICT const UWinterm_z,
+	const wstGrid3d< float >& grid);
+template void nse::uw_iso_dissipation(double* _RESTRICT UWinterm,
+	const double* _RESTRICT const UWinterm_x,
+	const double* _RESTRICT const UWinterm_y,
+	const double* _RESTRICT const UWinterm_z,
+	const wstGrid3d< double >& grid);
+
+template void nse::vw_iso_dissipation(float* _RESTRICT VWinterm,
+	const float* _RESTRICT const VWinterm_x,
+	const float* _RESTRICT const VWinterm_y,
+	const float* _RESTRICT const VWinterm_z,
+	const wstGrid3d< float >& grid);
+template void nse::vw_iso_dissipation(double* _RESTRICT VWinterm,
+	const double* _RESTRICT const VWinterm_x,
+	const double* _RESTRICT const VWinterm_y,
+	const double* _RESTRICT const VWinterm_z,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: divergence * //
+template void nse::divergence(float* _RESTRICT Div,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::divergence(double* _RESTRICT Div,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: gradient * //
+template void nse::u_sub_gradient(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const X,
+	const float c_gradient, const wstGrid3d< float >& grid);
+template void nse::u_sub_gradient(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const X,
+	const double c_gradient, const wstGrid3d< double >& grid);
+
+template void nse::v_sub_gradient(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const X,
+	const float c_gradient, const wstGrid3d< float >& grid);
+template void nse::v_sub_gradient(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const X,
+	const double c_gradient, const wstGrid3d< double >& grid);
+
+template void nse::w_sub_gradient(float* _RESTRICT Winterm,
+	const float* _RESTRICT const X,
+	const float c_gradient, const wstGrid3d< float >& grid);
+template void nse::w_sub_gradient(double* _RESTRICT Winterm,
+	const double* _RESTRICT const X,
+	const double c_gradient, const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: poisson eq. rhs * //
+template void nse::poisson_rhs(float* _RESTRICT Rhs,
+	const float* _RESTRICT const Div,
+	const float* _RESTRICT const Uinterm, const float* _RESTRICT const Vinterm, const float* _RESTRICT const Winterm,
+	const wstGrid3d< float >& grid, const float dt);
+template void nse::poisson_rhs(double* _RESTRICT Rhs,
+	const double* _RESTRICT const Div,
+	const double* _RESTRICT const Uinterm, const double* _RESTRICT const Vinterm, const double* _RESTRICT const Winterm,
+	const wstGrid3d< double >& grid, const double dt);
+
+template void nse::poisson_rhs(float* _RESTRICT Rhs,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float* _RESTRICT const Uinterm, const float* _RESTRICT const Vinterm, const float* _RESTRICT const Winterm,
+	const wstGrid3d< float >& grid, const float dt);
+template void nse::poisson_rhs(double* _RESTRICT Rhs,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double* _RESTRICT const Uinterm, const double* _RESTRICT const Vinterm, const double* _RESTRICT const Winterm,
+	const wstGrid3d< double >& grid, const double dt);
+// ------------------------------------------------------------------------ //
+
+// * initialize: projection * //
+template void nse::u_projection(float* _RESTRICT U,
+	const float* _RESTRICT const Uinterm, const float* _RESTRICT const Phi,
+	const wstGrid3d< float >& grid, const float dt);
+template void nse::u_projection(double* _RESTRICT U,
+	const double* _RESTRICT const Uinterm, const double* _RESTRICT const Phi,
+	const wstGrid3d< double >& grid, const double dt);
+
+template void nse::v_projection(float* _RESTRICT V,
+	const float* _RESTRICT const Vinterm, const float* _RESTRICT const Phi,
+	const wstGrid3d< float >& grid, const float dt);
+template void nse::v_projection(double* _RESTRICT V,
+	const double* _RESTRICT const Vinterm, const double* _RESTRICT const Phi,
+	const wstGrid3d< double >& grid, const double dt);
+
+template void nse::w_projection(float* _RESTRICT W,
+	const float* _RESTRICT const Winterm, const float* _RESTRICT const Phi,
+	const wstGrid3d< float >& grid, const float dt);
+template void nse::w_projection(double* _RESTRICT W,
+	const double* _RESTRICT const Winterm, const double* _RESTRICT const Phi,
+	const wstGrid3d< double >& grid, const double dt);
+// ------------------------------------------------------------------------ //
+
+// * initialize: heat dissipation * //
+template void nse::heat_dissipation(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float c_dissipation, const wstGrid3d< float >& grid);
+template void nse::heat_dissipation(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double c_dissipation, const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: buoyancy * //
+template void nse::u_buoyancy(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const X,
+	const float c_gravity_x, const wstGrid3d< float >& grid);
+template void nse::u_buoyancy(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const X,
+	const double c_gravity_x, const wstGrid3d< double >& grid);
+
+template void nse::v_buoyancy(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const X,
+	const float c_gravity_y, const wstGrid3d< float >& grid);
+template void nse::v_buoyancy(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const X,
+	const double c_gravity_y, const wstGrid3d< double >& grid);
+
+template void nse::w_buoyancy(float* _RESTRICT Winterm,
+	const float* _RESTRICT const X,
+	const float c_gravity_z, const wstGrid3d< float >& grid);
+template void nse::w_buoyancy(double* _RESTRICT Winterm,
+	const double* _RESTRICT const X,
+	const double c_gravity_z, const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: coriolis * //
+template void nse::u_coriolis(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float c_coriolis_x, const float c_coriolis_y, const float c_coriolis_z,
+	const wstGrid3d< float >& grid);
+template void nse::u_coriolis(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double c_coriolis_x, const double c_coriolis_y, const double c_coriolis_z,
+	const wstGrid3d< double >& grid);
+
+template void nse::v_coriolis(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float c_coriolis_x, const float c_coriolis_y, const float c_coriolis_z,
+	const wstGrid3d< float >& grid);
+template void nse::v_coriolis(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double c_coriolis_x, const double c_coriolis_y, const double c_coriolis_z,
+	const wstGrid3d< double >& grid);
+
+template void nse::w_coriolis(float* _RESTRICT Winterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float c_coriolis_x, const float c_coriolis_y, const float c_coriolis_z,
+	const wstGrid3d< float >& grid);
+template void nse::w_coriolis(double* _RESTRICT Winterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double c_coriolis_x, const double c_coriolis_y, const double c_coriolis_z,
+	const wstGrid3d< double >& grid);
+
+template void nse::u_geo_coriolis(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const V, const float V_geo, const float f, const wstGrid3d< float >& grid);
+template void nse::u_geo_coriolis(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const V, const double V_geo, const double f, const wstGrid3d< double >& grid);
+
+template void nse::v_geo_coriolis(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const U, const float U_geo, const float f, const wstGrid3d< float >& grid);
+template void nse::v_geo_coriolis(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const U, const double U_geo, const double f, const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: coriolis (set) * //
+template void nse::u_set_coriolis(float* _RESTRICT Uinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float c_coriolis_x, const float c_coriolis_y, const float c_coriolis_z,
+	const wstGrid3d< float >& grid);
+template void nse::u_set_coriolis(double* _RESTRICT Uinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double c_coriolis_x, const double c_coriolis_y, const double c_coriolis_z,
+	const wstGrid3d< double >& grid);
+
+template void nse::v_set_coriolis(float* _RESTRICT Vinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float c_coriolis_x, const float c_coriolis_y, const float c_coriolis_z,
+	const wstGrid3d< float >& grid);
+template void nse::v_set_coriolis(double* _RESTRICT Vinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double c_coriolis_x, const double c_coriolis_y, const double c_coriolis_z,
+	const wstGrid3d< double >& grid);
+
+template void nse::w_set_coriolis(float* _RESTRICT Winterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const float c_coriolis_x, const float c_coriolis_y, const float c_coriolis_z,
+	const wstGrid3d< float >& grid);
+template void nse::w_set_coriolis(double* _RESTRICT Winterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const double c_coriolis_x, const double c_coriolis_y, const double c_coriolis_z,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: kinetic energy * //
+template float nse::kinetic_energy(
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template double nse::kinetic_energy(
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: vorticity * //
+template void nse::vorticity(
+	float* _RESTRICT Vorticity_x, float* _RESTRICT Vorticity_y, float* _RESTRICT Vorticity_z,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::vorticity(
+	double* _RESTRICT Vorticity_x, double* _RESTRICT Vorticity_y, double* _RESTRICT Vorticity_z,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: special field products * //
+template void nse::uw_advection(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const wstGrid3d< float >& grid);
+template void nse::uw_advection(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const wstGrid3d< double >& grid);
+
+template void nse::vw_advection(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const wstGrid3d< float >& grid);
+template void nse::vw_advection(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const wstGrid3d< double >& grid);
+
+template void nse::cw_advection(float* _RESTRICT Xinterm,
+	const float* _RESTRICT const C, const float* _RESTRICT const W,
+	const nse_const3d::nodeType node, const wstGrid3d< float >& grid);
+template void nse::cw_advection(double* _RESTRICT Xinterm,
+	const double* _RESTRICT const C, const double* _RESTRICT const W,
+	const nse_const3d::nodeType node, const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: scalar-pressure gradient * //
+template void nse::c_u_pressure_gradient(float* _RESTRICT C_dPdx,
+	const float* _RESTRICT const X, const float* _RESTRICT const Pressure,
+	const wstGrid3d< float >& grid);
+template void nse::c_u_pressure_gradient(double* _RESTRICT C_dPdx,
+	const double* _RESTRICT const X, const double* _RESTRICT const Pressure,
+	const wstGrid3d< double >& grid);
+
+template void nse::c_v_pressure_gradient(float* _RESTRICT C_dPdy,
+	const float* _RESTRICT const X, const float* _RESTRICT const Pressure,
+	const wstGrid3d< float >& grid);
+template void nse::c_v_pressure_gradient(double* _RESTRICT C_dPdy,
+	const double* _RESTRICT const X, const double* _RESTRICT const Pressure,
+	const wstGrid3d< double >& grid);
+
+template void nse::c_w_pressure_gradient(float* _RESTRICT C_dPdz,
+	const float* _RESTRICT const X, const float* _RESTRICT const Pressure,
+	const wstGrid3d< float >& grid);
+template void nse::c_w_pressure_gradient(double* _RESTRICT C_dPdz,
+	const double* _RESTRICT const X, const double* _RESTRICT const Pressure,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: pressure-strain tensor * //
+template void nse::pressure_strain_diag(float* _RESTRICT PU, float* _RESTRICT PV, float* _RESTRICT PW,
+	const float* _RESTRICT const Pressure,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::pressure_strain_diag(double* _RESTRICT PU, double* _RESTRICT PV, double* _RESTRICT PW,
+	const double* _RESTRICT const Pressure,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+
+template void nse::pressure_strain_uv(float* _RESTRICT PS_UV,
+	const float* _RESTRICT const Pressure,
+	const float* _RESTRICT const U, const float* _RESTRICT const V,
+	const wstGrid3d< float >& grid);
+template void nse::pressure_strain_uv(double* _RESTRICT PS_UV,
+	const double* _RESTRICT const Pressure,
+	const double* _RESTRICT const U, const double* _RESTRICT const V,
+	const wstGrid3d< double >& grid);
+
+template void nse::pressure_strain_uw(float* _RESTRICT PS_UW,
+	const float* _RESTRICT const Pressure,
+	const float* _RESTRICT const U, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::pressure_strain_uw(double* _RESTRICT PS_UW,
+	const double* _RESTRICT const Pressure,
+	const double* _RESTRICT const U, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+
+template void nse::pressure_strain_vw(float* _RESTRICT PS_VW,
+	const float* _RESTRICT const Pressure,
+	const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid);
+template void nse::pressure_strain_vw(double* _RESTRICT PS_VW,
+	const double* _RESTRICT const Pressure,
+	const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: momentum fluxes: u_i * u_j = 2 * K_m * S_ij * //
+template void nse::uv_momentum_flux(float* _RESTRICT UV,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const Km,
+	const wstGrid3d< float >& grid);
+template void nse::uv_momentum_flux(double* _RESTRICT UV,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const Km,
+	const wstGrid3d< double >& grid);
+
+template void nse::uw_momentum_flux(float* _RESTRICT UW,
+	const float* _RESTRICT const U, const float* _RESTRICT const W, const float* _RESTRICT const Km,
+	const wstGrid3d< float >& grid);
+template void nse::uw_momentum_flux(double* _RESTRICT UW,
+	const double* _RESTRICT const U, const double* _RESTRICT const W, const double* _RESTRICT const Km,
+	const wstGrid3d< double >& grid);
+
+template void nse::vw_momentum_flux(float* _RESTRICT VW,
+	const float* _RESTRICT const V, const float* _RESTRICT const W, const float* _RESTRICT const Km,
+	const wstGrid3d< float >& grid);
+template void nse::vw_momentum_flux(double* _RESTRICT VW,
+	const double* _RESTRICT const V, const double* _RESTRICT const W, const double* _RESTRICT const Km,
+	const wstGrid3d< double >& grid);
+
+template void nse::uw_momentum_flux(float* _RESTRICT UW,
+	const float* _RESTRICT const U, const float* _RESTRICT const W, const float* _RESTRICT const Km,
+	const float* _RESTRICT const uw_momentum_flux0, const float* _RESTRICT const uw_momentum_fluxH,
+	const wstGrid3d< float >& grid);
+template void nse::uw_momentum_flux(double* _RESTRICT UW,
+	const double* _RESTRICT const U, const double* _RESTRICT const W, const double* _RESTRICT const Km,
+	const double* _RESTRICT const uw_momentum_flux0, const double* _RESTRICT const uw_momentum_fluxH,
+	const wstGrid3d< double >& grid);
+
+template void nse::vw_momentum_flux(float* _RESTRICT VW,
+	const float* _RESTRICT const V, const float* _RESTRICT const W, const float* _RESTRICT const Km,
+	const float* _RESTRICT const vw_momentum_flux0, const float* _RESTRICT const vw_momentum_fluxH,
+	const wstGrid3d< float >& grid);
+template void nse::vw_momentum_flux(double* _RESTRICT VW,
+	const double* _RESTRICT const V, const double* _RESTRICT const W, const double* _RESTRICT const Km,
+	const double* _RESTRICT const vw_momentum_flux0, const double* _RESTRICT const vw_momentum_fluxH,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: scalar fluxes: u_i * C = K_h * dC/dxi * //
+template void nse::cu_flux(float* _RESTRICT CU,
+	const float* _RESTRICT const C, const float* _RESTRICT const Kh,
+	const wstGrid3d< float >& grid);
+template void nse::cu_flux(double* _RESTRICT CU,
+	const double* _RESTRICT const C, const double* _RESTRICT const Kh,
+	const wstGrid3d< double >& grid);
+
+template void nse::cv_flux(float* _RESTRICT CV,
+	const float* _RESTRICT const C, const float* _RESTRICT const Kh,
+	const wstGrid3d< float >& grid);
+template void nse::cv_flux(double* _RESTRICT CV,
+	const double* _RESTRICT const C, const double* _RESTRICT const Kh,
+	const wstGrid3d< double >& grid);
+
+template void nse::cw_flux(float* _RESTRICT CW,
+	const float* _RESTRICT const C, const float* _RESTRICT const Kh,
+	const wstGrid3d< float >& grid);
+template void nse::cw_flux(double* _RESTRICT CW,
+	const double* _RESTRICT const C, const double* _RESTRICT const Kh,
+	const wstGrid3d< double >& grid);
+
+template void nse::cw_flux(float* _RESTRICT CW,
+	const float* _RESTRICT const C, const float* _RESTRICT const Kh,
+	const float* _RESTRICT const flux0, const float* _RESTRICT const fluxH,
+	const wstGrid3d< float >& grid);
+template void nse::cw_flux(double* _RESTRICT CW,
+	const double* _RESTRICT const C, const double* _RESTRICT const Kh,
+	const double* _RESTRICT const flux0, const double* _RESTRICT const fluxH,
+	const wstGrid3d< double >& grid);
+// ------------------------------------------------------------------------ //
+
+// * initialize: CFL estimate * //
+template void nse::get_CFL(float* _RESTRICT u_CFL_max, float* _RESTRICT v_CFL_max, float* _RESTRICT w_CFL_max,
+	const float* _RESTRICT const U, const float* _RESTRICT const V, const float* _RESTRICT const W,
+	const wstGrid3d< float >& grid, const float dt);
+template void nse::get_CFL(double* _RESTRICT u_CFL_max, double* _RESTRICT v_CFL_max, double* _RESTRICT w_CFL_max,
+	const double* _RESTRICT const U, const double* _RESTRICT const V, const double* _RESTRICT const W,
+	const wstGrid3d< double >& grid, const double dt);
+// ------------------------------------------------------------------------ //
diff --git a/nse3d.h b/nse3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ac53215464973e448e98ad0a0f4ec27cb60dd3e
--- /dev/null
+++ b/nse3d.h
@@ -0,0 +1,486 @@
+#pragma once
+
+// [nse3d.h(cpp)]: 3D Navier-Stokes module -X2
+//
+// Declarations only; definitions and explicit float/double instantiations
+// live in the matching .cpp. Bracket tags in the section headers record how
+// each routine touches its output array:
+//   [ := ]   output is overwritten      [ += ]   output is accumulated into
+//   [ -= ]   subtracted from output     [ := - ] overwritten with negated term
+// "node: [U] / [UV] / [UVW] ..." comments give the staggered-grid location
+// of the result (node types are defined in wstgrid3d.h / nse_const3d).
+// -------------------------------------------------------------------------------------------- //
+
+
+#include "nse-sys.h"
+#include "wstgrid3d.h"
+
+
+namespace nse
+{
+
+	// * velocity advection [ := - ] * //
+	// "_div": divergence (conservative) form; "_skew": skew-symmetric form
+	// (presumably chosen for discrete energy conservation -- confirm in .cpp)
+	template< typename T >
+	void u_advection_div(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_advection_div(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_advection_div(T* _RESTRICT Winterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void u_advection_skew(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_advection_skew(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_advection_skew(T* _RESTRICT Winterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * scalar advection [ := - ] * //
+	// "_div": divergence form; "_adv": advective form; "_skew": skew-symmetric
+	template< typename T >
+	void c_advection_div(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T* _RESTRICT const X,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void c_advection_adv(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T* _RESTRICT const X,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void c_advection_skew(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T* _RESTRICT const X,
+		const wstGrid3d< T >& grid);
+
+	// * correction for exclusion of vertical profile: + W * //
+	// X0, XH are scalar boundary values (presumably at z = 0 and z = H)
+	template< typename T >
+	void c_advection_div_vline(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T X0, const T XH,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void c_advection_skew_vline(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T X0, const T XH,
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * diffusion [ +=, := ] * //
+	// "*_add_*" accumulates (+=) into the output; "*_set_*" overwrites (:=)
+	template< typename T >
+	void u_add_diffusion(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_add_diffusion(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const V,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_add_diffusion(T* _RESTRICT Winterm,
+		const T* _RESTRICT const W,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void c_add_diffusion(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const X,
+		const T c_diffusivity, const wstGrid3d< T >& grid);
+
+
+	template< typename T >
+	void u_set_diffusion(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_set_diffusion(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const V,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_set_diffusion(T* _RESTRICT Winterm,
+		const T* _RESTRICT const W,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void c_set_diffusion(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const X,
+		const T c_diffusivity, const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * dissipation operator [ := Ui * nu * div(grad(Ui)) ] * //
+	template< typename T >
+	void u_dissipation(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_dissipation(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const V,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_dissipation(T* _RESTRICT Winterm,
+		const T* _RESTRICT const W,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void c_dissipation(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const X,
+		const T c_diffusivity, const wstGrid3d< T >& grid);
+
+
+	// cross (covariance) dissipation terms built from precomputed
+	// per-component diffusion fields (*_diffusion inputs)
+	template< typename T >
+	void uw_dissipation(T* _RESTRICT UWinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const T* _RESTRICT const U_diffusion,
+		const T* _RESTRICT const W_diffusion,
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void vw_dissipation(T* _RESTRICT VWinterm,
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T* _RESTRICT const V_diffusion,
+		const T* _RESTRICT const W_diffusion,
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void uv_dissipation(T* _RESTRICT UVinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V,
+		const T* _RESTRICT const U_diffusion,
+		const T* _RESTRICT const V_diffusion,
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void cu_dissipation(T* _RESTRICT CUinterm,
+		const T* _RESTRICT const X, const T* _RESTRICT const U,
+		const T* _RESTRICT const X_diffusion,
+		const T* _RESTRICT const U_diffusion,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void cv_dissipation(T* _RESTRICT CVinterm,
+		const T* _RESTRICT const X, const T* _RESTRICT const V,
+		const T* _RESTRICT const X_diffusion,
+		const T* _RESTRICT const V_diffusion,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void cw_dissipation(T* _RESTRICT CWinterm,
+		const T* _RESTRICT const X, const T* _RESTRICT const W,
+		const T* _RESTRICT const X_diffusion, 
+		const T* _RESTRICT const W_diffusion,
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * isotropic dissipation operator [ := nu * grad(Ui)*grad(Ui) ] * //
+	template< typename T >
+	void u_iso_dissipation(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_iso_dissipation(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const V,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_iso_dissipation(T* _RESTRICT Winterm,
+		const T* _RESTRICT const W,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+	template< typename T >
+	void c_iso_dissipation(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const X,
+		const T c_diffusivity, const wstGrid3d< T >& grid);
+
+	// := 2 * nu * grad(Ui)*grad(Uj)
+	// per-direction (x, y, z) contributions at their natural grid nodes;
+	// combined by the matching *_iso_dissipation overloads further below
+	template< typename T >
+	void uv_iso_dissipation_components(
+		T* _RESTRICT UVinterm_x,			// node: [V]
+		T* _RESTRICT UVinterm_y,			// node: [U]
+		T* _RESTRICT UVinterm_z,			// node: [UVW]
+
+		const T* _RESTRICT const U, const T* _RESTRICT const V,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void uw_iso_dissipation_components(
+		T* _RESTRICT UWinterm_x,			// node: [W]
+		T* _RESTRICT UWinterm_y,			// node: [UVW]
+		T* _RESTRICT UWinterm_z,			// node: [U]
+		
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void vw_iso_dissipation_components(
+		T* _RESTRICT VWinterm_x,			// node: [UVW]
+		T* _RESTRICT VWinterm_y,			// node: [W]
+		T* _RESTRICT VWinterm_z,			// node: [V]
+
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T c_viscosity, const wstGrid3d< T >& grid);
+
+
+	template< typename T >
+	void uv_iso_dissipation(T* _RESTRICT UVinterm,		// node: [UV]
+		const T* _RESTRICT const UVinterm_x,			// node: [V]
+		const T* _RESTRICT const UVinterm_y,			// node: [U]
+		const T* _RESTRICT const UVinterm_z,			// node: [UVW]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void uw_iso_dissipation(T* _RESTRICT UWinterm,		// node: [UW]
+		const T* _RESTRICT const UWinterm_x,			// node: [W]
+		const T* _RESTRICT const UWinterm_y,			// node: [UVW]
+		const T* _RESTRICT const UWinterm_z,			// node: [U]
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void vw_iso_dissipation(T* _RESTRICT VWinterm,		// node: [VW]
+		const T* _RESTRICT const VWinterm_x,			// node: [UVW]
+		const T* _RESTRICT const VWinterm_y,			// node: [W]
+		const T* _RESTRICT const VWinterm_z,			// node: [V]
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * divergence [ := + ] * //
+	template< typename T >
+	void divergence(T* _RESTRICT Div,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * gradient [ -=, += ] * //
+	template< typename T >
+	void u_sub_gradient(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const X,
+		const T c_gradient, const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_sub_gradient(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const X,
+		const T c_gradient, const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_sub_gradient(T* _RESTRICT Winterm,
+		const T* _RESTRICT const X,
+		const T c_gradient, const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * poisson equation rhs [ := ] * //
+	// two variants: from a precomputed divergence field (Div), or
+	// directly from the velocity components (U, V, W)
+	template< typename T >
+	void poisson_rhs(T* _RESTRICT Rhs,
+		const T* _RESTRICT const Div,
+		const T* _RESTRICT const Uinterm, const T* _RESTRICT const Vinterm, const T* _RESTRICT const Winterm,
+		const wstGrid3d< T >& grid, const T dt);
+	template< typename T >
+	void poisson_rhs(T* _RESTRICT Rhs,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T* _RESTRICT const Uinterm, const T* _RESTRICT const Vinterm, const T* _RESTRICT const Winterm,
+		const wstGrid3d< T >& grid, const T dt);
+	// -------------------------------------------------------------------- //
+
+	// * velocity projection * //
+	// corrects intermediate velocity with the pressure-correction field Phi
+	template< typename T >
+	void u_projection(T* _RESTRICT U,
+		const T* _RESTRICT const Uinterm, const T* _RESTRICT const Phi,
+		const wstGrid3d< T >& grid, const T dt);
+	template< typename T >
+	void v_projection(T* _RESTRICT V,
+		const T* _RESTRICT const Vinterm, const T* _RESTRICT const Phi,
+		const wstGrid3d< T >& grid, const T dt);
+	template< typename T >
+	void w_projection(T* _RESTRICT W,
+		const T* _RESTRICT const Winterm, const T* _RESTRICT const Phi,
+		const wstGrid3d< T >& grid, const T dt);
+	// -------------------------------------------------------------------- //
+
+
+	// * heat dissipation [ := ] * //
+	template< typename T >
+	void heat_dissipation(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T c_dissipation, const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * buoyancy  [ += ] * //
+	template< typename T >
+	void u_buoyancy(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const X,
+		const T c_gravity_x, const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_buoyancy(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const X,
+		const T c_gravity_y, const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_buoyancy(T* _RESTRICT Winterm,
+		const T* _RESTRICT const X,
+		const T c_gravity_z, const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * coriolis [ += ] * //
+	template< typename T >
+	void u_coriolis(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_coriolis(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_coriolis(T* _RESTRICT Winterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
+		const wstGrid3d< T >& grid);
+
+	// geostrophic forcing variants: V_geo / U_geo are the geostrophic wind
+	// components, f the Coriolis parameter
+	template< typename T >
+	void u_geo_coriolis(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const V, const T V_geo, const T f, const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_geo_coriolis(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const U, const T U_geo, const T f, const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+	// * coriolis [ := ] * //
+	template< typename T >
+	void u_set_coriolis(T* _RESTRICT Uinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void v_set_coriolis(T* _RESTRICT Vinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void w_set_coriolis(T* _RESTRICT Winterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const T c_coriolis_x, const T c_coriolis_y, const T c_coriolis_z,
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * kinetic energy [ := + ] * //
+	template< typename T >
+	T kinetic_energy(
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * vorticity * //
+	template< typename T >	// -> [VW, UW, UV] nodes
+	void vorticity(T* _RESTRICT Vorticity_x, T* _RESTRICT Vorticity_y, T* _RESTRICT Vorticity_z,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	// -------------------------------------------------------------------- //
+
+
+	// * special field products * //
+	template< typename T >	// = W * (dU/dz) [node: UW]
+	void uw_advection(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const wstGrid3d< T >& grid);
+	template< typename T >	// = W * (dV/dz) [node: VW]
+	void vw_advection(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const wstGrid3d< T >& grid);
+	template< typename T >	// = W * (dC/dz) [node: W]
+	void cw_advection(T* _RESTRICT Xinterm,
+		const T* _RESTRICT const C, const T* _RESTRICT const W,
+		const nse_const3d::nodeType node, const wstGrid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+
+
+	// * scalar-pressure gradient * //
+	template< typename T >	// [ C*dP/dx ] [-> node: U]
+	void c_u_pressure_gradient(T* _RESTRICT C_dPdx,
+		const T* _RESTRICT const X, const T* _RESTRICT const Pressure,
+		const wstGrid3d< T >& grid);
+	template< typename T >	// [ C*dP/dy ] [-> node: V]
+	void c_v_pressure_gradient(T* _RESTRICT C_dPdy,
+		const T* _RESTRICT const X, const T* _RESTRICT const Pressure,
+		const wstGrid3d< T >& grid);
+	template< typename T >	// [ C*dP/dz ] [-> node: W]
+	void c_w_pressure_gradient(T* _RESTRICT C_dPdz,
+		const T* _RESTRICT const X, const T* _RESTRICT const Pressure,
+		const wstGrid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+
+
+	// * pressure-strain tensor * //
+	template< typename T >	// [ P*(dU/dx), P*(dV/dy), P*(dW/dz) ] [-> node: C]
+	void pressure_strain_diag(T* _RESTRICT PU, T* _RESTRICT PV, T* _RESTRICT PW,
+		const T* _RESTRICT const Pressure,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+
+	template< typename T >	// = P * 2 * S[u,v] = P * (du/dy + dv/dx) [node: UV]
+	void pressure_strain_uv(T* _RESTRICT PS_UV,
+		const T* _RESTRICT const Pressure, const T* _RESTRICT const U, const T* _RESTRICT const V,
+		const wstGrid3d< T >& grid);
+	template< typename T >	// = P * 2 * S[u,w] = P * (du/dz + dw/dx) [node: UW]
+	void pressure_strain_uw(T* _RESTRICT PS_UW,
+		const T* _RESTRICT const Pressure, const T* _RESTRICT const U, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	template< typename T >	// = P * 2 * S[v,w] = P * (dv/dz + dw/dy) [node: VW]
+	void pressure_strain_vw(T* _RESTRICT PS_VW,
+		const T* _RESTRICT const Pressure, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+
+	// * momentum fluxes: u_i * u_j = 2 * K_m * S_ij * //
+	template< typename T >
+	void uv_momentum_flux(T* _RESTRICT UV,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const Km,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void uw_momentum_flux(T* _RESTRICT UW,
+		const T* _RESTRICT const U, const T* _RESTRICT const W, const T* _RESTRICT const Km,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void vw_momentum_flux(T* _RESTRICT VW,
+		const T* _RESTRICT const V, const T* _RESTRICT const W, const T* _RESTRICT const Km,
+		const wstGrid3d< T >& grid);
+
+	// overloads taking prescribed *flux0 / *fluxH arrays
+	// (presumably boundary fluxes at z = 0 and z = H -- confirm in .cpp)
+	template< typename T >
+	void uw_momentum_flux(T* _RESTRICT UW,
+		const T* _RESTRICT const U, const T* _RESTRICT const W, const T* _RESTRICT const Km,
+		const T* _RESTRICT const uw_momentum_flux0, const T* _RESTRICT const uw_momentum_fluxH,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void vw_momentum_flux(T* _RESTRICT VW,
+		const T* _RESTRICT const V, const T* _RESTRICT const W, const T* _RESTRICT const Km,
+		const T* _RESTRICT const vw_momentum_flux0, const T* _RESTRICT const vw_momentum_fluxH,
+		const wstGrid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+
+	// * scalar fluxes: u_i * C = K_h * dC/dxi * //
+	template< typename T >
+	void cu_flux(T* _RESTRICT CU,
+		const T* _RESTRICT const C, const T* _RESTRICT const Kh,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void cv_flux(T* _RESTRICT CV,
+		const T* _RESTRICT const C, const T* _RESTRICT const Kh,
+		const wstGrid3d< T >& grid);
+	template< typename T >
+	void cw_flux(T* _RESTRICT CW,
+		const T* _RESTRICT const C, const T* _RESTRICT const Kh,
+		const wstGrid3d< T >& grid);
+
+	template< typename T >
+	void cw_flux(T* _RESTRICT CW,
+		const T* _RESTRICT const C, const T* _RESTRICT const Kh,
+		const T* _RESTRICT const flux0, const T* _RESTRICT const fluxH,
+		const wstGrid3d< T >& grid);
+	// ------------------------------------------------------------------------- //
+
+	// * CFL estimate * //
+	// per-direction maximum Courant numbers for time step dt
+	template< typename T >
+	void get_CFL(T* _RESTRICT u_CFL_max, T* _RESTRICT v_CFL_max, T* _RESTRICT w_CFL_max,
+		const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+		const wstGrid3d< T >& grid, const T dt);
+	// ------------------------------------------------------------------------- //
+}
diff --git a/pois-base3d-x4.cpp b/pois-base3d-x4.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6ca4fd1d4f3b7286035727495ac9dfbb64ea3ae1
--- /dev/null
+++ b/pois-base3d-x4.cpp
@@ -0,0 +1,853 @@
+#include "pois-base3d-x4.h"
+#include "pois-bc3d.h"		// using x2 boundary conditions
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+// matvec_x4_omp: y = A*x for the discrete Poisson operator, 4th order in
+// -x/-y (stencil weights built from C1 = 9/8, C2 = 1/24) and 2nd order in
+// -z (metric coefficients dzp2i[k] / dzm2i[k]).
+// Storage: z is the fastest index, idx = i*(ny*nz) + j*nz + k; the loop
+// bounds [ib..ie] x [jb..je] x [kb..ke] are inclusive.
+// NOTE(review): uses "#pragma omp for ... nowait", so this is designed to be
+// called from inside an existing OpenMP parallel region -- confirm call sites.
+template< typename T >
+void poisson3d::matvec_x4_omp(	// [y = Ax]
+	T* _RESTRICT y, const T* _RESTRICT const x,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	// 4th-order finite-difference interpolation weights and their products
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0;
+	const T S11 = C1 * C1, S22 = C2 * C2,
+		S12 = (T) 2.0 * C1 * C2;
+
+	const int nyz = ny * nz;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	// two worksharing variants: collapsed (i,j) loop vs. i-only distribution
+	// with a running row index (shidx) to avoid per-j index recomputation
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+	for (i = ib; i <= ie; i++)
+	{
+		for (j = jb; j <= je; j++)
+		{
+			idx = i * nyz + j * nz + kb;
+#else
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++)
+	{
+		shidx = i * nyz + jb * nz + kb;
+		for (j = jb; j <= je; j++, shidx += nz)
+		{
+			idx = shidx;
+#endif
+
+			// unit-stride, vectorizable sweep along -z
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+			for (k = kb; k <= ke; k++, idx++)
+			{
+				y[idx] =
+					(
+						S11 * (x[idx + nyz] - x[idx] - x[idx] + x[idx - nyz]) +
+						S22 * (x[idx + 3 * nyz] - x[idx] - x[idx] + x[idx - 3 * nyz]) -
+						S12 * (x[idx + 2 * nyz] - x[idx + nyz] - x[idx - nyz] + x[idx - 2 * nyz])
+						) * dx2i +
+
+						(
+							S11 * (x[idx + nz] - x[idx] - x[idx] + x[idx - nz]) +
+							S22 * (x[idx + 3 * nz] - x[idx] - x[idx] + x[idx - 3 * nz]) -
+							S12 * (x[idx + 2 * nz] - x[idx + nz] - x[idx - nz] + x[idx - 2 * nz])
+							) * dy2i +
+
+							(x[idx + 1] - x[idx]) * dzp2i[k] -
+					(x[idx] - x[idx - 1]) * dzm2i[k];
+			}
+		}
+	}
+		}
+
+// matvec_dp_x4_omp: y = A*x (same 4th/2nd-order stencil as matvec_x4_omp)
+// fused with the dot product (y, x); fusing keeps y[idx] in registers/cache
+// for the product instead of re-reading it in a second pass.
+// The -z loop is hand-unrolled by 2; ksh = (ke-kb+1)&1 flags an odd-length
+// k-range, whose last point (k = ke) is handled in the tail branch.
+// NOTE(review): with "omp for ... nowait" each thread returns only its
+// partial dot product -- presumably the caller reduces across threads; confirm.
+template< typename T >
+T poisson3d::matvec_dp_x4_omp(	// [y = Ax], [(y,x)]
+	T* _RESTRICT y, const T* _RESTRICT const x,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	// 4th-order finite-difference interpolation weights and their products
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0;
+	const T S11 = C1 * C1, S22 = C2 * C2,
+		S12 = (T) 2.0 * C1 * C2;
+
+	const int nyz = ny * nz;
+	const int ksh = (ke - kb + 1) & 1;	// 1 if the k-range length is odd
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+	T dp = (T)0;	// this thread's partial dot product
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+	for (i = ib; i <= ie; i++)
+	{
+		for (j = jb; j <= je; j++)
+		{
+			idx = i * nyz + j * nz + kb;
+#else
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++)
+	{
+		shidx = i * nyz + jb * nz + kb;
+		for (j = jb; j <= je; j++, shidx += nz)
+		{
+			idx = shidx;
+#endif
+			// main -z loop, unrolled by 2 (processes k and k+1 per iteration)
+			for (k = kb; k <= ke - ksh; k += 2, idx += 2)
+			{
+				y[idx] =
+					(
+						S11 * (x[idx + nyz] - x[idx] - x[idx] + x[idx - nyz]) +
+						S22 * (x[idx + 3 * nyz] - x[idx] - x[idx] + x[idx - 3 * nyz]) -
+						S12 * (x[idx + 2 * nyz] - x[idx + nyz] - x[idx - nyz] + x[idx - 2 * nyz])
+						) * dx2i +
+
+						(
+							S11 * (x[idx + nz] - x[idx] - x[idx] + x[idx - nz]) +
+							S22 * (x[idx + 3 * nz] - x[idx] - x[idx] + x[idx - 3 * nz]) -
+							S12 * (x[idx + 2 * nz] - x[idx + nz] - x[idx - nz] + x[idx - 2 * nz])
+							) * dy2i +
+
+							(x[idx + 1] - x[idx]) * dzp2i[k] -
+					(x[idx] - x[idx - 1]) * dzm2i[k];
+
+				y[idx + 1] =
+					(
+						S11 * (x[idx + nyz + 1] - x[idx + 1] - x[idx + 1] + x[idx - nyz + 1]) +
+						S22 * (x[idx + 3 * nyz + 1] - x[idx + 1] - x[idx + 1] + x[idx - 3 * nyz + 1]) -
+						S12 * (x[idx + 2 * nyz + 1] - x[idx + nyz + 1] - x[idx - nyz + 1] + x[idx - 2 * nyz + 1])
+						) * dx2i +
+
+						(
+							S11 * (x[idx + nz + 1] - x[idx + 1] - x[idx + 1] + x[idx - nz + 1]) +
+							S22 * (x[idx + 3 * nz + 1] - x[idx + 1] - x[idx + 1] + x[idx - 3 * nz + 1]) -
+							S12 * (x[idx + 2 * nz + 1] - x[idx + nz + 1] - x[idx - nz + 1] + x[idx - 2 * nz + 1])
+							) * dy2i +
+
+							(x[idx + 2] - x[idx + 1]) * dzp2i[k + 1] -
+					(x[idx + 1] - x[idx]) * dzm2i[k + 1];
+
+				dp += y[idx] * x[idx] + y[idx + 1] * x[idx + 1];
+			}
+
+			if (ksh) {	// k = ke //
+				// odd-length tail: recover idx for the last -z point
+#ifdef USE_OPENMP_2D_CYCLE
+				idx = i * nyz + j * nz + ke;
+#else
+				idx = shidx + ke - kb;
+#endif
+
+				y[idx] =
+					(
+						S11 * (x[idx + nyz] - x[idx] - x[idx] + x[idx - nyz]) +
+						S22 * (x[idx + 3 * nyz] - x[idx] - x[idx] + x[idx - 3 * nyz]) -
+						S12 * (x[idx + 2 * nyz] - x[idx + nyz] - x[idx - nyz] + x[idx - 2 * nyz])
+						) * dx2i +
+
+						(
+							S11 * (x[idx + nz] - x[idx] - x[idx] + x[idx - nz]) +
+							S22 * (x[idx + 3 * nz] - x[idx] - x[idx] + x[idx - 3 * nz]) -
+							S12 * (x[idx + 2 * nz] - x[idx + nz] - x[idx - nz] + x[idx - 2 * nz])
+							) * dy2i +
+
+							(x[idx + 1] - x[idx]) * dzp2i[ke] -
+					(x[idx] - x[idx - 1]) * dzm2i[ke];
+
+				dp += y[idx] * x[idx];
+			}
+		}
+	}
+
+	return dp;
+		}
+
+// matvec_dp_x4_omp (second overload): y = A*x fused with the dot product
+// (y, z) against a separate vector z -- the stencil is identical to the
+// (y, x) overload above; only the vector entering the product differs.
+// The -z loop is hand-unrolled by 2; ksh flags an odd-length k-range whose
+// last point (k = ke) is handled in the tail branch.
+// NOTE(review): "omp for ... nowait" means each thread returns a partial
+// dot product -- presumably reduced across threads by the caller; confirm.
+template< typename T >
+T poisson3d::matvec_dp_x4_omp(	// [y = Ax], [(y,z)]
+	T* _RESTRICT y, const T* _RESTRICT const x, const T* _RESTRICT const z,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	// 4th-order finite-difference interpolation weights and their products
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0;
+	const T S11 = C1 * C1, S22 = C2 * C2,
+		S12 = (T) 2.0 * C1 * C2;
+
+	const int nyz = ny * nz;
+	const int ksh = (ke - kb + 1) & 1;	// 1 if the k-range length is odd
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+	T dp = (T)0;	// this thread's partial dot product
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+	for (i = ib; i <= ie; i++)
+	{
+		for (j = jb; j <= je; j++)
+		{
+			idx = i * nyz + j * nz + kb;
+#else
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++)
+	{
+		shidx = i * nyz + jb * nz + kb;
+		for (j = jb; j <= je; j++, shidx += nz)
+		{
+			idx = shidx;
+#endif
+			// main -z loop, unrolled by 2 (processes k and k+1 per iteration)
+			for (k = kb; k <= ke - ksh; k += 2, idx += 2)
+			{
+				y[idx] =
+					(
+						S11 * (x[idx + nyz] - x[idx] - x[idx] + x[idx - nyz]) +
+						S22 * (x[idx + 3 * nyz] - x[idx] - x[idx] + x[idx - 3 * nyz]) -
+						S12 * (x[idx + 2 * nyz] - x[idx + nyz] - x[idx - nyz] + x[idx - 2 * nyz])
+						) * dx2i +
+
+						(
+							S11 * (x[idx + nz] - x[idx] - x[idx] + x[idx - nz]) +
+							S22 * (x[idx + 3 * nz] - x[idx] - x[idx] + x[idx - 3 * nz]) -
+							S12 * (x[idx + 2 * nz] - x[idx + nz] - x[idx - nz] + x[idx - 2 * nz])
+							) * dy2i +
+
+							(x[idx + 1] - x[idx]) * dzp2i[k] -
+					(x[idx] - x[idx - 1]) * dzm2i[k];
+
+				y[idx + 1] =
+					(
+						S11 * (x[idx + nyz + 1] - x[idx + 1] - x[idx + 1] + x[idx - nyz + 1]) +
+						S22 * (x[idx + 3 * nyz + 1] - x[idx + 1] - x[idx + 1] + x[idx - 3 * nyz + 1]) -
+						S12 * (x[idx + 2 * nyz + 1] - x[idx + nyz + 1] - x[idx - nyz + 1] + x[idx - 2 * nyz + 1])
+						) * dx2i +
+
+						(
+							S11 * (x[idx + nz + 1] - x[idx + 1] - x[idx + 1] + x[idx - nz + 1]) +
+							S22 * (x[idx + 3 * nz + 1] - x[idx + 1] - x[idx + 1] + x[idx - 3 * nz + 1]) -
+							S12 * (x[idx + 2 * nz + 1] - x[idx + nz + 1] - x[idx - nz + 1] + x[idx - 2 * nz + 1])
+							) * dy2i +
+
+							(x[idx + 2] - x[idx + 1]) * dzp2i[k + 1] -
+					(x[idx + 1] - x[idx]) * dzm2i[k + 1];
+
+				dp += y[idx] * z[idx] + y[idx + 1] * z[idx + 1];
+			}
+
+			if (ksh) {	// k = ke //
+				// odd-length tail: recover idx for the last -z point
+#ifdef USE_OPENMP_2D_CYCLE
+				idx = i * nyz + j * nz + ke;
+#else
+				idx = shidx + ke - kb;
+#endif
+
+				y[idx] =
+					(
+						S11 * (x[idx + nyz] - x[idx] - x[idx] + x[idx - nyz]) +
+						S22 * (x[idx + 3 * nyz] - x[idx] - x[idx] + x[idx - 3 * nyz]) -
+						S12 * (x[idx + 2 * nyz] - x[idx + nyz] - x[idx - nyz] + x[idx - 2 * nyz])
+						) * dx2i +
+
+						(
+							S11 * (x[idx + nz] - x[idx] - x[idx] + x[idx - nz]) +
+							S22 * (x[idx + 3 * nz] - x[idx] - x[idx] + x[idx - 3 * nz]) -
+							S12 * (x[idx + 2 * nz] - x[idx + nz] - x[idx - nz] + x[idx - 2 * nz])
+							) * dy2i +
+
+							(x[idx + 1] - x[idx]) * dzp2i[ke] -
+					(x[idx] - x[idx - 1]) * dzm2i[ke];
+
+				dp += y[idx] * z[idx];
+			}
+		}
+	}
+
+	return dp;
+		}
+// -------------------------------------------------------------------- //
+
+// resvec_x4_omp: y = rhs - A*x, the residual of the Poisson system, using
+// the same 4th-order (-x/-y) / 2nd-order (-z) stencil as matvec_x4_omp.
+// Storage: z is the fastest index, idx = i*(ny*nz) + j*nz + k.
+// NOTE(review): uses "#pragma omp for ... nowait" -- must be called from
+// inside an existing OpenMP parallel region; confirm at call sites.
+template< typename T >
+void poisson3d::resvec_x4_omp(	// [y = rhs - Ax]
+	T* _RESTRICT y, const T* _RESTRICT const x, const T* _RESTRICT const rhs,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	// 4th-order finite-difference interpolation weights and their products
+	const T C1 = (T) 9.0 / (T) 8.0,
+		C2 = (T) 1.0 / (T) 24.0;
+	const T S11 = C1 * C1, S22 = C2 * C2,
+		S12 = (T) 2.0 * C1 * C2;
+
+	const int nyz = ny * nz;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+	// two worksharing variants: collapsed (i,j) loop vs. i-only distribution
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+	for (i = ib; i <= ie; i++)
+	{
+		for (j = jb; j <= je; j++)
+		{
+			idx = i * nyz + j * nz + kb;
+#else
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++)
+	{
+		shidx = i * nyz + jb * nz + kb;
+		for (j = jb; j <= je; j++, shidx += nz)
+		{
+			idx = shidx;
+#endif
+
+			// unit-stride, vectorizable sweep along -z
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+			for (k = kb; k <= ke; k++, idx++)
+			{
+				y[idx] = rhs[idx] - (
+					(
+						S11 * (x[idx + nyz] - x[idx] - x[idx] + x[idx - nyz]) +
+						S22 * (x[idx + 3 * nyz] - x[idx] - x[idx] + x[idx - 3 * nyz]) -
+						S12 * (x[idx + 2 * nyz] - x[idx + nyz] - x[idx - nyz] + x[idx - 2 * nyz])
+						) * dx2i +
+
+						(
+							S11 * (x[idx + nz] - x[idx] - x[idx] + x[idx - nz]) +
+							S22 * (x[idx + 3 * nz] - x[idx] - x[idx] + x[idx - 3 * nz]) -
+							S12 * (x[idx + 2 * nz] - x[idx + nz] - x[idx - nz] + x[idx - 2 * nz])
+							) * dy2i +
+
+							(x[idx + 1] - x[idx]) * dzp2i[k] -
+					(x[idx] - x[idx - 1]) * dzm2i[k]);
+			}
+		}
+	}
+		}
+// ------------------------------------------------------------------------ //
+
+
+// * [laplace-X4] for poisson equation with async exchanges * //
+// laplace_x4_omp: y = A*x over the full local subdomain, overlapping the
+// stencil computation with non-blocking MPI halo exchanges:
+//   1. apply boundary conditions to x;
+//   2. start cross halo exchanges of widths (3, 3, 1) -- matching the
+//      3-point horizontal stencil reach and the 1-point vertical reach;
+//   3. compute the interior that needs no halo data (shrunk by 3 cells in
+//      -x/-y and 1 cell in -z on each side where bc.p_* is set);
+//   4. finalize each directional exchange in turn and fill in the deferred
+//      3-deep -x/-y strips and 1-deep -z planes.
+// NOTE(review): bc.p_west/east/south/north/bottom/top appear to be 0/1 flags
+// selecting which sides need deferred strips (presumably sides with a halo
+// dependency) -- confirm against nse::poisson_dynamic_bc.
+// Contains "#pragma omp barrier" and worksharing constructs: must be called
+// from inside an OpenMP parallel region.
+template< typename T >
+void poisson3d::laplace_x4_omp(T* _RESTRICT y,
+	T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc)
+{
+	// --------------------------------------------------
+	// NOTE: discrete laplace is 2nd order in -z direction
+	// --------------------------------------------------
+
+	MPI_Request mpi_req[12];	// 4 requests per direction (x, y, z)
+
+	// inclusive computational bounds excluding gcx/gcy/gcz ghost layers
+	const int ib = gcx, ie = nx - gcx - 1;
+	const int jb = gcy, je = ny - gcy - 1;
+	const int kb = gcz, ke = nz - gcz - 1;
+
+	// (1) boundary conditions on x before exchanging halos
+	put_bc_omp(x, nx, ny, nz, gcx, gcy, gcz,
+		mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
+		mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,
+
+		bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);
+
+	// all threads must finish writing boundary values before the exchange
+#pragma omp barrier
+
+	// (2) start non-blocking cross halo exchanges, widths (3, 3, 1)
+	mpi_com.push_exchange_cross_halo(x, nx, ny, nz, gcx, gcy, gcz,
+		3, 3, 1, bc.x_periodic, bc.y_periodic, bc.z_periodic, mpi_req);
+
+	// (3) interior sweep: skip cells whose stencil reaches into the halo
+	matvec_x4_omp(y, x, nx, ny, nz,
+		ib + 3 * bc.p_west, ie - 3 * bc.p_east,
+		jb + 3 * bc.p_south, je - 3 * bc.p_north,
+		kb + bc.p_bottom, ke - bc.p_top,
+
+		dx2i, dy2i,
+		dzp2i, dzm2i);
+
+	// finalize -x cross exchanges //
+	mpi_com.pop_exchange_halo_x(x, nx, ny, nz, gcx, gcy, gcz,
+		3, 0, 0, bc.x_periodic, &mpi_req[0]);
+
+	// (4a) deferred 3-deep west/east strips
+	if (bc.p_west)
+		matvec_x4_omp(y, x, nx, ny, nz,
+			ib, ib + 2,
+			jb + 3 * bc.p_south, je - 3 * bc.p_north,
+			kb + bc.p_bottom, ke - bc.p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+
+	if (bc.p_east)
+		matvec_x4_omp(y, x, nx, ny, nz,
+			ie - 2, ie,
+			jb + 3 * bc.p_south, je - 3 * bc.p_north,
+			kb + bc.p_bottom, ke - bc.p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+
+
+	// finalize -y cross exchanges //
+	mpi_com.pop_exchange_halo_y(x, nx, ny, nz, gcx, gcy, gcz,
+		0, 3, 0, bc.y_periodic, &mpi_req[4]);
+
+	// (4b) deferred 3-deep south/north strips (full -x extent now available)
+	if (bc.p_south)
+		matvec_x4_omp(y, x, nx, ny, nz,
+			ib, ie,
+			jb, jb + 2,
+			kb + bc.p_bottom, ke - bc.p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+
+	if (bc.p_north)
+		matvec_x4_omp(y, x, nx, ny, nz,
+			ib, ie,
+			je - 2, je,
+			kb + bc.p_bottom, ke - bc.p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+
+
+	// finalize -z cross exchanges //
+	mpi_com.pop_exchange_halo_z(x, nx, ny, nz, gcx, gcy, gcz,
+		0, 0, 1, bc.z_periodic, &mpi_req[8]);
+
+	// (4c) deferred 1-deep bottom/top planes
+	if (bc.p_bottom)
+		matvec_x4_omp(y, x, nx, ny, nz,
+			ib, ie,
+			jb, je,
+			kb, kb,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+
+	if (bc.p_top)
+		matvec_x4_omp(y, x, nx, ny, nz,
+			ib, ie,
+			jb, je,
+			ke, ke,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+}
+// ------------------------------------------------------------------------ //
+
+
+// * [laplace-X4 + dp] for poisson equation with async exchanges * //
// [y = Ax], [(y,z)]: fused 4th-order Laplace operator and dot product with
// async halo exchanges. Same overlap structure as laplace_x4_omp: interior
// first, then deferred boundary strips as each directional exchange completes.
// Returns the PARTIAL dot product accumulated by the calling thread only
// (the underlying kernels use 'omp for nowait'); the caller combines the
// partial sums over the team (OpenMP reduction) and over the MPI ranks
// (allreduce), as done by the laplace_dp_x4 wrapper.
template< typename T >
T poisson3d::laplace_dp_x4_omp(T* _RESTRICT y,
	T* _RESTRICT x, const T* _RESTRICT const z,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const T dx2i, const T dy2i,
	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc)
{
	// --------------------------------------------------
	// NOTE: discrete laplace is 2nd order in -z direction
	// --------------------------------------------------

	T dp = (T)0;	// this thread's partial (y,z) accumulator
	MPI_Request mpi_req[12];	// [0..3]=x, [4..7]=y, [8..11]=z requests

	// local sub-domain index bounds (excluding ghost layers)
	const int ib = gcx, ie = nx - gcx - 1;
	const int jb = gcy, je = ny - gcy - 1;
	const int kb = gcz, ke = nz - gcz - 1;

	put_bc_omp(x, nx, ny, nz, gcx, gcy, gcz,
		mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
		mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,

		bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);

	// boundary values must be visible to all threads before exchanging
#pragma omp barrier

	mpi_com.push_exchange_cross_halo(x, nx, ny, nz, gcx, gcy, gcz,
		3, 3, 1, bc.x_periodic, bc.y_periodic, bc.z_periodic, mpi_req);


	// interior points (no dependence on in-flight halo data)
	dp = matvec_dp_x4_omp(y, x, z, nx, ny, nz,
		ib + 3 * bc.p_west, ie - 3 * bc.p_east,
		jb + 3 * bc.p_south, je - 3 * bc.p_north,
		kb + bc.p_bottom, ke - bc.p_top,

		dx2i, dy2i,
		dzp2i, dzm2i);

	// finalize -x cross exchanges //
	mpi_com.pop_exchange_halo_x(x, nx, ny, nz, gcx, gcy, gcz,
		3, 0, 0, bc.x_periodic, &mpi_req[0]);

	if (bc.p_west)	// deferred strip: 3 west i-planes
		dp += matvec_dp_x4_omp(y, x, z, nx, ny, nz,
			ib, ib + 2,
			jb + 3 * bc.p_south, je - 3 * bc.p_north,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

	if (bc.p_east)	// deferred strip: 3 east i-planes
		dp += matvec_dp_x4_omp(y, x, z, nx, ny, nz,
			ie - 2, ie,
			jb + 3 * bc.p_south, je - 3 * bc.p_north,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);


	// finalize -y cross exchanges //
	mpi_com.pop_exchange_halo_y(x, nx, ny, nz, gcx, gcy, gcz,
		0, 3, 0, bc.y_periodic, &mpi_req[4]);

	if (bc.p_south)	// deferred strip: 3 south j-planes
		dp += matvec_dp_x4_omp(y, x, z, nx, ny, nz,
			ib, ie,
			jb, jb + 2,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

	if (bc.p_north)	// deferred strip: 3 north j-planes
		dp += matvec_dp_x4_omp(y, x, z, nx, ny, nz,
			ib, ie,
			je - 2, je,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);


	// finalize -z cross exchanges //
	mpi_com.pop_exchange_halo_z(x, nx, ny, nz, gcx, gcy, gcz,
		0, 0, 1, bc.z_periodic, &mpi_req[8]);

	if (bc.p_bottom)	// deferred strip: single bottom k-plane
		dp += matvec_dp_x4_omp(y, x, z, nx, ny, nz,
			ib, ie,
			jb, je,
			kb, kb,

			dx2i, dy2i,
			dzp2i, dzm2i);

	if (bc.p_top)	// deferred strip: single top k-plane
		dp += matvec_dp_x4_omp(y, x, z, nx, ny, nz,
			ib, ie,
			jb, je,
			ke, ke,

			dx2i, dy2i,
			dzp2i, dzm2i);

	// partial sum only: team/MPI reduction is the caller's responsibility
	return dp;
}
+// ------------------------------------------------------------------------ //
+
+// * [laplace-residual-X4] for poisson equation with async exchanges * //
// [y = rhs - Ax]: 4th-order Laplace residual with async halo exchanges.
// Same overlap structure as laplace_x4_omp: the interior residual is
// computed while exchanges are in flight, the boundary strips after the
// corresponding directional exchange has been finalized.
// NOTE(review): must be entered by the full OpenMP team (contains a barrier)
// and collectively by all MPI ranks of mpi_com — confirm against callers.
template< typename T >
void poisson3d::laplace_residual_x4_omp(T* _RESTRICT y,
	T* _RESTRICT x, const T* _RESTRICT const rhs,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const T dx2i, const T dy2i,
	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc)
{
	// --------------------------------------------------
	// NOTE: discrete laplace is 2nd order in -z direction
	// --------------------------------------------------

	MPI_Request mpi_req[12];	// [0..3]=x, [4..7]=y, [8..11]=z requests

	// local sub-domain index bounds (excluding ghost layers)
	const int ib = gcx, ie = nx - gcx - 1;
	const int jb = gcy, je = ny - gcy - 1;
	const int kb = gcz, ke = nz - gcz - 1;

	put_bc_omp(x, nx, ny, nz, gcx, gcy, gcz,
		mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
		mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,

		bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);

	// boundary values must be visible to all threads before exchanging
#pragma omp barrier

	mpi_com.push_exchange_cross_halo(x, nx, ny, nz, gcx, gcy, gcz,
		3, 3, 1, bc.x_periodic, bc.y_periodic, bc.z_periodic, mpi_req);

	// interior points (no dependence on in-flight halo data)
	resvec_x4_omp(y, x, rhs, nx, ny, nz,
		ib + 3 * bc.p_west, ie - 3 * bc.p_east,
		jb + 3 * bc.p_south, je - 3 * bc.p_north,
		kb + bc.p_bottom, ke - bc.p_top,

		dx2i, dy2i,
		dzp2i, dzm2i);

	// finalize -x cross exchanges //
	mpi_com.pop_exchange_halo_x(x, nx, ny, nz, gcx, gcy, gcz,
		3, 0, 0, bc.x_periodic, &mpi_req[0]);

	if (bc.p_west)	// deferred strip: 3 west i-planes
		resvec_x4_omp(y, x, rhs, nx, ny, nz,
			ib, ib + 2,
			jb + 3 * bc.p_south, je - 3 * bc.p_north,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

	if (bc.p_east)	// deferred strip: 3 east i-planes
		resvec_x4_omp(y, x, rhs, nx, ny, nz,
			ie - 2, ie,
			jb + 3 * bc.p_south, je - 3 * bc.p_north,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);


	// finalize -y cross exchanges //
	mpi_com.pop_exchange_halo_y(x, nx, ny, nz, gcx, gcy, gcz,
		0, 3, 0, bc.y_periodic, &mpi_req[4]);

	if (bc.p_south)	// deferred strip: 3 south j-planes
		resvec_x4_omp(y, x, rhs, nx, ny, nz,
			ib, ie,
			jb, jb + 2,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

	if (bc.p_north)	// deferred strip: 3 north j-planes
		resvec_x4_omp(y, x, rhs, nx, ny, nz,
			ib, ie,
			je - 2, je,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);


	// finalize -z cross exchanges //
	mpi_com.pop_exchange_halo_z(x, nx, ny, nz, gcx, gcy, gcz,
		0, 0, 1, bc.z_periodic, &mpi_req[8]);

	if (bc.p_bottom)	// deferred strip: single bottom k-plane
		resvec_x4_omp(y, x, rhs, nx, ny, nz,
			ib, ie,
			jb, je,
			kb, kb,

			dx2i, dy2i,
			dzp2i, dzm2i);

	if (bc.p_top)	// deferred strip: single top k-plane
		resvec_x4_omp(y, x, rhs, nx, ny, nz,
			ib, ie,
			jb, je,
			ke, ke,

			dx2i, dy2i,
			dzp2i, dzm2i);
}
+// ------------------------------------------------------------------------ //
+
+// Initialization //
+// ------------------------------------------------------------------------ //
+
// Explicit template instantiations for T = float and T = double:
// the template definitions live in this translation unit only, so every
// kernel used through the pois-base3d-x4.h declarations must be
// instantiated here.

// * initialize: matvec * //
template void poisson3d::matvec_x4_omp(float* _RESTRICT y,
	const float* _RESTRICT const x,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i);
template void poisson3d::matvec_x4_omp(double* _RESTRICT y,
	const double* _RESTRICT const x,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i);
// ------------------------------------------------------------------------ //

// * initialize: matvec + dot product, [(y,x)] overload * //
template float poisson3d::matvec_dp_x4_omp(float* _RESTRICT y,
	const float* _RESTRICT const x,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i);
template double poisson3d::matvec_dp_x4_omp(double* _RESTRICT y,
	const double* _RESTRICT const x,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i);
// ------------------------------------------------------------------------ //

// * initialize: matvec + dot product, [(y,z)] overload * //
template float poisson3d::matvec_dp_x4_omp(float* _RESTRICT y,
	const float* _RESTRICT const x, const float* _RESTRICT const z,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i);
template double poisson3d::matvec_dp_x4_omp(double* _RESTRICT y,
	const double* _RESTRICT const x, const double* _RESTRICT const z,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i);
// ------------------------------------------------------------------------ //

// * initialize: resvec * //
template void poisson3d::resvec_x4_omp(float* _RESTRICT y,
	const float* _RESTRICT const x, const float* _RESTRICT const rhs,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i);
template void poisson3d::resvec_x4_omp(double* _RESTRICT y,
	const double* _RESTRICT const x, const double* _RESTRICT const rhs,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i);
// ------------------------------------------------------------------------ //

// * initialize: laplace operator * //
template void poisson3d::laplace_x4_omp(float* _RESTRICT y,
	float* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
template void poisson3d::laplace_x4_omp(double* _RESTRICT y,
	double* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
// ------------------------------------------------------------------------ //

// * initialize: laplace operator & dot product * //
template float poisson3d::laplace_dp_x4_omp(float* _RESTRICT y,
	float* _RESTRICT x, const float* _RESTRICT const z,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
template double poisson3d::laplace_dp_x4_omp(double* _RESTRICT y,
	double* _RESTRICT x, const double* _RESTRICT const z,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
// ------------------------------------------------------------------------ //

// * initialize: laplace residual * //
template void poisson3d::laplace_residual_x4_omp(float* _RESTRICT y,
	float* _RESTRICT x, const float* _RESTRICT const rhs,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
template void poisson3d::laplace_residual_x4_omp(double* _RESTRICT y,
	double* _RESTRICT x, const double* _RESTRICT const rhs,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
// ------------------------------------------------------------------------ //
diff --git a/pois-base3d-x4.h b/pois-base3d-x4.h
new file mode 100644
index 0000000000000000000000000000000000000000..2afe08748d35d7cdbab991204b1e53c28840e4cb
--- /dev/null
+++ b/pois-base3d-x4.h
@@ -0,0 +1,403 @@
+#pragma once
+
+// [pois-base3d-x4.h]: 3D Poisson[x4] basic operations
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "mpi-com3d.h"
+#include "pois-def3d.h"
+
+
// Serial entry points: each wrapper may be called from inside or outside an
// OpenMP parallel region (it dispatches on omp_in_parallel(), see the
// inline implementations below). The *_omp counterparts are declared in the
// second namespace block.
namespace poisson3d
{
	// * matvec * //
	template< typename T >	// [y = Ax]
	void matvec_x4(T* _RESTRICT y,
		const T* _RESTRICT const x,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);

	template< typename T >	// [y = Ax], [(y,x)]
	T matvec_dp_x4(T* _RESTRICT y,
		const T* _RESTRICT const x,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);

	template< typename T >	// [y = Ax], [(y,z)]
	T matvec_dp_x4(T* _RESTRICT y,
		const T* _RESTRICT const x, const T* _RESTRICT const z,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
	// -------------------------------------------------------------------- //

	// * resvec * //
	template< typename T >
	void resvec_x4(T* _RESTRICT y,	// [y = rhs - Ax]
		const T* _RESTRICT const x, const T* _RESTRICT const rhs,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
	// -------------------------------------------------------------------- //

	// * laplace * //
	// NOTE: the laplace_* variants overwrite the halo cells of x
	// (boundary conditions + MPI exchange), hence non-const x.
	template< typename T >	// [y = Ax]: includes MPI
	void laplace_x4(T* _RESTRICT y,
		T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com,
		const nse::poisson_dynamic_bc& bc);

	template< typename T >	// [y = Ax], [(y,z)]: includes MPI
	T laplace_dp_x4(T* _RESTRICT y,
		T* _RESTRICT x, const T* _RESTRICT const z,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com,
		const nse::poisson_dynamic_bc& bc);
	// -------------------------------------------------------------------- //

	// * laplace residual * //
	template< typename T >	// [y = rhs - Ax]: includes MPI
	void laplace_residual_x4(T* _RESTRICT y,
		T* _RESTRICT x, const T* _RESTRICT const rhs,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com,
		const nse::poisson_dynamic_bc& bc);
	// -------------------------------------------------------------------- //
}
+
+// OpenMP //
// OpenMP kernels: must be called by every thread of an active parallel
// team. The dot-product variants return per-thread partial sums, to be
// combined with an OpenMP reduction (and, for laplace_dp_x4_omp, an MPI
// allreduce) by the caller.
namespace poisson3d
{
	// * matvec * //
	template< typename T >	// [y = Ax]
	void matvec_x4_omp(T* _RESTRICT y,
		const T* _RESTRICT const x,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);

	template< typename T >	// [y = Ax], [(y,x)]
	T matvec_dp_x4_omp(T* _RESTRICT y,
		const T* _RESTRICT const x,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);

	template< typename T >	// [y = Ax], [(y,z)]
	T matvec_dp_x4_omp(T* _RESTRICT y,
		const T* _RESTRICT const x, const T* _RESTRICT const z,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
	// -------------------------------------------------------------------- //

	// * resvec * //
	template< typename T >	// [y = rhs - Ax]
	void resvec_x4_omp(T* _RESTRICT y,
		const T* _RESTRICT const x, const T* _RESTRICT const rhs,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
	// -------------------------------------------------------------------- //

	// * laplace * //
	template< typename T >	// [y = Ax]: includes MPI
	void laplace_x4_omp(T* _RESTRICT y,
		T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com,
		const nse::poisson_dynamic_bc& bc);

	template< typename T >	// [y = Ax], [(y,z)]: includes MPI
	T laplace_dp_x4_omp(T* _RESTRICT y,
		T* _RESTRICT x, const T* _RESTRICT const z,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com,
		const nse::poisson_dynamic_bc& bc);
	// -------------------------------------------------------------------- //

	// * laplace residual * //
	template< typename T >	// [y = rhs - Ax]: includes MPI
	void laplace_residual_x4_omp(T* _RESTRICT y,
		T* _RESTRICT x, const T* _RESTRICT const rhs,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com,
		const nse::poisson_dynamic_bc& bc);
	// -------------------------------------------------------------------- //
}
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+template< typename T >
+inline void poisson3d::matvec_x4(	// [y = Ax]
+	T* _RESTRICT y, const T* _RESTRICT const x,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	if (omp_in_parallel()) {
+		matvec_x4_omp(y, x, nx, ny, nz, ib, ie, jb, je, kb, ke,
+			dx2i, dy2i, dzp2i, dzm2i);
+	}
+	else
+	{
+#pragma omp parallel shared( y ) 
+		{
+			matvec_x4_omp(y, x, nx, ny, nz, ib, ie, jb, je, kb, ke,
+				dx2i, dy2i, dzp2i, dzm2i);
+		}
+	}
+}
+
+template< typename T >
+inline T poisson3d::matvec_dp_x4(	// [y = Ax], [(y,x)]
+	T* _RESTRICT y, const T* _RESTRICT const x,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	T dp;
+
+	if (omp_in_parallel()) {
+		dp = matvec_dp_x4_omp(y, x, nx, ny, nz, ib, ie, jb, je, kb, ke,
+			dx2i, dy2i, dzp2i, dzm2i);
+	}
+	else
+	{
+		dp = (T)0;
+#pragma omp parallel shared( y ) reduction( + : dp)
+		{
+			dp = matvec_dp_x4_omp(y, x, nx, ny, nz, ib, ie, jb, je, kb, ke,
+				dx2i, dy2i, dzp2i, dzm2i);
+		}
+	}
+
+	return dp;
+}
+
+template< typename T >
+inline T poisson3d::matvec_dp_x4(	// [y = Ax], [(y,z)]
+	T* _RESTRICT y, const T* _RESTRICT const x, const T* _RESTRICT const z,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	T dp;
+
+	if (omp_in_parallel()) {
+		dp = matvec_dp_x4_omp(y, x, z, nx, ny, nz, ib, ie, jb, je, kb, ke,
+			dx2i, dy2i, dzp2i, dzm2i);
+	}
+	else
+	{
+		dp = (T)0;
+#pragma omp parallel shared( y ) reduction( + : dp)
+		{
+			dp = matvec_dp_x4_omp(y, x, z, nx, ny, nz, ib, ie, jb, je, kb, ke,
+				dx2i, dy2i, dzp2i, dzm2i);
+		}
+	}
+
+	return dp;
+}
+
+template< typename T >
+inline void poisson3d::resvec_x4(	// [y = rhs - Ax]
+	T* _RESTRICT y, const T* _RESTRICT const x, const T* _RESTRICT const rhs,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	if (omp_in_parallel()) {
+		resvec_x4_omp(y, x, rhs, nx, ny, nz, ib, ie, jb, je, kb, ke,
+			dx2i, dy2i, dzp2i, dzm2i);
+	}
+	else
+	{
+#pragma omp parallel shared( y ) 
+		{
+			resvec_x4_omp(y, x, rhs, nx, ny, nz, ib, ie, jb, je, kb, ke,
+				dx2i, dy2i, dzp2i, dzm2i);
+		}
+	}
+}
+
+template< typename T >	// [y = Ax]: includes MPI
+inline void poisson3d::laplace_x4(T* _RESTRICT y,
+	T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc)
+{
+	if (omp_in_parallel()) {
+		laplace_x4_omp(y, x, nx, ny, nz, gcx, gcy, gcz,
+			dx2i, dy2i, dzp2i, dzm2i,
+
+			mpi_com, bc);
+	}
+	else
+	{
+#pragma omp parallel shared( y, x ) 
+		{
+			laplace_x4_omp(y, x, nx, ny, nz, gcx, gcy, gcz,
+				dx2i, dy2i, dzp2i, dzm2i,
+
+				mpi_com, bc);
+		}
+	}
+}
+
+template< typename T >	// [y = Ax], [(y,z)]: includes MPI
+inline T poisson3d::laplace_dp_x4(T* _RESTRICT y,
+	T* _RESTRICT x, const T* _RESTRICT const z,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc)
+{
+	T dp;
+
+	if (omp_in_parallel()) {
+		dp = laplace_dp_x4_omp(y, x, z, nx, ny, nz, gcx, gcy, gcz,
+			dx2i, dy2i, dzp2i, dzm2i,
+
+			mpi_com, bc);
+	}
+	else
+	{
+		dp = (T)0;
+#pragma omp parallel shared( y, x ) reduction( + : dp)
+		{
+			dp = laplace_dp_x4_omp(y, x, z, nx, ny, nz, gcx, gcy, gcz,
+				dx2i, dy2i, dzp2i, dzm2i,
+
+				mpi_com, bc);
+		}
+
+		nse::mpi_allreduce(&dp, MPI_SUM, mpi_com.comm);
+	}
+
+	return dp;
+}
+
+template< typename T >	// [y = rhs - Ax]: includes MPI
+inline void poisson3d::laplace_residual_x4(T* _RESTRICT y,
+	T* _RESTRICT x, const T* _RESTRICT const rhs,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc)
+{
+	if (omp_in_parallel()) {
+		laplace_residual_x4_omp(y, x, rhs, nx, ny, nz, gcx, gcy, gcz,
+			dx2i, dy2i, dzp2i, dzm2i,
+
+			mpi_com, bc);
+	}
+	else
+	{
+#pragma omp parallel shared( y, x ) 
+		{
+			laplace_residual_x4_omp(y, x, rhs, nx, ny, nz, gcx, gcy, gcz,
+				dx2i, dy2i, dzp2i, dzm2i,
+
+				mpi_com, bc);
+		}
+	}
+}
diff --git a/pois-base3d.cpp b/pois-base3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8f505377265091bb044abe489139085ee9523944
--- /dev/null
+++ b/pois-base3d.cpp
@@ -0,0 +1,782 @@
+#include "pois-base3d.h"
+#include "pois-bc3d.h"
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+template< typename T >
+void poisson3d::matvec_omp(	// [y = Ax]
+	T* _RESTRICT y, const T* _RESTRICT const x,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	const int nyz = ny * nz;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+	for (i = ib; i <= ie; i++)
+	{
+		for (j = jb; j <= je; j++)
+		{
+			idx = i * nyz + j * nz + kb;
+#else
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++)
+	{
+		shidx = i * nyz + jb * nz + kb;
+		for (j = jb; j <= je; j++, shidx += nz)
+		{
+			idx = shidx;
+#endif
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+			for (k = kb; k <= ke; k++, idx++)
+			{
+				y[idx] =
+					(x[idx + nyz] - x[idx] - x[idx] + x[idx - nyz]) * dx2i +
+					(x[idx + nz] - x[idx] - x[idx] + x[idx - nz]) * dy2i +
+
+					(x[idx + 1] - x[idx]) * dzp2i[k] -
+					(x[idx] - x[idx - 1]) * dzm2i[k];
+			}
+		}
+	}
+}
+
// [y = Ax], [(y,x)]: laplacian stencil fused with the dot product of the
// result against x. The innermost -k loop is manually unrolled by 2 to
// feed the dp accumulation; when the k-extent is odd the last node (k = ke)
// is handled separately. Returns the calling thread's PARTIAL dot product
// ('omp for nowait' work sharing) — the caller combines partial sums.
template< typename T >
T poisson3d::matvec_dp_omp(	// [y = Ax], [(y,x)]
	T* _RESTRICT y, const T* _RESTRICT const x,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const T dx2i, const T dy2i,
	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
{
	const int nyz = ny * nz;	// -x stride of the linearized layout
	const int ksh = (ke - kb + 1) & 1;	// 1 if the k-extent is odd (remainder node)
	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// running index of the current (i,j) row start
#endif
	T dp = (T)0;	// this thread's partial (y,x) sum

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
	for (i = ib; i <= ie; i++)
	{
		for (j = jb; j <= je; j++)
		{
			idx = i * nyz + j * nz + kb;
#else
#pragma omp for nowait
	for (i = ib; i <= ie; i++)
	{
		shidx = i * nyz + jb * nz + kb;
		for (j = jb; j <= je; j++, shidx += nz)
		{
			idx = shidx;
#endif
			// 2-way unrolled stencil + accumulation over k
			for (k = kb; k <= ke - ksh; k += 2, idx += 2)
			{
				y[idx] =
					(x[idx + nyz] - x[idx] - x[idx] + x[idx - nyz]) * dx2i +
					(x[idx + nz] - x[idx] - x[idx] + x[idx - nz]) * dy2i +

					(x[idx + 1] - x[idx]) * dzp2i[k] -
					(x[idx] - x[idx - 1]) * dzm2i[k];

				y[idx + 1] =
					(x[idx + nyz + 1] - x[idx + 1] - x[idx + 1] + x[idx - nyz + 1]) * dx2i +
					(x[idx + nz + 1] - x[idx + 1] - x[idx + 1] + x[idx - nz + 1]) * dy2i +

					(x[idx + 2] - x[idx + 1]) * dzp2i[k + 1] -
					(x[idx + 1] - x[idx]) * dzm2i[k + 1];

				dp += y[idx] * x[idx] + y[idx + 1] * x[idx + 1];
			}

			if (ksh) {	// k = ke //
				// odd remainder: one un-unrolled node at the top of the column
#ifdef USE_OPENMP_2D_CYCLE
				idx = i * nyz + j * nz + ke;
#else
				idx = shidx + ke - kb;
#endif

				y[idx] =
					(x[idx + nyz] - x[idx] - x[idx] + x[idx - nyz]) * dx2i +
					(x[idx + nz] - x[idx] - x[idx] + x[idx - nz]) * dy2i +

					(x[idx + 1] - x[idx]) * dzp2i[ke] -
					(x[idx] - x[idx - 1]) * dzm2i[ke];

				dp += y[idx] * x[idx];
			}
		}
	}

	// partial sum only: OpenMP/MPI reduction is the caller's responsibility
	return dp;
}
+
// [y = Ax], [(y,z)]: same fused kernel as the (y,x) overload above, but the
// dot product is taken against a separate vector z. The -k loop is manually
// unrolled by 2 with a scalar remainder at k = ke when the extent is odd.
// Returns the calling thread's PARTIAL dot product ('omp for nowait').
template< typename T >
T poisson3d::matvec_dp_omp(	// [y = Ax], [(y,z)]
	T* _RESTRICT y, const T* _RESTRICT const x, const T* _RESTRICT const z,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const T dx2i, const T dy2i,
	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
{
	const int nyz = ny * nz;	// -x stride of the linearized layout
	const int ksh = (ke - kb + 1) & 1;	// 1 if the k-extent is odd (remainder node)

	int i, j, k, idx;
#ifndef USE_OPENMP_2D_CYCLE
	int shidx;	// running index of the current (i,j) row start
#endif
	T dp = (T)0;	// this thread's partial (y,z) sum

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
	for (i = ib; i <= ie; i++)
	{
		for (j = jb; j <= je; j++)
		{
			idx = i * nyz + j * nz + kb;
#else
#pragma omp for nowait
	for (i = ib; i <= ie; i++)
	{
		shidx = i * nyz + jb * nz + kb;
		for (j = jb; j <= je; j++, shidx += nz)
		{
			idx = shidx;
#endif
			// 2-way unrolled stencil + accumulation over k
			for (k = kb; k <= ke - ksh; k += 2, idx += 2)
			{
				y[idx] =
					(x[idx + nyz] - x[idx] - x[idx] + x[idx - nyz]) * dx2i +
					(x[idx + nz] - x[idx] - x[idx] + x[idx - nz]) * dy2i +

					(x[idx + 1] - x[idx]) * dzp2i[k] -
					(x[idx] - x[idx - 1]) * dzm2i[k];

				y[idx + 1] =
					(x[idx + nyz + 1] - x[idx + 1] - x[idx + 1] + x[idx - nyz + 1]) * dx2i +
					(x[idx + nz + 1] - x[idx + 1] - x[idx + 1] + x[idx - nz + 1]) * dy2i +

					(x[idx + 2] - x[idx + 1]) * dzp2i[k + 1] -
					(x[idx + 1] - x[idx]) * dzm2i[k + 1];

				dp += y[idx] * z[idx] + y[idx + 1] * z[idx + 1];
			}

			if (ksh) {	// k = ke //
				// odd remainder: one un-unrolled node at the top of the column
#ifdef USE_OPENMP_2D_CYCLE
				idx = i * nyz + j * nz + ke;
#else
				idx = shidx + ke - kb;
#endif

				y[idx] =
					(x[idx + nyz] - x[idx] - x[idx] + x[idx - nyz]) * dx2i +
					(x[idx + nz] - x[idx] - x[idx] + x[idx - nz]) * dy2i +

					(x[idx + 1] - x[idx]) * dzp2i[ke] -
					(x[idx] - x[idx - 1]) * dzm2i[ke];

				dp += y[idx] * z[idx];
			}
		}
	}

	// partial sum only: OpenMP/MPI reduction is the caller's responsibility
	return dp;
}
+// -------------------------------------------------------------------- //
+
+template< typename T >
+void poisson3d::resvec_omp(	// [y = rhs - Ax]
+	T* _RESTRICT y, const T* _RESTRICT const x, const T* _RESTRICT const rhs,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	const int nyz = ny * nz;
+	int i, j, k, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+	for (i = ib; i <= ie; i++)
+	{
+		for (j = jb; j <= je; j++)
+		{
+			idx = i * nyz + j * nz + kb;
+#else
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++)
+	{
+		shidx = i * nyz + jb * nz + kb;
+		for (j = jb; j <= je; j++, shidx += nz)
+		{
+			idx = shidx;
+#endif
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+			for (k = kb; k <= ke; k++, idx++)
+			{
+				y[idx] = rhs[idx] - (
+					(x[idx + nyz] - x[idx] - x[idx] + x[idx - nyz]) * dx2i +
+					(x[idx + nz] - x[idx] - x[idx] + x[idx - nz]) * dy2i +
+
+					(x[idx + 1] - x[idx]) * dzp2i[k] -
+					(x[idx] - x[idx - 1]) * dzm2i[k]);
+			}
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+template< typename T >
+void poisson3d::set_diagonal_inverse_omp( // diagonal inverse
+	T* _RESTRICT idg,
+	const int nz,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	int k;
+#pragma omp for nowait
+	for (k = kb; k <= ke; k++)
+		idg[k] = (T) 1.0 /
+		(-(T)2.0 * dx2i - (T)2.0 * dy2i - dzp2i[k] - dzm2i[k]);
+}
+// -------------------------------------------------------------------- //
+
+
// * [laplace] for poisson equation with async exchanges * //
//
// y = A*x for the 7-point Laplace stencil with communication-computation
// overlap: non-blocking halo exchanges are posted first, the
// halo-independent interior is processed while they are in flight, and
// each boundary layer is processed right after its halo is finalized.
// Runs inside an OpenMP parallel region (kernels use orphaned 'omp for').
template< typename T >
void poisson3d::laplace_omp(T* _RESTRICT y,
	T* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const T dx2i, const T dy2i,
	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc)
{
	MPI_Request mpi_req[12];	// 4 requests per axis (see pop calls below)

	// loop limits excluding ghost layers //
	const int ib = gcx, ie = nx - gcx - 1;
	const int jb = gcy, je = ny - gcy - 1;
	const int kb = gcz, ke = nz - gcz - 1;

	// fill local ghost layers with physical boundary conditions //
	put_bc_omp(x, nx, ny, nz, gcx, gcy, gcz,
		mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
		mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,

		bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);

	// make sure all threads finished b.c. writes before exchanges start //
#pragma omp barrier

	// post non-blocking cross halo exchanges, width 1 in each direction //
	mpi_com.push_exchange_cross_halo(x, nx, ny, nz, gcx, gcy, gcz,
		1, 1, 1, bc.x_periodic, bc.y_periodic, bc.z_periodic, mpi_req);

	// interior part: bc.p_* (0/1) shrinks each side that waits on MPI data //
	matvec_omp(y, x, nx, ny, nz,
		ib + bc.p_west, ie - bc.p_east,
		jb + bc.p_south, je - bc.p_north,
		kb + bc.p_bottom, ke - bc.p_top,

		dx2i, dy2i,
		dzp2i, dzm2i);

	// finalize -x cross exchanges //
	mpi_com.pop_exchange_halo_x(x, nx, ny, nz, gcx, gcy, gcz,
		1, 0, 0, bc.x_periodic, &mpi_req[0]);

	// deferred single i-plane on the west side //
	if (bc.p_west)
		matvec_omp(y, x, nx, ny, nz,
			ib, ib,
			jb + bc.p_south, je - bc.p_north,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

	// deferred single i-plane on the east side //
	if (bc.p_east)
		matvec_omp(y, x, nx, ny, nz,
			ie, ie,
			jb + bc.p_south, je - bc.p_north,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);


	// finalize -y cross exchanges //
	mpi_com.pop_exchange_halo_y(x, nx, ny, nz, gcx, gcy, gcz,
		0, 1, 0, bc.y_periodic, &mpi_req[4]);

	// deferred single j-plane on the south side //
	if (bc.p_south)
		matvec_omp(y, x, nx, ny, nz,
			ib, ie,
			jb, jb,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

	// deferred single j-plane on the north side //
	if (bc.p_north)
		matvec_omp(y, x, nx, ny, nz,
			ib, ie,
			je, je,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);


	// finalize -z cross exchanges //
	mpi_com.pop_exchange_halo_z(x, nx, ny, nz, gcx, gcy, gcz,
		0, 0, 1, bc.z_periodic, &mpi_req[8]);

	// deferred single k-plane on the bottom side //
	if (bc.p_bottom)
		matvec_omp(y, x, nx, ny, nz,
			ib, ie,
			jb, je,
			kb, kb,

			dx2i, dy2i,
			dzp2i, dzm2i);

	// deferred single k-plane on the top side //
	if (bc.p_top)
		matvec_omp(y, x, nx, ny, nz,
			ib, ie,
			jb, je,
			ke, ke,

			dx2i, dy2i,
			dzp2i, dzm2i);
}
+// ------------------------------------------------------------------------ //
+
+
// * [laplace + dp] for poisson equation with async exchanges * //
//
// y = A*x fused with the dot product (y,z), using the same
// communication-computation overlap scheme as laplace_omp: interior first,
// then deferred boundary planes as each halo exchange completes.
// NOTE: the returned dp is a partial sum — this thread's share of the
// local rank's domain (matvec_dp_omp distributes work via 'omp for');
// callers complete the omp/MPI reduction (see laplace_dp in pois-base3d.h).
template< typename T >
T poisson3d::laplace_dp_omp(T* _RESTRICT y,
	T* _RESTRICT x, const T* _RESTRICT const z,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const T dx2i, const T dy2i,
	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc)
{
	T dp = (T)0;
	MPI_Request mpi_req[12];	// 4 requests per axis (see pop calls below)

	// loop limits excluding ghost layers //
	const int ib = gcx, ie = nx - gcx - 1;
	const int jb = gcy, je = ny - gcy - 1;
	const int kb = gcz, ke = nz - gcz - 1;

	// fill local ghost layers with physical boundary conditions //
	put_bc_omp(x, nx, ny, nz, gcx, gcy, gcz,
		mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
		mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,

		bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);

	// make sure all threads finished b.c. writes before exchanges start //
#pragma omp barrier

	// post non-blocking cross halo exchanges, width 1 in each direction //
	mpi_com.push_exchange_cross_halo(x, nx, ny, nz, gcx, gcy, gcz,
		1, 1, 1, bc.x_periodic, bc.y_periodic, bc.z_periodic, mpi_req);

	// interior part: bc.p_* (0/1) shrinks each side that waits on MPI data //
	dp = matvec_dp_omp(y, x, z, nx, ny, nz,
		ib + bc.p_west, ie - bc.p_east,
		jb + bc.p_south, je - bc.p_north,
		kb + bc.p_bottom, ke - bc.p_top,

		dx2i, dy2i,
		dzp2i, dzm2i);

	// finalize -x cross exchanges //
	mpi_com.pop_exchange_halo_x(x, nx, ny, nz, gcx, gcy, gcz,
		1, 0, 0, bc.x_periodic, &mpi_req[0]);

	// deferred single i-plane on the west side //
	if (bc.p_west)
		dp += matvec_dp_omp(y, x, z, nx, ny, nz,
			ib, ib,
			jb + bc.p_south, je - bc.p_north,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

	// deferred single i-plane on the east side //
	if (bc.p_east)
		dp += matvec_dp_omp(y, x, z, nx, ny, nz,
			ie, ie,
			jb + bc.p_south, je - bc.p_north,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);


	// finalize -y cross exchanges //
	mpi_com.pop_exchange_halo_y(x, nx, ny, nz, gcx, gcy, gcz,
		0, 1, 0, bc.y_periodic, &mpi_req[4]);

	// deferred single j-plane on the south side //
	if (bc.p_south)
		dp += matvec_dp_omp(y, x, z, nx, ny, nz,
			ib, ie,
			jb, jb,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

	// deferred single j-plane on the north side //
	if (bc.p_north)
		dp += matvec_dp_omp(y, x, z, nx, ny, nz,
			ib, ie,
			je, je,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);


	// finalize -z cross exchanges //
	mpi_com.pop_exchange_halo_z(x, nx, ny, nz, gcx, gcy, gcz,
		0, 0, 1, bc.z_periodic, &mpi_req[8]);

	// deferred single k-plane on the bottom side //
	if (bc.p_bottom)
		dp += matvec_dp_omp(y, x, z, nx, ny, nz,
			ib, ie,
			jb, je,
			kb, kb,

			dx2i, dy2i,
			dzp2i, dzm2i);

	// deferred single k-plane on the top side //
	if (bc.p_top)
		dp += matvec_dp_omp(y, x, z, nx, ny, nz,
			ib, ie,
			jb, je,
			ke, ke,

			dx2i, dy2i,
			dzp2i, dzm2i);

	return dp;
}
+// ------------------------------------------------------------------------ //
+
// * [laplace-residual] for poisson equation with async exchanges * //
//
// y = rhs - A*x with the same communication-computation overlap scheme as
// laplace_omp: the halo-independent interior is processed while exchanges
// are in flight; each boundary plane is processed after its halo arrives.
// Runs inside an OpenMP parallel region (kernels use orphaned 'omp for').
template< typename T >
void poisson3d::laplace_residual_omp(T* _RESTRICT y,
	T* _RESTRICT x, const T* _RESTRICT const rhs,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const T dx2i, const T dy2i,
	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc)
{
	MPI_Request mpi_req[12];	// 4 requests per axis (see pop calls below)

	// loop limits excluding ghost layers //
	const int ib = gcx, ie = nx - gcx - 1;
	const int jb = gcy, je = ny - gcy - 1;
	const int kb = gcz, ke = nz - gcz - 1;

	// fill local ghost layers with physical boundary conditions //
	put_bc_omp(x, nx, ny, nz, gcx, gcy, gcz,
		mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
		mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,

		bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);

	// make sure all threads finished b.c. writes before exchanges start //
#pragma omp barrier

	// post non-blocking cross halo exchanges, width 1 in each direction //
	mpi_com.push_exchange_cross_halo(x, nx, ny, nz, gcx, gcy, gcz,
		1, 1, 1, bc.x_periodic, bc.y_periodic, bc.z_periodic, mpi_req);

	// interior part: bc.p_* (0/1) shrinks each side that waits on MPI data //
	resvec_omp(y, x, rhs, nx, ny, nz,
		ib + bc.p_west, ie - bc.p_east,
		jb + bc.p_south, je - bc.p_north,
		kb + bc.p_bottom, ke - bc.p_top,

		dx2i, dy2i,
		dzp2i, dzm2i);

	// finalize -x cross exchanges //
	mpi_com.pop_exchange_halo_x(x, nx, ny, nz, gcx, gcy, gcz,
		1, 0, 0, bc.x_periodic, &mpi_req[0]);

	// deferred single i-plane on the west side //
	if (bc.p_west)
		resvec_omp(y, x, rhs, nx, ny, nz,
			ib, ib,
			jb + bc.p_south, je - bc.p_north,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

	// deferred single i-plane on the east side //
	if (bc.p_east)
		resvec_omp(y, x, rhs, nx, ny, nz,
			ie, ie,
			jb + bc.p_south, je - bc.p_north,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);


	// finalize -y cross exchanges //
	mpi_com.pop_exchange_halo_y(x, nx, ny, nz, gcx, gcy, gcz,
		0, 1, 0, bc.y_periodic, &mpi_req[4]);

	// deferred single j-plane on the south side //
	if (bc.p_south)
		resvec_omp(y, x, rhs, nx, ny, nz,
			ib, ie,
			jb, jb,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

	// deferred single j-plane on the north side //
	if (bc.p_north)
		resvec_omp(y, x, rhs, nx, ny, nz,
			ib, ie,
			je, je,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);


	// finalize -z cross exchanges //
	mpi_com.pop_exchange_halo_z(x, nx, ny, nz, gcx, gcy, gcz,
		0, 0, 1, bc.z_periodic, &mpi_req[8]);

	// deferred single k-plane on the bottom side //
	if (bc.p_bottom)
		resvec_omp(y, x, rhs, nx, ny, nz,
			ib, ie,
			jb, je,
			kb, kb,

			dx2i, dy2i,
			dzp2i, dzm2i);

	// deferred single k-plane on the top side //
	if (bc.p_top)
		resvec_omp(y, x, rhs, nx, ny, nz,
			ib, ie,
			jb, je,
			ke, ke,

			dx2i, dy2i,
			dzp2i, dzm2i);
}
+// ------------------------------------------------------------------------ //
+
+
// Initialization //
// ------------------------------------------------------------------------ //
// Explicit template instantiations: the *_omp kernels are implemented in
// this translation unit, so the float and double versions used by other
// translation units are forced to be generated here.

// * initialize: matvec * //
template void poisson3d::matvec_omp(float* _RESTRICT y,
	const float* _RESTRICT const x,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i);
template void poisson3d::matvec_omp(double* _RESTRICT y,
	const double* _RESTRICT const x,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i);
// ------------------------------------------------------------------------ //

// * initialize: matvec & dot product (y,x) * //
template float poisson3d::matvec_dp_omp(float* _RESTRICT y,
	const float* _RESTRICT const x,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i);
template double poisson3d::matvec_dp_omp(double* _RESTRICT y,
	const double* _RESTRICT const x,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i);
// ------------------------------------------------------------------------ //

// * initialize: matvec & dot product (y,z) * //
template float poisson3d::matvec_dp_omp(float* _RESTRICT y,
	const float* _RESTRICT const x, const float* _RESTRICT const z,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i);
template double poisson3d::matvec_dp_omp(double* _RESTRICT y,
	const double* _RESTRICT const x, const double* _RESTRICT const z,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i);
// ------------------------------------------------------------------------ //

// * initialize: resvec * //
template void poisson3d::resvec_omp(float* _RESTRICT y,
	const float* _RESTRICT const x, const float* _RESTRICT const rhs,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i);
template void poisson3d::resvec_omp(double* _RESTRICT y,
	const double* _RESTRICT const x, const double* _RESTRICT const rhs,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i);
// ------------------------------------------------------------------------ //

// * initialize: diagonal inverse * //
template void poisson3d::set_diagonal_inverse_omp(float* _RESTRICT idg,
	const int nz,
	const int kb, const int ke,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i);
template void poisson3d::set_diagonal_inverse_omp(double* _RESTRICT idg,
	const int nz,
	const int kb, const int ke,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i);
// ------------------------------------------------------------------------ //

// * initialize: laplace operator * //
template void poisson3d::laplace_omp(float* _RESTRICT y,
	float* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
template void poisson3d::laplace_omp(double* _RESTRICT y,
	double* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
// ------------------------------------------------------------------------ //

// * initialize: laplace operator & dot product * //
template float poisson3d::laplace_dp_omp(float* _RESTRICT y,
	float* _RESTRICT x, const float* _RESTRICT const z,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
template double poisson3d::laplace_dp_omp(double* _RESTRICT y,
	double* _RESTRICT x, const double* _RESTRICT const z,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
// ------------------------------------------------------------------------ //

// * initialize: laplace residual * //
template void poisson3d::laplace_residual_omp(float* _RESTRICT y,
	float* _RESTRICT x, const float* _RESTRICT const rhs,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
template void poisson3d::laplace_residual_omp(double* _RESTRICT y,
	double* _RESTRICT x, const double* _RESTRICT const rhs,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
// ------------------------------------------------------------------------ //
diff --git a/pois-base3d.h b/pois-base3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0fc6c5bcf7cf6d0b8b3d3db729c832690a7c643
--- /dev/null
+++ b/pois-base3d.h
@@ -0,0 +1,450 @@
+#pragma once
+
+// [pois-base3d.h]: 3D Poisson[x2] basic operations
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "mpi-com3d.h"
+#include "pois-def3d.h"
+
+
namespace poisson3d
{
	// Serial-entry interface: each function dispatches to the matching
	// *_omp kernel, spawning an OpenMP parallel region only when the
	// caller is not already inside one (see implementations below).

	// * matvec * //
	template< typename T >	// [y = Ax]
	void matvec(T* _RESTRICT y,
		const T* _RESTRICT const x,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);

	template< typename T >	// [y = Ax], [(y,x)]
	T matvec_dp(T* _RESTRICT y,
		const T* _RESTRICT const x,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);

	template< typename T >	// [y = Ax], [(y,z)]
	T matvec_dp(T* _RESTRICT y,
		const T* _RESTRICT const x, const T* _RESTRICT const z,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
	// -------------------------------------------------------------------- //

	// * resvec * //
	template< typename T >	// [y = rhs - Ax]
	void resvec(T* _RESTRICT y,
		const T* _RESTRICT const x, const T* _RESTRICT const rhs,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
	// -------------------------------------------------------------------- //

	// * diagonal inverse * //
	template< typename T >
	void set_diagonal_inverse(T* _RESTRICT idg,		// -z dependence only
		const int nz,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
	// -------------------------------------------------------------------- //

	// * laplace * //
	template< typename T >	// [y = Ax]: includes MPI
	void laplace(T* _RESTRICT y,
		T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com,
		const nse::poisson_dynamic_bc& bc);

	template< typename T >	// [y = Ax], [(y,z)]: includes MPI
	T laplace_dp(T* _RESTRICT y,
		T* _RESTRICT x, const T* _RESTRICT const z,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com,
		const nse::poisson_dynamic_bc& bc);
	// -------------------------------------------------------------------- //

	// * laplace residual * //
	template< typename T >	// [y = rhs - Ax]: includes MPI
	void laplace_residual(T* _RESTRICT y,
		T* _RESTRICT x, const T* _RESTRICT const rhs,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com,
		const nse::poisson_dynamic_bc& bc);
	// -------------------------------------------------------------------- //
}
+
// OpenMP //
// Kernel interface: the *_omp implementations distribute work with
// orphaned 'omp for' directives, so they are intended to be called from
// inside an existing OpenMP parallel region (the serial-entry wrappers
// above take care of this).
namespace poisson3d
{
	// * matvec * //
	template< typename T >	// [y = Ax]
	void matvec_omp(T* _RESTRICT y,
		const T* _RESTRICT const x,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);

	template< typename T >	// [y = Ax], [(y,x)]
	T matvec_dp_omp(T* _RESTRICT y,
		const T* _RESTRICT const x,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);

	template< typename T >	// [y = Ax], [(y,z)]
	T matvec_dp_omp(T* _RESTRICT y,
		const T* _RESTRICT const x, const T* _RESTRICT const z,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
	// -------------------------------------------------------------------- //

	// * resvec * //
	template< typename T >	// [y = rhs - Ax]
	void resvec_omp(T* _RESTRICT y,
		const T* _RESTRICT const x, const T* _RESTRICT const rhs,
		const int nx, const int ny, const int nz,
		const int ib, const int ie,
		const int jb, const int je,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
	// -------------------------------------------------------------------- //

	// * diagonal inverse * //
	template< typename T >
	void set_diagonal_inverse_omp(T* _RESTRICT idg,		// -z dependence only
		const int nz,
		const int kb, const int ke,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
	// -------------------------------------------------------------------- //

	// * laplace * //
	template< typename T >	// [y = Ax]: includes MPI
	void laplace_omp(T* _RESTRICT y,
		T* _RESTRICT x,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com,
		const nse::poisson_dynamic_bc& bc);

	template< typename T >	// [y = Ax], [(y,z)]: includes MPI
	T laplace_dp_omp(T* _RESTRICT y,
		T* _RESTRICT x, const T* _RESTRICT const z,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com,
		const nse::poisson_dynamic_bc& bc);
	// -------------------------------------------------------------------- //

	// * laplace residual * //
	template< typename T >	// [y = rhs - Ax]: includes MPI
	void laplace_residual_omp(T* _RESTRICT y,
		T* _RESTRICT x, const T* _RESTRICT const rhs,
		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com,
		const nse::poisson_dynamic_bc& bc);
	// -------------------------------------------------------------------- //
}
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+template< typename T >
+inline void poisson3d::matvec(	// [y = Ax]
+	T* _RESTRICT y,
+	const T* _RESTRICT const x,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	if (omp_in_parallel()) {
+		matvec_omp(y, x, nx, ny, nz, ib, ie, jb, je, kb, ke,
+			dx2i, dy2i, dzp2i, dzm2i);
+	}
+	else
+	{
+#pragma omp parallel shared( y ) 
+		{
+			matvec_omp(y, x, nx, ny, nz, ib, ie, jb, je, kb, ke,
+				dx2i, dy2i, dzp2i, dzm2i);
+		}
+	}
+}
+
template< typename T >
inline T poisson3d::matvec_dp(	// [y = Ax], [(y,x)]
	T* _RESTRICT y,
	const T* _RESTRICT const x,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const T dx2i, const T dy2i,
	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
{
	T dp;

	if (omp_in_parallel()) {
		// already inside a parallel team: return the value produced for
		// this thread; the caller handles any further reduction
		dp = matvec_dp_omp(y, x, nx, ny, nz, ib, ie, jb, je, kb, ke,
			dx2i, dy2i, dzp2i, dzm2i);
	}
	else
	{
		dp = (T)0;
		// reduction(+) gives every thread a zero-initialized private 'dp';
		// assigning the thread's partial here is therefore equivalent to
		// accumulating, and the clause sums the partials on region exit
#pragma omp parallel shared( y ) reduction( + : dp)
		{
			dp = matvec_dp_omp(y, x, nx, ny, nz, ib, ie, jb, je, kb, ke,
				dx2i, dy2i, dzp2i, dzm2i);
		}
	}

	return dp;
}
+
template< typename T >
inline T poisson3d::matvec_dp(	// [y = Ax], [(y,z)]
	T* _RESTRICT y,
	const T* _RESTRICT const x, const T* _RESTRICT const z,
	const int nx, const int ny, const int nz,
	const int ib, const int ie,
	const int jb, const int je,
	const int kb, const int ke,

	const T dx2i, const T dy2i,
	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
{
	T dp;

	if (omp_in_parallel()) {
		// already inside a parallel team: return the value produced for
		// this thread; the caller handles any further reduction
		dp = matvec_dp_omp(y, x, z, nx, ny, nz, ib, ie, jb, je, kb, ke,
			dx2i, dy2i, dzp2i, dzm2i);
	}
	else
	{
		dp = (T)0;
		// reduction(+) gives every thread a zero-initialized private 'dp';
		// assigning the thread's partial here is therefore equivalent to
		// accumulating, and the clause sums the partials on region exit
#pragma omp parallel shared( y ) reduction( + : dp)
		{
			dp = matvec_dp_omp(y, x, z, nx, ny, nz, ib, ie, jb, je, kb, ke,
				dx2i, dy2i, dzp2i, dzm2i);
		}
	}

	return dp;
}
+
+template< typename T >
+inline void poisson3d::resvec(	// [y = rhs - Ax]
+	T* _RESTRICT y,
+	const T* _RESTRICT const x, const T* _RESTRICT const rhs,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	if (omp_in_parallel()) {
+		resvec_omp(y, x, rhs, nx, ny, nz, ib, ie, jb, je, kb, ke,
+			dx2i, dy2i, dzp2i, dzm2i);
+	}
+	else
+	{
+#pragma omp parallel shared( y ) 
+		{
+			resvec_omp(y, x, rhs, nx, ny, nz, ib, ie, jb, je, kb, ke,
+				dx2i, dy2i, dzp2i, dzm2i);
+		}
+	}
+}
+
+template< typename T >
+void poisson3d::set_diagonal_inverse(	// diagonal inverse, -z dependence only
+	T* _RESTRICT idg,
+	const int nz,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	if (omp_in_parallel()) {
+		set_diagonal_inverse_omp(idg, nz, kb, ke,
+			dx2i, dy2i, dzp2i, dzm2i);
+	}
+	else
+	{
+#pragma omp parallel shared( idg )
+		{
+			set_diagonal_inverse_omp(idg, nz, kb, ke,
+				dx2i, dy2i, dzp2i, dzm2i);
+		}
+	}
+}
+
+template< typename T >	// [y = Ax]: includes MPI
+inline void poisson3d::laplace(T* _RESTRICT y,
+	T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc)
+{
+	if (omp_in_parallel()) {
+		laplace_omp(y, x, nx, ny, nz, gcx, gcy, gcz,
+			dx2i, dy2i, dzp2i, dzm2i,
+
+			mpi_com, bc);
+	}
+	else
+	{
+#pragma omp parallel shared( y, x ) 
+		{
+			laplace_omp(y, x, nx, ny, nz, gcx, gcy, gcz,
+				dx2i, dy2i, dzp2i, dzm2i,
+
+				mpi_com, bc);
+		}
+	}
+}
+
template< typename T >	// [y = Ax], [(y,z)]: includes MPI
inline T poisson3d::laplace_dp(T* _RESTRICT y,
	T* _RESTRICT x, const T* _RESTRICT const z,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const T dx2i, const T dy2i,
	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc)
{
	// Computes y = A*x together with the dot product (y,z).
	// When called from serial code the full reduction is performed here:
	// per-thread partials via reduction(+), then a cross-rank sum via
	// mpi_allreduce.
	// NOTE(review): when already inside a parallel region the per-thread
	// partial is returned as-is, with no omp or MPI reduction — the caller
	// is apparently expected to complete it; confirm at call sites.
	T dp;

	if (omp_in_parallel()) {
		dp = laplace_dp_omp(y, x, z, nx, ny, nz, gcx, gcy, gcz,
			dx2i, dy2i, dzp2i, dzm2i,

			mpi_com, bc);
	}
	else
	{
		dp = (T)0;
		// reduction(+) gives every thread a zero-initialized private 'dp';
		// assigning the thread's partial is equivalent to accumulation
#pragma omp parallel shared( y, x ) reduction( + : dp)
		{
			dp = laplace_dp_omp(y, x, z, nx, ny, nz, gcx, gcy, gcz,
				dx2i, dy2i, dzp2i, dzm2i,

				mpi_com, bc);
		}

		// sum local dot products over all MPI ranks //
		nse::mpi_allreduce(&dp, MPI_SUM, mpi_com.comm);
	}

	return dp;
}
+
+template< typename T >	// [y = rhs - Ax]: includes MPI
+inline void poisson3d::laplace_residual(T* _RESTRICT y,
+	T* _RESTRICT x, const T* _RESTRICT const rhs,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc)
+{
+	if (omp_in_parallel()) {
+		laplace_residual_omp(y, x, rhs, nx, ny, nz, gcx, gcy, gcz,
+			dx2i, dy2i, dzp2i, dzm2i,
+
+			mpi_com, bc);
+	}
+	else
+	{
+#pragma omp parallel shared( y, x ) 
+		{
+			laplace_residual_omp(y, x, rhs, nx, ny, nz, gcx, gcy, gcz,
+				dx2i, dy2i, dzp2i, dzm2i,
+
+				mpi_com, bc);
+		}
+	}
+}
diff --git a/pois-bc3d.cpp b/pois-bc3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..59c7da107d318c1a7ea962990e3b27714d3a10ee
--- /dev/null
+++ b/pois-bc3d.cpp
@@ -0,0 +1,311 @@
+#include "pois-bc3d.h"
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+
// * Boundary Conditions * //
//
// Fills the first ghost layer of x on every non-periodic domain face with
// the physical boundary condition of the Poisson problem:
//   - "*Ext" (outflow) faces: odd extension, x[ghost] = -x[interior],
//     which implies a zero value at the face itself;
//   - all other faces: even extension, x[ghost] = x[interior] (Neumann).
// Runs inside an OpenMP parallel region (orphaned 'omp for' work-sharing);
// all loops use 'nowait', so the caller synchronizes afterwards.
template< typename T >
void poisson3d::put_bc_omp( // poisson boundary conditions
	T* _RESTRICT x,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const int rank_x, const int rank_y, const int rank_z,
	const int size_x, const int size_y, const int size_z,

	const nse::poissonTypeBC bc_type,
	const int x_periodic, const int y_periodic, const int z_periodic)
{
	// tangential loop limits per face: start at 0 (ghost cells included)
	// on sides that continue onto a neighbor rank or wrap periodically,
	// otherwise clipped by the ghost-layer width (see mappings below)
	const int ib = ((rank_x > 0) || ((rank_x == 0) && (x_periodic))) ? 0 : gcx;
	const int ie = ((rank_x < size_x - 1) || ((rank_x == size_x - 1) && (x_periodic))) ? nx : nx - gcx;
	const int jb = ((rank_y > 0) || ((rank_y == 0) && (y_periodic))) ? 0 : gcy;
	const int je = ((rank_y < size_y - 1) || ((rank_y == size_y - 1) && (y_periodic))) ? ny : ny - gcy;
	const int kb = ((rank_z > 0) || ((rank_z == 0) && (z_periodic))) ? 0 : gcz;
	const int ke = ((rank_z < size_z - 1) || ((rank_z == size_z - 1) && (z_periodic))) ? nz : nz - gcz;

	const int nyz = ny * nz;	// -x stride in the flattened array
	int i, j, k, idx;

	// west boundary condition //
	if ((rank_x == 0) && (!x_periodic))
	{
		// x = 0, y[z] = 0 --> j[k] = gcy[z] to ny[z]
		// x = 0, y[z] = size_y[size_z] - 1 --> j[k] = 0 to ny[z] - gcy[z]
		// x = 0, 0 < y[z] < size_y[size_z] - 1 --> j[k] = 0 to ny[z]

		if (bc_type == nse::westExt) {  // west outflow //
			// ghost plane i = gcx - 1, odd extension of plane i = gcx
#pragma omp for nowait
			for (j = jb; j < je; j++) {
				idx = (gcx - 1) * nyz + j * nz;
				for (k = kb; k < ke; k++)
					x[idx + k] = -x[idx + nyz + k];
			}
		}
		else	// neumann //
		{
#pragma omp for nowait
			for (j = jb; j < je; j++) {
				idx = (gcx - 1) * nyz + j * nz;
				for (k = kb; k < ke; k++)
					x[idx + k] = x[idx + nyz + k];
			}
		}
	}

	// east boundary condition //
	if ((rank_x == size_x - 1) && (!x_periodic))
	{
		if ((bc_type == nse::eastExt) ||
			(bc_type == nse::periodicY_eastExt)) {  // east outflow //
			// ghost plane i = nx - gcx, odd extension of plane i = nx - gcx - 1
#pragma omp for nowait
			for (j = jb; j < je; j++) {
				idx = (nx - gcx) * nyz + j * nz;
				for (k = kb; k < ke; k++)
					x[idx + k] = -x[idx - nyz + k];
			}
		}
		else	// neumann //
		{
#pragma omp for nowait
			for (j = jb; j < je; j++) {
				idx = (nx - gcx) * nyz + j * nz;
				for (k = kb; k < ke; k++)
					x[idx + k] = x[idx - nyz + k];
			}
		}
	}

	// south boundary conditions //
	if ((rank_y == 0) && (!y_periodic))
	{
		// x[z] = 0, y = 0 --> i[k] = gcx[z] to nx[z]
		// x[z] = size_x[size_z] - 1, y = 0 --> i[k] = 0 to nx[z] - gcx[z]
		// 0 < x[z] < size_x[size_z] - 1, y = 0 --> i[k] = 0 to nx[z]

		if (bc_type == nse::southExt) { // south outflow //
			// ghost plane j = gcy - 1, odd extension of plane j = gcy
#pragma omp for nowait
			for (i = ib; i < ie; i++) {
				idx = i * nyz + (gcy - 1) * nz;
				for (k = kb; k < ke; k++)
					x[idx + k] = -x[idx + nz + k];
			}
		}
		else	// neumann //
		{
#pragma omp for nowait
			for (i = ib; i < ie; i++) {
				idx = i * nyz + (gcy - 1) * nz;
				for (k = kb; k < ke; k++)
					x[idx + k] = x[idx + nz + k];
			}
		}
	}

	// north boundary conditions //
	if ((rank_y == size_y - 1) && (!y_periodic))
	{
		if (bc_type == nse::northExt) { // north outflow //
			// ghost plane j = ny - gcy, odd extension of plane j = ny - gcy - 1
#pragma omp for nowait
			for (i = ib; i < ie; i++) {
				idx = i * nyz + (ny - gcy) * nz;
				for (k = kb; k < ke; k++)
					x[idx + k] = -x[idx - nz + k];
			}
		}
		else	// neumann //
		{
#pragma omp for nowait
			for (i = ib; i < ie; i++) {
				idx = i * nyz + (ny - gcy) * nz;
				for (k = kb; k < ke; k++)
					x[idx + k] = x[idx - nz + k];
			}
		}
	}

	// bottom boundary conditions //
	if ((rank_z == 0) && (!z_periodic))
	{
		// x[y] = 0, z = 0 - > i[j] = gcx[y] to nx[y]
		// x[y] = size_x[size_y] - 1, z = 0 --> i[j] = 0 to nx[y] - gcx[y]
		// 0 < x[y] < size_x[size_y] - 1, z = 0 --> i[j] = 0 to nx[y]

		if (bc_type == nse::bottomExt) { // bottom outflow //
			// ghost plane k = gcz - 1, odd extension of plane k = gcz
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
			for (i = ib; i < ie; i++) {
				for (j = jb; j < je; j++) {
					idx = i * nyz + j * nz + gcz - 1;
					x[idx] = -x[idx + 1];
				}
			}
#else
#pragma omp for nowait
			for (i = ib; i < ie; i++) {
				idx = i * nyz + jb * nz + gcz - 1;
				for (j = jb; j < je; j++, idx += nz)
					x[idx] = -x[idx + 1];
			}
#endif
		}
		else	// neumann //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
			for (i = ib; i < ie; i++) {
				for (j = jb; j < je; j++) {
					idx = i * nyz + j * nz + gcz - 1;
					x[idx] = x[idx + 1];
				}
			}
#else
#pragma omp for nowait
			for (i = ib; i < ie; i++) {
				idx = i * nyz + jb * nz + gcz - 1;
				for (j = jb; j < je; j++, idx += nz) {
					x[idx] = x[idx + 1];
				}
			}
#endif
		}
	}

	// top boundary conditions //
	if ((rank_z == size_z - 1) && (!z_periodic))
	{
		if (bc_type == nse::topExt) { // top outflow //
			// ghost plane k = nz - gcz, odd extension of plane k = nz - gcz - 1
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
			for (i = ib; i < ie; i++) {
				for (j = jb; j < je; j++) {
					idx = i * nyz + j * nz + nz - gcz;
					x[idx] = -x[idx - 1];
				}
			}
#else
#pragma omp for nowait
			for (i = ib; i < ie; i++) {
				idx = i * nyz + jb * nz + nz - gcz;
				for (j = jb; j < je; j++, idx += nz)
					x[idx] = -x[idx - 1];
			}
#endif
		}
		else	// neumann //
		{
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
			for (i = ib; i < ie; i++) {
				for (j = jb; j < je; j++) {
					idx = i * nyz + j * nz + nz - gcz;
					x[idx] = x[idx - 1];
				}
			}
#else
#pragma omp for nowait
			for (i = ib; i < ie; i++) {
				idx = i * nyz + jb * nz + nz - gcz;
				for (j = jb; j < je; j++, idx += nz)
					x[idx] = x[idx - 1];
			}
#endif
		}
	}

#ifdef _POIS3D_BC_DIRICHLET_POINT	// special handler for resolving pure neumann system //
	// pin a single ghost cell with an odd extension so the otherwise
	// singular (pure Neumann/periodic) system gets a unique solution;
	// only the rank owning the domain corner applies it, master thread only
	if ((rank_x == 0) && (rank_y == 0) && (rank_z == 0)) {

		// wait for the face loops above (all 'nowait') before overwriting //
#pragma omp barrier

		// special handler for -y[yz] periodic bc //
		if ((!x_periodic) && (
			(bc_type == nse::periodicY) ||
			(bc_type == nse::periodicYZ)))
		{
			idx = (gcx - 1) * nyz + gcy * nz + gcz;
#pragma omp master
			x[idx] = -x[idx + nyz];
		}
		// special handler for -z[xz] periodic bc //
		if ((!y_periodic) && (
			(bc_type == nse::periodicZ) ||
			(bc_type == nse::periodicXZ)))
		{
			idx = gcx * nyz + (gcy - 1) * nz + gcz;
#pragma omp master
			x[idx] = -x[idx + nz];
		}
		// special handler for -x[xy] periodic bc //
		if ((!z_periodic) && (
			(bc_type == nse::periodicX) ||
			(bc_type == nse::periodicXY)))
		{
			idx = gcx * nyz + gcy * nz + (gcz - 1);
#pragma omp master
			x[idx] = -x[idx + 1];
		}
	}
#endif

}
+
+template< typename T >
+void poisson3d::put_exch_bc_omp( // poisson boundary conditions
+	T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc)
+{
+	// step 1: apply local boundary conditions to the ghost cells //
+	put_bc_omp(x, nx, ny, nz, gcx, gcy, gcz,
+		mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
+		mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,
+		bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);
+
+	// step 2: sync the team before communicating ghost layers //
+#pragma omp barrier
+
+	// step 3: MPI cross exchange of the width-1 halo,
+	//         forwarding the periodicity flags //
+	mpi_com.exchange_cross_halo(x, nx, ny, nz, gcx, gcy, gcz,
+		1, 1, 1, bc.x_periodic, bc.y_periodic, bc.z_periodic);
+}
+// ------------------------------------------------------------------------ //
+
+
+// Initialization //
+// ------------------------------------------------------------------------ //
+
+
+// * initialize: boundary conditions * //
+// single-precision instantiation //
+template void poisson3d::put_bc_omp(float* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const int rank_x, const int rank_y, const int rank_z,
+	const int size_x, const int size_y, const int size_z,
+
+	const nse::poissonTypeBC bc_type,
+	const int x_periodic, const int y_periodic, const int z_periodic);
+// double-precision instantiation //
+template void poisson3d::put_bc_omp(double* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const int rank_x, const int rank_y, const int rank_z,
+	const int size_x, const int size_y, const int size_z,
+
+	const nse::poissonTypeBC bc_type,
+	const int x_periodic, const int y_periodic, const int z_periodic);
+
+// single-precision instantiation //
+template void poisson3d::put_exch_bc_omp(float* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc);
+// double-precision instantiation //
+template void poisson3d::put_exch_bc_omp(double* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc);
+// ------------------------------------------------------------------------ //
diff --git a/pois-bc3d.h b/pois-bc3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed2ff318fe0390f3921156fbacba7b0ac9a07c40
--- /dev/null
+++ b/pois-bc3d.h
@@ -0,0 +1,115 @@
+#pragma once
+
+// [pois-bc3d.h]: 3D Poisson[x2] boundary conditions
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "mpi-com3d.h"
+#include "pois-def3d.h"
+
+
+namespace poisson3d
+{
+	// * boundary conditions * //
+	// dispatchers: run the *_omp worker directly when already inside an
+	// OpenMP parallel region, otherwise open a region first (see the
+	// inline implementations at the bottom of this header) //
+	template< typename T >          // pure BC //
+	void put_bc(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const int rank_x, const int rank_y, const int rank_z,
+		const int size_x, const int size_y, const int size_z,
+
+		const nse::poissonTypeBC bc_type,
+		const int x_periodic, const int y_periodic, const int z_periodic);
+
+	template< typename T >  // BC and MPI cross exchanges //
+	void put_exch_bc(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const nse::mpiCom3d& mpi_com,
+		const nse::poisson_dynamic_bc& bc);
+	// -------------------------------------------------------------------- //
+}
+
+// OpenMP //
+namespace poisson3d
+{
+	// * boundary conditions * //
+	// OpenMP workers: contain orphaned worksharing constructs and are
+	// intended to be called from inside a parallel region
+	// (the dispatchers above handle this automatically) //
+	template< typename T >          // pure BC //
+	void put_bc_omp(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const int rank_x, const int rank_y, const int rank_z,
+		const int size_x, const int size_y, const int size_z,
+
+		const nse::poissonTypeBC bc_type,
+		const int x_periodic, const int y_periodic, const int z_periodic);
+
+	template< typename T >  // BC and MPI cross exchanges //
+	void put_exch_bc_omp(T* _RESTRICT x,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const nse::mpiCom3d& mpi_com,
+		const nse::poisson_dynamic_bc& bc);
+	// -------------------------------------------------------------------- //
+}
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+template< typename T >
+inline void poisson3d::put_bc( // poisson boundary conditions
+	T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const int rank_x, const int rank_y, const int rank_z,
+	const int size_x, const int size_y, const int size_z,
+
+	const nse::poissonTypeBC bc_type,
+	const int x_periodic, const int y_periodic, const int z_periodic)
+{
+	// not yet parallel: open a region and let the team apply the BC //
+	if (!omp_in_parallel())
+	{
+#pragma omp parallel shared( x )
+		{
+			put_bc_omp(x, nx, ny, nz, gcx, gcy, gcz,
+				rank_x, rank_y, rank_z, size_x, size_y, size_z,
+				bc_type, x_periodic, y_periodic, z_periodic);
+		}
+	}
+	else {	// already inside a parallel region: run on the current team //
+		put_bc_omp(x, nx, ny, nz, gcx, gcy, gcz,
+			rank_x, rank_y, rank_z, size_x, size_y, size_z,
+			bc_type, x_periodic, y_periodic, z_periodic);
+	}
+}
+
+template< typename T >
+inline void poisson3d::put_exch_bc(	// poisson boundary conditions and MPI cross exchange
+	T* _RESTRICT x,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc)
+{
+	// not yet parallel: spawn a team for the combined BC + exchange worker //
+	if (!omp_in_parallel())
+	{
+#pragma omp parallel shared( x )
+		{
+			put_exch_bc_omp(x, nx, ny, nz, gcx, gcy, gcz, mpi_com, bc);
+		}
+	}
+	else {	// already parallel: run on the current team //
+		put_exch_bc_omp(x, nx, ny, nz, gcx, gcy, gcz, mpi_com, bc);
+	}
+}
diff --git a/pois-def3d.h b/pois-def3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..bcd72d478d20bc6a270b0ad5c30cfc1a12738796
--- /dev/null
+++ b/pois-def3d.h
@@ -0,0 +1,38 @@
+#pragma once
+
+// [pois-def3d.h]: 3D Poisson solver definitions & boundary conditions constants
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "pois-setup3d.h"
+
+// -------------------------------------------------------------------- //
+
+
+#ifdef USE_EXPLICIT_SSE
+//#define _POIS3D_USE_SSE				// use sse calls //
+#endif
+
+#define _POIS3D_SKIRMISH_NORM			// use additional norm check for skirmish residual //
+
+#define _POIS3D_LAPLACE_WITH_DP			// combine laplace operator with dot products //
+#define _POIS3D_COMBINE_UPDATES			// combine updates for solution and residual in single OpenMP sweep //
+
+#define _POIS3D_EXCLUDE_XGHOST_VEC		// exclude -x ghost cells from vector computations //
+// accounts for data alignment //
+
+//#define _POIS3D_BC_DIRICHLET_POINT    // dirichlet point with periodic boundary conditions //
+// when enabled, put_bc_omp() pins one near-corner cell to resolve the
+// null space of a pure-neumann/periodic system //
+
+// -------------------------------------------------------------------- //
+
+// * poisson solver MPI async setup * //
+namespace poisson3d {
+
+	// -I domain partition for MPI-async exchanges of full halo //
+	const int c_i_partition_async_halo = 2;
+	const int c_i_partition_min_step = 2;
+
+	// problems with at most this many interior cells take the synchronous
+	// MPI-exchange path (see gs_start_omp / gs_run_omp) //
+	const int c_small_poisson_size = 64;	// 64 // // small problem size - use sync-MPI exchanges //
+}
+// -------------------------------------------------------------------- //
diff --git a/pois-gs-base3d.h b/pois-gs-base3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c8dc111f74d35f8b53076ae872f5708cf446a52
--- /dev/null
+++ b/pois-gs-base3d.h
@@ -0,0 +1,502 @@
+#pragma once
+
+// [pois-gs-base3d.h]: 3D Poisson Gauss-Seidel base components
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+
+
+namespace poisson3d
+{
+	// Conventions (see implementations below):
+	//  - index ranges [ib,ie], [jb,je], [kb,ke] are inclusive;
+	//  - "color" selects red/black cells by the parity of (i + j + k + color);
+	//  - idg is a per-k diagonal scaling, applied as x = idg[k] * (...);
+	//  - dx2i, dy2i and the per-k arrays dzp2i/dzm2i are stencil coefficients.
+
+	// * GS init * //
+	template< typename T >
+	void gs_init(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke);
+
+	// * GS color cycle * //
+	template< typename T >
+	void gs_cycle(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
+	// -------------------------------------------------------------------- //
+
+
+	// * GS init [Halo] * //
+	// p_* arguments are the per-side halo strip widths (0 disables a side) //
+	template< typename T >
+	void gs_init_halo(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const int p_west, const int p_east,
+		const int p_south, const int p_north,
+		const int p_bottom, const int p_top);
+
+	// * GS color cycle [Halo] * //
+	template< typename T >
+	void gs_cycle_halo(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const int p_west, const int p_east,
+		const int p_south, const int p_north,
+		const int p_bottom, const int p_top,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
+	// -------------------------------------------------------------------- //
+}
+
+// OpenMP //
+namespace poisson3d
+{
+	// OpenMP workers: each contains an orphaned "omp for" and must be
+	// called from inside a parallel region (the dispatchers above do
+	// this automatically when needed) //
+
+	// * GS init * //
+	template< typename T >
+	void gs_init_omp(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke);
+
+	// * GS color cycle * //
+	template< typename T >
+	void gs_cycle_omp(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
+	// -------------------------------------------------------------------- //
+
+
+	// * GS init [Halo] * //
+	template< typename T >
+	void gs_init_halo_omp(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const int p_west, const int p_east,
+		const int p_south, const int p_north,
+		const int p_bottom, const int p_top);
+
+	// * GS color cycle [Halo] * //
+	template< typename T >
+	void gs_cycle_halo_omp(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const int p_west, const int p_east,
+		const int p_south, const int p_north,
+		const int p_bottom, const int p_top,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
+	// -------------------------------------------------------------------- //
+
+}
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+template< typename T >
+inline void poisson3d::gs_init(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke)
+{
+	// dispatcher: guarantee the worker executes on an OpenMP team //
+	if (!omp_in_parallel())
+	{
+#pragma omp parallel shared( x )
+		{
+			gs_init_omp(x, rhs, idg, color, nx, ny, nz,
+				ib, ie, jb, je, kb, ke);
+		}
+	}
+	else {	// already parallel //
+		gs_init_omp(x, rhs, idg, color, nx, ny, nz,
+			ib, ie, jb, je, kb, ke);
+	}
+}
+// -------------------------------------------------------------------- //
+
+template< typename T >
+inline void poisson3d::gs_cycle(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	// dispatcher: guarantee the worker executes on an OpenMP team //
+	if (!omp_in_parallel())
+	{
+#pragma omp parallel shared( x )
+		{
+			gs_cycle_omp(x, rhs, idg, color, nx, ny, nz,
+				ib, ie, jb, je, kb, ke, dx2i, dy2i, dzp2i, dzm2i);
+		}
+	}
+	else {	// already parallel //
+		gs_cycle_omp(x, rhs, idg, color, nx, ny, nz,
+			ib, ie, jb, je, kb, ke, dx2i, dy2i, dzp2i, dzm2i);
+	}
+}
+// -------------------------------------------------------------------- //
+
+template< typename T >
+inline void poisson3d::gs_init_halo(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const int p_west, const int p_east,
+	const int p_south, const int p_north,
+	const int p_bottom, const int p_top)
+{
+	// dispatcher: guarantee the worker executes on an OpenMP team //
+	if (!omp_in_parallel())
+	{
+#pragma omp parallel shared( x )
+		{
+			gs_init_halo_omp(x, rhs, idg, color, nx, ny, nz,
+				ib, ie, jb, je, kb, ke,
+				p_west, p_east, p_south, p_north, p_bottom, p_top);
+		}
+	}
+	else {	// already parallel //
+		gs_init_halo_omp(x, rhs, idg, color, nx, ny, nz,
+			ib, ie, jb, je, kb, ke,
+			p_west, p_east, p_south, p_north, p_bottom, p_top);
+	}
+}
+// -------------------------------------------------------------------- //
+
+
+template< typename T >
+inline void poisson3d::gs_cycle_halo(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const int p_west, const int p_east,
+	const int p_south, const int p_north,
+	const int p_bottom, const int p_top,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	// dispatcher: guarantee the worker executes on an OpenMP team //
+	if (!omp_in_parallel())
+	{
+#pragma omp parallel shared( x )
+		{
+			gs_cycle_halo_omp(x, rhs, idg, color, nx, ny, nz,
+				ib, ie, jb, je, kb, ke,
+				p_west, p_east, p_south, p_north, p_bottom, p_top,
+				dx2i, dy2i, dzp2i, dzm2i);
+		}
+	}
+	else {	// already parallel //
+		gs_cycle_halo_omp(x, rhs, idg, color, nx, ny, nz,
+			ib, ie, jb, je, kb, ke,
+			p_west, p_east, p_south, p_north, p_bottom, p_top,
+			dx2i, dy2i, dzp2i, dzm2i);
+	}
+}
+// -------------------------------------------------------------------- //
+
+
+// * OpenMP versions * // 
+// -------------------------------------------------------------------- //
+template< typename T >
+inline void poisson3d::gs_init_omp(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke)
+{
+	// Gauss-Seidel "init" sweep over cells of the given red-black color:
+	//   x[i,j,k] = idg[k] * rhs[i,j,k]
+	// i.e. a relaxation step with all neighbor contributions taken as zero.
+	// Index ranges [ib,ie] x [jb,je] x [kb,ke] are inclusive; a cell belongs
+	// to the sweep when (i + j + k + color) is even.
+	// Contains an orphaned "omp for": call from inside a parallel region
+	// (the gs_init() dispatcher handles this).
+	const int nyz = ny * nz;
+	int i, j, k, idx, shc;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;	// row base index, advanced by nz per j-iteration //
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+	for (i = ib; i <= ie; i++)
+	{
+		for (j = jb; j <= je; j++)
+		{
+			shc = ((i + j + kb + color) & 1);	// k-offset of first colored cell in row //
+			idx = i * nyz + j * nz + kb + shc;
+#else
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++)
+	{
+		shidx = i * nyz + jb * nz + kb;
+		shc = ((i + jb + kb + color) & 1);
+		for (j = jb; j <= je; j++, shidx += nz)
+		{
+			idx = shidx + shc;
+#endif
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+			for (k = kb + shc; k <= ke; k += 2, idx += 2) {
+				x[idx] = idg[k] * rhs[idx];
+			}
+#ifndef USE_OPENMP_2D_CYCLE
+			shc = !shc;	// color offset alternates with each row //
+#endif
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+
+template< typename T >
+inline void poisson3d::gs_cycle_omp(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	// one colored Gauss-Seidel relaxation sweep:
+	//   x[idx] = idg[k] * (rhs[idx] - offdiag(x))
+	// 7-point stencil: +/-nyz neighbors (x-direction, weight dx2i),
+	// +/-nz (y-direction, weight dy2i), +/-1 (z-direction, per-k weights
+	// dzp2i[k] / dzm2i[k]). Cells of one color only touch neighbors of the
+	// other color, so the sweep is data-race free across threads.
+	// Contains an orphaned "omp for": call from inside a parallel region.
+	const int nyz = ny * nz;
+	int i, j, k, idx, shc;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+	for (i = ib; i <= ie; i++)
+	{
+		for (j = jb; j <= je; j++)
+		{
+			// k-offset of the first cell of the requested color in this row //
+			shc = ((i + j + kb + color) & 1);
+			idx = i * nyz + j * nz + kb + shc;
+#else
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++)
+	{
+		shidx = i * nyz + jb * nz + kb;
+		shc = ((i + jb + kb + color) & 1);
+		for (j = jb; j <= je; j++, shidx += nz)
+		{
+			idx = shidx + shc;
+#endif
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+			// stride-2 over k: only cells of the selected color //
+			for (k = kb + shc; k <= ke; k += 2, idx += 2) {
+				x[idx] = idg[k] * (rhs[idx] - (
+					(x[idx + nyz] + x[idx - nyz]) * dx2i +
+					(x[idx + nz] + x[idx - nz]) * dy2i +
+					(x[idx + 1] * dzp2i[k] + x[idx - 1] * dzm2i[k])));
+			}
+#ifndef USE_OPENMP_2D_CYCLE
+			shc = !shc;
+#endif
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+template< typename T >
+inline void poisson3d::gs_init_halo_omp(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const int p_west, const int p_east,
+	const int p_south, const int p_north,
+	const int p_bottom, const int p_top)
+{
+	// run the init sweep on halo strips around the [ib,ie]x[jb,je]x[kb,ke]
+	// core; p_* are the per-side strip widths (0 disables a side).
+	// West/east strips span the extended -y and -z ranges, south/north
+	// extend in -z only, bottom/top cover the core -x,-y range, so the six
+	// strips are mutually disjoint.
+
+	// west strip //
+	if (p_west) {
+		gs_init_omp(x, rhs, idg, color, nx, ny, nz,
+			ib - p_west, ib - 1,
+			jb - p_south, je + p_north,
+			kb - p_bottom, ke + p_top);
+	}
+	// east strip //
+	if (p_east) {
+		gs_init_omp(x, rhs, idg, color, nx, ny, nz,
+			ie + 1, ie + p_east,
+			jb - p_south, je + p_north,
+			kb - p_bottom, ke + p_top);
+	}
+	// south strip //
+	if (p_south) {
+		gs_init_omp(x, rhs, idg, color, nx, ny, nz,
+			ib, ie,
+			jb - p_south, jb - 1,
+			kb - p_bottom, ke + p_top);
+	}
+	// north strip //
+	if (p_north) {
+		gs_init_omp(x, rhs, idg, color, nx, ny, nz,
+			ib, ie,
+			je + 1, je + p_north,
+			kb - p_bottom, ke + p_top);
+	}
+	// bottom strip //
+	if (p_bottom) {
+		gs_init_omp(x, rhs, idg, color, nx, ny, nz,
+			ib, ie,
+			jb, je,
+			kb - p_bottom, kb - 1);
+	}
+	// top strip //
+	if (p_top) {
+		gs_init_omp(x, rhs, idg, color, nx, ny, nz,
+			ib, ie,
+			jb, je,
+			ke + 1, ke + p_top);
+	}
+}
+// -------------------------------------------------------------------- //
+
+
+template< typename T >
+inline void poisson3d::gs_cycle_halo_omp(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const int p_west, const int p_east,
+	const int p_south, const int p_north,
+	const int p_bottom, const int p_top,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	// run the colored relaxation sweep on halo strips around the
+	// [ib,ie]x[jb,je]x[kb,ke] core; p_* are per-side strip widths
+	// (0 disables a side). Strip layout mirrors gs_init_halo_omp:
+	// the six regions are mutually disjoint.
+
+	// west strip //
+	if (p_west) {
+		gs_cycle_omp(x, rhs, idg, color, nx, ny, nz,
+			ib - p_west, ib - 1,
+			jb - p_south, je + p_north,
+			kb - p_bottom, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// east strip //
+	if (p_east) {
+		gs_cycle_omp(x, rhs, idg, color, nx, ny, nz,
+			ie + 1, ie + p_east,
+			jb - p_south, je + p_north,
+			kb - p_bottom, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// south strip //
+	if (p_south) {
+		gs_cycle_omp(x, rhs, idg, color, nx, ny, nz,
+			ib, ie,
+			jb - p_south, jb - 1,
+			kb - p_bottom, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// north strip //
+	if (p_north) {
+		gs_cycle_omp(x, rhs, idg, color, nx, ny, nz,
+			ib, ie,
+			je + 1, je + p_north,
+			kb - p_bottom, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// bottom strip //
+	if (p_bottom) {
+		gs_cycle_omp(x, rhs, idg, color, nx, ny, nz,
+			ib, ie,
+			jb, je,
+			kb - p_bottom, kb - 1,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// top strip //
+	if (p_top) {
+		gs_cycle_omp(x, rhs, idg, color, nx, ny, nz,
+			ib, ie,
+			jb, je,
+			ke + 1, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+}
+// -------------------------------------------------------------------- //
diff --git a/pois-gs3d.cpp b/pois-gs3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..56a95eee1afed1eb1b39e67a6059a7ff73353c27
--- /dev/null
+++ b/pois-gs3d.cpp
@@ -0,0 +1,494 @@
+#include "pois-gs3d.h"
+#include "grid-common3d.h"
+#include "pois-bc3d.h"
+#include "pois-gs-base3d.h"
+
+
+namespace poisson3d
+{
+	// * GS Red-Black call-partition * //
+	// gs_start_omp: first red-black sweep, optimized for the case x = 0;
+	// note: exchanges/updates the rhs color halo, hence non-const rhs //
+	template< typename T >
+	void gs_start_omp(T* _RESTRICT x,
+		T* _RESTRICT rhs, const T* _RESTRICT const idg,
+
+		const int color_mode,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+		const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc);
+
+	// gs_run_omp: remaining red-black iterations (BC + halo exchanges) //
+	template< typename T >
+	void gs_run_omp(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+
+		const int type, const int color_mode, const int piters,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+		const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc);
+	// -------------------------------------------------------------------- //
+}
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+// * GS Red-Black preconditioner for Poisson equation * //
+template< typename T >
+void poisson3d::gs_redblack_omp(
+	T* _RESTRICT x,
+	T* _RESTRICT rhs, const T* _RESTRICT const idg,
+	const int type, const int color_mode, const int piters,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc)
+{
+	// fresh start: run the optimized first sweep (handles the x = 0 case) //
+	if (type == c_gs_init) {
+		gs_start_omp(x, rhs, idg, color_mode, nx, ny, nz, gcx, gcy, gcz,
+			dx2i, dy2i, dzp2i, dzm2i, mpi_com, bc);
+	}
+
+	// remaining red-black iterations //
+	gs_run_omp(x, rhs, idg, type, color_mode, piters,
+		nx, ny, nz, gcx, gcy, gcz,
+		dx2i, dy2i, dzp2i, dzm2i, mpi_com, bc);
+}
+// ------------------------------------------------------------------------ //
+
+
+// * GS-RB starting routine - some optimizations for case x = 0 * //
+template< typename T >
+void poisson3d::gs_start_omp(
+	T* _RESTRICT x,
+	T* _RESTRICT rhs, const T* _RESTRICT const idg,
+
+	const int color_mode,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc)
+{
+	// First red-black sweep, assuming x = 0 on entry:
+	// the [red] update reduces to x = idg * rhs (gs_init), so only the
+	// rhs halo of the [red] color has to be exchanged before the sweep.
+	// Runs inside an OpenMP parallel region (orphaned worksharing).
+	const int c_red = color_mode;
+	const int c_black = !color_mode;
+
+	// inclusive interior index ranges //
+	const int ib = gcx, ie = nx - gcx - 1;
+	const int jb = gcy, je = ny - gcy - 1;
+	const int kb = gcz, ke = nz - gcz - 1;
+	const int nsize = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1);
+
+
+	// Special cases //
+	// ----------------------------------------------------------- //
+	if ((mpi_com.size == 1) ||				// single MPI processor GS //
+		(nsize <= c_small_poisson_size))	// small problem size GS //
+	{
+		// zero the solution including the boundary-condition ghost layer //
+		nse::null_halo_omp(x, nx, ny, nz,
+			ib - bc.p_west, ie + bc.p_east,
+			jb - bc.p_south, je + bc.p_north,
+			kb - bc.p_bottom, ke + bc.p_top);
+
+		// ghost exchange & periodicity //
+		// only [red] for combined red-black single sweep //
+		if (mpi_com.size == 1) {
+			if (bc.x_periodic) {
+				nse::apply_periodic_x_omp(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+					1, 1, 1);
+#pragma omp barrier
+			}
+			if (bc.y_periodic) {
+				nse::apply_periodic_y_omp(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+					1, 1, 1);
+#pragma omp barrier
+			}
+			if (bc.z_periodic) {
+				nse::apply_periodic_z_omp(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+					1, 1, 1);
+#pragma omp barrier
+			}
+		}
+		else
+		{
+			// small multi-rank problem: synchronous color-halo exchange //
+			mpi_com.exchange_color_halo(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+				1, 1, 1, bc.x_periodic, bc.y_periodic, bc.z_periodic);
+		}
+
+		// init: [red] with shift = - 1 //
+		gs_init_omp(x, rhs, idg, c_red, nx, ny, nz,
+			ib - bc.p_west, ie + bc.p_east,
+			jb - bc.p_south, je + bc.p_north,
+			kb - bc.p_bottom, ke + bc.p_top);
+
+#pragma omp barrier
+
+		// [black] with shift = 0 //
+		gs_cycle_omp(x, rhs, idg, c_black, nx, ny, nz,
+			ib, ie,
+			jb, je,
+			kb, ke,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+
+#pragma omp barrier
+
+		return;
+	}
+	// ----------------------------------------------------------- //
+
+	// MPI-Async[x,y] GS //
+	// overlap rhs halo communication with interior computation //
+	// ----------------------------------------------------------- //
+	const int num_omp_threads = omp_get_num_threads();
+	MPI_Request mpi_req[4];
+
+	// ghost exchange & periodicity //
+	// only [red] for combined red-black single sweep //
+
+	// -x: push exchange [rhs] [width=1, periodic=yes] //
+	mpi_com.push_exchange_color_halo_x(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+		1, 1, 1, bc.x_periodic, mpi_req);
+
+	// NOTE(review): master-only wait with no following barrier — assumes
+	// pop_exchange_* tolerates already-completed / null requests on the
+	// master thread; confirm against mpiCom3d implementation //
+#pragma omp master
+	{
+		if ((mpi_com.size_x > 1) && (num_omp_threads > 1)) {
+			MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
+			for (int k = 0; k < 4; k++)
+				mpi_req[k] = MPI_REQUEST_NULL;
+		}
+	}
+
+	nse::null_halo_omp(x, nx, ny, nz,
+		ib - bc.p_west, ie + bc.p_east,
+		jb - bc.p_south, je + bc.p_north,
+		kb - bc.p_bottom, ke + bc.p_top);
+
+	// [Red] init main block //
+	gs_init_omp(x, rhs, idg, c_red, nx, ny, nz,
+		ib, ie,
+		jb, je,
+		kb, ke);
+
+	// -x: pop exchange [rhs] [width=1, periodic=yes] //
+	mpi_com.pop_exchange_color_halo_x(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+		1, 1, 1, bc.x_periodic, mpi_req);
+
+	// -y: push exchange [rhs] [width=1, periodic=yes] //
+	mpi_com.push_exchange_color_halo_y(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+		1, 1, 1, bc.y_periodic, mpi_req);
+
+#pragma omp master
+	{
+		if ((mpi_com.size_y > 1) && (num_omp_threads > 1)) {
+			MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
+			for (int k = 0; k < 4; k++)
+				mpi_req[k] = MPI_REQUEST_NULL;
+		}
+	}
+
+	// [Black] main block //
+	// shrunk by the halo widths: the outermost layer is relaxed later,
+	// once the neighbor halo has arrived //
+	gs_cycle_omp(x, rhs, idg, c_black, nx, ny, nz,
+		ib + bc.p_west, ie - bc.p_east,
+		jb + bc.p_south, je - bc.p_north,
+		kb + bc.p_bottom, ke - bc.p_top,
+
+		dx2i, dy2i,
+		dzp2i, dzm2i);
+
+	// -y: pop exchange [rhs] [width=1, periodic=yes] //
+	mpi_com.pop_exchange_color_halo_y(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+		1, 1, 1, bc.y_periodic, mpi_req);
+
+	// -z: push exchange [rhs] [width=1, periodic=yes] //
+	mpi_com.push_exchange_color_halo_z(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+		1, 1, 1, bc.z_periodic, mpi_req);
+
+	// -z: pop exchange [rhs] [width=1, periodic=yes] //
+	mpi_com.pop_exchange_color_halo_z(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+		1, 1, 1, bc.z_periodic, mpi_req);
+
+	// MPI-halo 
+	// ----------------------------------------------------------- //
+	// [Red] halo //
+	gs_init_halo_omp(x, rhs, idg, c_red, nx, ny, nz,
+		ib, ie,
+		jb, je,
+		kb, ke,
+		bc.p_west, bc.p_east, bc.p_south, bc.p_north, bc.p_bottom, bc.p_top);
+
+#pragma omp barrier
+
+	// [Black] halo //
+	gs_cycle_halo_omp(x, rhs, idg, c_black, nx, ny, nz,
+		ib + bc.p_west, ie - bc.p_east,
+		jb + bc.p_south, je - bc.p_north,
+		kb + bc.p_bottom, ke - bc.p_top,
+
+		bc.p_west, bc.p_east, bc.p_south, bc.p_north, bc.p_bottom, bc.p_top,
+
+		dx2i, dy2i,
+		dzp2i, dzm2i);
+
+#pragma omp barrier
+
+	// ----------------------------------------------------------- //
+}
+
// * GS-RB main cycle iteration * //
//
// Runs red-black Gauss-Seidel sweeps of the 7-point Poisson operator:
//   x    - approximation, updated in place
//   rhs  - right hand side
//   idg  - precomputed inverse of the operator diagonal
//   type - c_gs_continue runs all (piters) sweeps; otherwise one sweep is
//          dropped (presumably the first sweep is performed by the caller
//          for the init case -- TODO confirm against gs_redblack_omp)
//   color_mode - selects which checkerboard color plays the "red"
//          (first-updated) role on this rank
// Must be called from inside an OpenMP parallel region: the helpers use
// '#pragma omp for' work-sharing and this body places the barriers.
template< typename T >
void poisson3d::gs_run_omp(
	T* _RESTRICT x,
	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,

	const int type, const int color_mode, const int piters,
	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const T dx2i, const T dy2i,
	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc)
{
	// number of full red+black sweeps executed here //
	const int niters = (type == c_gs_continue) ? piters : piters - 1;

	// checkerboard colors: color_mode swaps their roles //
	const int c_red = color_mode;
	const int c_black = !color_mode;

	// interior index ranges (ghost cells excluded) and local problem size //
	const int ib = gcx, ie = nx - gcx - 1;
	const int jb = gcy, je = ny - gcy - 1;
	const int kb = gcz, ke = nz - gcz - 1;
	const int nsize = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1);


	// Special cases //
	// ----------------------------------------------------------- //
	// synchronous path: communication overlap is not worth it for a
	// single rank or for a small local problem //
	if ((mpi_com.size == 1) ||				// single MPI processor GS //
		(nsize <= c_small_poisson_size))	// small problem size GS //
	{
		for (int m = 0; m < niters; m++)
		{
			// boundary conditions [red (previous), black]
			put_bc_omp(x, nx, ny, nz, gcx, gcy, gcz,
				mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
				mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,
				bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);

#pragma omp barrier

			// ghost exchange & periodicity //
			// exchange [black] [width=2, periodic=yes] //
			if (mpi_com.size == 1) {
				// single rank: periodicity is a local copy, no MPI needed //
				if (bc.x_periodic) {
					nse::apply_periodic_x_omp(x, c_black, nx, ny, nz, gcx, gcy, gcz,
						2, 2, 2);
#pragma omp barrier
				}
				if (bc.y_periodic) {
					nse::apply_periodic_y_omp(x, c_black, nx, ny, nz, gcx, gcy, gcz,
						2, 2, 2);
#pragma omp barrier
				}
				if (bc.z_periodic) {
					nse::apply_periodic_z_omp(x, c_black, nx, ny, nz, gcx, gcy, gcz,
						2, 2, 2);
#pragma omp barrier
				}
			}
			else
			{
				mpi_com.exchange_color_halo(x, c_black, nx, ny, nz, gcx, gcy, gcz,
					2, 2, 2, bc.x_periodic, bc.y_periodic, bc.z_periodic);
			}

			// [red] with shift = - 1 //
			// sweep range is widened by one cell on each bc.p_* side
			// (presumably bc.p_* are 0/1 periodic-side flags -- confirm) //
			gs_cycle_omp(x, rhs, idg, c_red, nx, ny, nz,
				ib - bc.p_west, ie + bc.p_east,
				jb - bc.p_south, je + bc.p_north,
				kb - bc.p_bottom, ke + bc.p_top,

				dx2i, dy2i,
				dzp2i, dzm2i);

#pragma omp barrier

			// [black] with shift = 0 //
			gs_cycle_omp(x, rhs, idg, c_black, nx, ny, nz,
				ib, ie,
				jb, je,
				kb, ke,

				dx2i, dy2i,
				dzp2i, dzm2i);

			// separate consecutive sweeps; the last one needs no barrier //
			if (m < niters - 1) {
#pragma omp barrier
			}

		}

		return;
	}
	// ----------------------------------------------------------- //


	// MPI-Async[x,y] GS //
	// ----------------------------------------------------------- //
	// overlapped path: x/y halo exchanges are posted asynchronously and
	// hidden behind the "main block" sweeps; halo strips are swept after //
	const int num_omp_threads = omp_get_num_threads();
	MPI_Request mpi_req[4];

	for (int m = 0; m < niters; m++)
	{
		// boundary conditions [red (previous), black]
		put_bc_omp(x, nx, ny, nz, gcx, gcy, gcz,
			mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
			mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,
			bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);

#pragma omp barrier

		// -x: push exchange [black] [width=2, periodic=yes] //
		mpi_com.push_exchange_color_halo_x(x, c_black, nx, ny, nz, gcx, gcy, gcz,
			2, 2, 2, bc.x_periodic, mpi_req);

		// with >1 threads the master completes the -x exchange while the
		// other threads proceed to the [red] main block below //
#pragma omp master
		{
			if ((mpi_com.size_x > 1) && (num_omp_threads > 1)) {
				MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
				for (int k = 0; k < 4; k++)
					mpi_req[k] = MPI_REQUEST_NULL;
			}
		}

		// [Red] main block //
		// interior shrunk by one cell on each exchanged side //
		gs_cycle_omp(x, rhs, idg, c_red, nx, ny, nz,
			ib + bc.p_west, ie - bc.p_east,
			jb + bc.p_south, je - bc.p_north,
			kb + bc.p_bottom, ke - bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

		// -x: pop exchange [black] [width=2, periodic=yes] //
		mpi_com.pop_exchange_color_halo_x(x, c_black, nx, ny, nz, gcx, gcy, gcz,
			2, 2, 2, bc.x_periodic, mpi_req);

		// -y: push exchange [black] [width=2, periodic=yes] //
		mpi_com.push_exchange_color_halo_y(x, c_black, nx, ny, nz, gcx, gcy, gcz,
			2, 2, 2, bc.y_periodic, mpi_req);

#pragma omp master
		{
			if ((mpi_com.size_y > 1) && (num_omp_threads > 1)) {
				MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
				for (int k = 0; k < 4; k++)
					mpi_req[k] = MPI_REQUEST_NULL;
			}
		}

		// [Black] main block //
		// interior shrunk by the full halo width (2) on exchanged sides //
		gs_cycle_omp(x, rhs, idg, c_black, nx, ny, nz,
			ib + 2 * bc.p_west, ie - 2 * bc.p_east,
			jb + 2 * bc.p_south, je - 2 * bc.p_north,
			kb + 2 * bc.p_bottom, ke - 2 * bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

		// -y: pop exchange [black] [width=2, periodic=yes] //
		mpi_com.pop_exchange_color_halo_y(x, c_black, nx, ny, nz, gcx, gcy, gcz,
			2, 2, 2, bc.y_periodic, mpi_req);

		// -z: push exchange [black] [width=2, periodic=yes] //
		mpi_com.push_exchange_color_halo_z(x, c_black, nx, ny, nz, gcx, gcy, gcz,
			2, 2, 2, bc.z_periodic, mpi_req);

		// -z: pop exchange [black] [width=2, periodic=yes] //
		// note: the z exchange is completed immediately, not overlapped //
		mpi_com.pop_exchange_color_halo_z(x, c_black, nx, ny, nz, gcx, gcy, gcz,
			2, 2, 2, bc.z_periodic, mpi_req);

		// MPI-halo //
		// ----------------------------------------------------------- //
		// sweep the halo strips excluded from the main blocks above //
		// [Red] halo //
		gs_cycle_halo_omp(x, rhs, idg, c_red, nx, ny, nz,
			ib + bc.p_west, ie - bc.p_east,
			jb + bc.p_south, je - bc.p_north,
			kb + bc.p_bottom, ke - bc.p_top,

			2 * bc.p_west, 2 * bc.p_east,
			2 * bc.p_south, 2 * bc.p_north,
			2 * bc.p_bottom, 2 * bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

#pragma omp barrier

		// [Black] halo //
		gs_cycle_halo_omp(x, rhs, idg, c_black, nx, ny, nz,
			ib + 2 * bc.p_west, ie - 2 * bc.p_east,
			jb + 2 * bc.p_south, je - 2 * bc.p_north,
			kb + 2 * bc.p_bottom, ke - 2 * bc.p_top,

			2 * bc.p_west, 2 * bc.p_east,
			2 * bc.p_south, 2 * bc.p_north,
			2 * bc.p_bottom, 2 * bc.p_top,

			dx2i, dy2i,
			dzp2i, dzm2i);

		// separate consecutive sweeps; the last one needs no barrier //
		if (m < niters - 1) {
#pragma omp barrier
		}

	}
	// ----------------------------------------------------------- //
}
+
// Initialization //
// ------------------------------------------------------------------------ //

// * initialize: GS-preconditioner * //
// explicit template instantiations of the OpenMP GS kernel for float and
// double, so the template definitions can remain in this translation unit //
template void poisson3d::gs_redblack_omp(
	float* _RESTRICT x,
	float* _RESTRICT rhs, const float* _RESTRICT const idg,
	const int type, const int color_mode, const int piters,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);

template void poisson3d::gs_redblack_omp(
	double* _RESTRICT x,
	double* _RESTRICT rhs, const double* _RESTRICT const idg,
	const int type, const int color_mode, const int piters,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i,

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
// ------------------------------------------------------------------------ //
diff --git a/pois-gs3d.h b/pois-gs3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..6252ff759aa71c9a67fc72cf3da5b9e44b84b7ff
--- /dev/null
+++ b/pois-gs3d.h
@@ -0,0 +1,87 @@
+#pragma once
+
+// [pois-gs3d.h]: 3D Poisson Gauss-Seidel
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "mpi-com3d.h"
+#include "pois-def3d.h"
+
+
namespace poisson3d
{
	// * GS modes * //
	// 'type' flag for the GS calls below: with c_gs_continue the full
	// (piters) sweep count is executed; otherwise one sweep is dropped
	// (see gs_run_omp in pois-gs3d.cpp) //
	const int c_gs_init = 0;
	const int c_gs_continue = 1;
	// -------------------------------------------------------------------- //

	// * GS Red-Black preconditioner * //
	// Red-black Gauss-Seidel sweeps for the 7-point Poisson operator:
	//   x - approximation (updated in place), rhs - right hand side,
	//   idg - inverse of the operator diagonal, piters - sweep count,
	//   dx2i, dy2i, dzp2i/dzm2i - operator coefficients,
	//   mpi_com - MPI decomposition, bc - boundary conditions.
	// Safe to call either inside or outside an OpenMP parallel region. //
	template< typename T >
	void gs_redblack(T* _RESTRICT x,
		T* _RESTRICT rhs, const T* _RESTRICT const idg,

		const int type, const int color_mode, const int piters,

		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc);
	// -------------------------------------------------------------------- //
}
+
// OpenMP //
namespace poisson3d
{
	// * GS Red-Black preconditioner * //
	// Same contract as gs_redblack(), but must be invoked from inside an
	// active OpenMP parallel region (the implementation relies on
	// work-sharing constructs and barriers). //
	template< typename T >
	void gs_redblack_omp(T* _RESTRICT x,
		T* _RESTRICT rhs, const T* _RESTRICT const idg,

		const int type, const int color_mode, const int piters,

		const int nx, const int ny, const int nz,
		const int gcx, const int gcy, const int gcz,

		const T dx2i, const T dy2i,
		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,

		const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc);
	// -------------------------------------------------------------------- //
}
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+template< typename T >
+void poisson3d::gs_redblack(T* _RESTRICT x,
+	T* _RESTRICT rhs, const T* _RESTRICT const idg,
+
+	const int type, const int color_mode, const int piters,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc)
+{
+	if (omp_in_parallel()) {
+		gs_redblack_omp(x, rhs, idg, type, color_mode, piters,
+			nx, ny, nz, gcx, gcy, gcz,
+			dx2i, dy2i, dzp2i, dzm2i, mpi_com, bc);
+	}
+	else
+	{
+#pragma omp parallel shared( x, rhs )
+		{
+			gs_redblack_omp(x, rhs, idg, type, color_mode, piters,
+				nx, ny, nz, gcx, gcy, gcz,
+				dx2i, dy2i, dzp2i, dzm2i, mpi_com, bc);
+		}
+	}
+}
diff --git a/pois-mg-base3d.cpp b/pois-mg-base3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..70725e4672133b7b2e20c85ae4a6b0443d6ae696
--- /dev/null
+++ b/pois-mg-base3d.cpp
@@ -0,0 +1,1961 @@
+#include "pois-mg-base3d.h"
+#include "grid-common3d.h"
+#include "pois-bc3d.h"
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+
// MG Restrict: Fine -> Coarse
// Averages the 'fine' grid field into 'coarse' over the coarse index box
// [icb..ice] x [jcb..jce] x [kcb..kce]; 'type' lists the axes coarsened by
// a factor of 2 (each coarse cell averages the 2/4/8 fine cells it covers;
// non-coarsened axes are copied at the same resolution).
// Must run inside an OpenMP parallel region ('#pragma omp for' below).
// NOTE(review): the USE_STRICT_MG parameters (dz, cdzi) are accepted but
// never used here -- uniform averaging is applied in all configurations;
// confirm whether a dz-weighted restriction was intended.
template< typename T >
void poisson3d::mg_restrict_omp(	// MG Restrict: Fine -> Coarse
	T* _RESTRICT coarse,
	const T* _RESTRICT const fine,
	const nse::nse_const3d::axisType type,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const int icb, const int ice,
	const int jcb, const int jce,
	const int kcb, const int kce
#ifdef USE_STRICT_MG
	,
	const T* _RESTRICT const dz, const T* _RESTRICT const cdzi
#endif	
)
{
	// row strides of the coarse and fine 3D arrays (k fastest) //
	const int cnyz = cny * cnz;
	const int nyz = ny * nz;

	// ghost cell shifts //
	// per-axis index offset between the two grids: along a coarsened axis
	// a coarse index maps to fine index (2*c - (2*cgc - gc)), otherwise
	// the offset is just the ghost-layer difference (cgc - gc) //
	const int gcx_sh = (
		(type == nse::nse_const3d::axisX) || 
		(type == nse::nse_const3d::axisXY) ||
		(type == nse::nse_const3d::axisXZ) || 
		(type == nse::nse_const3d::axisXYZ)) ? ((cgcx << 1) - gcx) : (cgcx - gcx);
	const int gcy_sh = (
		(type == nse::nse_const3d::axisY) || 
		(type == nse::nse_const3d::axisXY) ||
		(type == nse::nse_const3d::axisYZ) || 
		(type == nse::nse_const3d::axisXYZ)) ? ((cgcy << 1) - gcy) : (cgcy - gcy);
	const int gcz_sh = (
		(type == nse::nse_const3d::axisZ) || 
		(type == nse::nse_const3d::axisXZ) ||
		(type == nse::nse_const3d::axisYZ) || 
		(type == nse::nse_const3d::axisXYZ)) ? ((cgcz << 1) - gcz) : (cgcz - gcz);
	const int gcxy_sh = gcx_sh * nyz + gcy_sh * nz;

	// k starting index on fine grid //
	const int kb = (
		(type == nse::nse_const3d::axisZ) || 
		(type == nse::nse_const3d::axisXZ) ||
		(type == nse::nse_const3d::axisYZ) || 
		(type == nse::nse_const3d::axisXYZ)) ? ((kcb << 1) - gcz_sh) : (kcb - gcz_sh);


	int ic, jc, kc, cidx, idx;
#ifndef USE_OPENMP_2D_CYCLE
	// row-start indices when threading over the outer (i) loop only //
	int chidx, shidx;
#endif

	// ----------------------------- //
	// full coarsening: each coarse cell averages a 2x2x2 fine block //
	if (type == nse::nse_const3d::axisXYZ) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * (nyz << 1) + jc * (nz << 1) - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * (nyz << 1) + jcb * (nz << 1) - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += (nz << 1))
			{
				cidx = chidx;
				idx = shidx;
#endif

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, cidx++, idx += 2)
				{
					coarse[cidx] = (T) 0.125 * (
						fine[idx] + fine[idx + 1] +
						fine[idx + nz] + fine[idx + nz + 1] +

						fine[idx + nyz] + fine[idx + nyz + 1] +
						fine[idx + nyz + nz] + fine[idx + nyz + nz + 1]
						);
				}
			}
		}

		return;
			}

	// ----------------------------- //
	// z-only coarsening: average fine pairs along k //
	if (type == nse::nse_const3d::axisZ) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * nyz + jc * nz - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * nyz + jcb * nz - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += nz)
			{
				cidx = chidx;
				idx = shidx;
#endif

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, cidx++, idx += 2)
				{
					coarse[cidx] = (T) 0.5 * (
						fine[idx] + fine[idx + 1]);
				}
			}
		}

		return;
	}

	// ----------------------------- //
	// y-only coarsening: average fine pairs along j //
	if (type == nse::nse_const3d::axisY) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * nyz + jc * (nz << 1) - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * nyz + jcb * (nz << 1) - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += (nz << 1))
			{
				cidx = chidx;
				idx = shidx;
#endif

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, cidx++, idx++)
				{
					coarse[cidx] = (T) 0.5 * (
						fine[idx] + fine[idx + nz]);
				}
			}
		}

		return;
	}

	// ----------------------------- //
	// x-only coarsening: average fine pairs along i //
	if (type == nse::nse_const3d::axisX) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * (nyz << 1) + jc * nz - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * (nyz << 1) + jcb * nz - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += nz)
			{
				cidx = chidx;
				idx = shidx;
#endif

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, cidx++, idx++)
				{
					coarse[cidx] = (T) 0.5 * (
						fine[idx] + fine[idx + nyz]);
				}
			}
		}

		return;
	}

	// ----------------------------- //
	// x-y coarsening: average 2x2 fine blocks in the (i,j) plane //
	if (type == nse::nse_const3d::axisXY) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * (nyz << 1) + jc * (nz << 1) - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * (nyz << 1) + jcb * (nz << 1) - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += (nz << 1))
			{
				cidx = chidx;
				idx = shidx;
#endif

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, cidx++, idx++)
				{
					coarse[cidx] = (T) 0.25 * (
						fine[idx] + fine[idx + nyz] +
						fine[idx + nz] + fine[idx + nyz + nz]
						);
				}
			}
		}

		return;
			}

	// ----------------------------- //
	// x-z coarsening: average 2x2 fine blocks in the (i,k) plane //
	if (type == nse::nse_const3d::axisXZ) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * (nyz << 1) + jc * nz - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * (nyz << 1) + jcb * nz - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += nz)
			{
				cidx = chidx;
				idx = shidx;
#endif

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, cidx++, idx += 2)
				{
					coarse[cidx] = (T) 0.25 * (
						fine[idx] + fine[idx + nyz] +
						fine[idx + 1] + fine[idx + nyz + 1]
						);
				}
			}
		}

		return;
			}

	// ----------------------------- //
	// y-z coarsening: average 2x2 fine blocks in the (j,k) plane //
	if (type == nse::nse_const3d::axisYZ) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * nyz + jc * (nz << 1) - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * nyz + jcb * (nz << 1) - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += (nz << 1))
			{
				cidx = chidx;
				idx = shidx;
#endif

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, cidx++, idx += 2)
				{
					coarse[cidx] = (T) 0.25 * (
						fine[idx] + fine[idx + nz] +
						fine[idx + 1] + fine[idx + nz + 1]
						);
				}
			}
		}

		return;
			}
		}
+// ------------------------------------------------------------------------ //
+
+
+template< typename T >
+void poisson3d::mg_restrict_residual_omp(	// MG Restrict Residual: Fine -> Coarse
+	T* _RESTRICT coarse,
+	const T* _RESTRICT const x, const T* _RESTRICT const rhs,
+	const nse::nse_const3d::axisType type,
+
+	const int cnx, const int cny, const int cnz,
+	const int cgcx, const int cgcy, const int cgcz,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const int icb, const int ice,
+	const int jcb, const int jce,
+	const int kcb, const int kce,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i
+#ifdef USE_STRICT_MG
+	,
+	const T* _RESTRICT const dz, const T* _RESTRICT const cdzi
+#endif
+)
+{
+	const int cnyz = cny * cnz;
+	const int nyz = ny * nz;
+
+	// ghost cell shifts //
+	const int gcx_sh = (
+		(type == nse::nse_const3d::axisX) || 
+		(type == nse::nse_const3d::axisXY) ||
+		(type == nse::nse_const3d::axisXZ) || 
+		(type == nse::nse_const3d::axisXYZ)) ? ((cgcx << 1) - gcx) : (cgcx - gcx);
+	const int gcy_sh = (
+		(type == nse::nse_const3d::axisY) || 
+		(type == nse::nse_const3d::axisXY) ||
+		(type == nse::nse_const3d::axisYZ) || 
+		(type == nse::nse_const3d::axisXYZ)) ? ((cgcy << 1) - gcy) : (cgcy - gcy);
+	const int gcz_sh = (
+		(type == nse::nse_const3d::axisZ) || 
+		(type == nse::nse_const3d::axisXZ) ||
+		(type == nse::nse_const3d::axisYZ) || 
+		(type == nse::nse_const3d::axisXYZ)) ? ((cgcz << 1) - gcz) : (cgcz - gcz);
+	const int gcxy_sh = gcx_sh * nyz + gcy_sh * nz;
+
+	// k starting index on fine grid //
+	const int kb = (
+		(type == nse::nse_const3d::axisZ) || 
+		(type == nse::nse_const3d::axisXZ) ||
+		(type == nse::nse_const3d::axisYZ) || 
+		(type == nse::nse_const3d::axisXYZ)) ? ((kcb << 1) - gcz_sh) : (kcb - gcz_sh);
+
+	int ic, jc, kc, k, cidx, idx;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx, chidx;
+#endif
+	T R_k, R_kp, Psum_k, Psum_kp;
+
+	// ----------------------------- //
+	if (type == nse::nse_const3d::axisXYZ) {
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse(2) nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			for (jc = jcb; jc <= jce; jc++)
+			{
+				cidx = ic * cnyz + jc * cnz + kcb;
+				idx = ic * (nyz << 1) + jc * (nz << 1) - gcxy_sh + kb;
+#else
+#pragma omp for nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			chidx = ic * cnyz + jcb * cnz + kcb;
+			shidx = ic * (nyz << 1) + jcb * (nz << 1) - gcxy_sh + kb;
+			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += (nz << 1))
+			{
+				cidx = chidx;
+				idx = shidx;
+#endif
+				k = kb;
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+				for (kc = kcb; kc <= kce; kc++, k += 2, cidx++, idx += 2)
+				{
+					Psum_k = x[idx + nyz + nz] + x[idx + nyz] + x[idx + nz] + x[idx];
+					Psum_kp = x[idx + nyz + nz + 1] + x[idx + nyz + 1] + x[idx + nz + 1] + x[idx + 1];
+
+					R_k = (rhs[idx] + rhs[idx + nyz] + rhs[idx + nz] + rhs[idx + nyz + nz]) -
+
+						(x[idx + (nyz << 1) + nz] + x[idx + (nyz << 1)] + x[idx - nyz + nz] + x[idx - nyz] - Psum_k) * dx2i +
+						(x[idx + nyz + (nz << 1)] + x[idx + (nz << 1)] + x[idx + nyz - nz] + x[idx - nz] - Psum_k) * dy2i -
+
+						(Psum_kp - Psum_k) * dzp2i[k]
+						+
+						(Psum_k - (x[idx + nyz + nz - 1] + x[idx + nyz - 1] + x[idx + nz - 1] + x[idx - 1])) * dzm2i[k];
+
+					R_kp = (rhs[idx + 1] + rhs[idx + nyz + 1] + rhs[idx + nz + 1] + rhs[idx + nyz + nz + 1]) -
+
+						(x[idx + (nyz << 1) + nz + 1] + x[idx + (nyz << 1) + 1] + x[idx - nyz + nz + 1] + x[idx - nyz + 1] - Psum_kp) * dx2i -
+						(x[idx + nyz + (nz << 1) + 1] + x[idx + nyz - nz + 1] + x[idx + (nz << 1) + 1] + x[idx - nz + 1] - Psum_kp) * dy2i -
+
+						((x[idx + nyz + nz + 2] + x[idx + nyz + 2] + x[idx + nz + 2] + x[idx + 2]) - Psum_kp) * dzp2i[k + 1]
+						+
+						(Psum_kp - Psum_k) * dzm2i[k + 1];
+
+#ifndef USE_STRICT_MG
+					coarse[cidx] = (T) 0.125 * (R_k + R_kp);
+#else
+					coarse[cidx] = (T) 0.25 * cdzi[kc] * (R_k * dz[k] + R_kp * dz[k + 1]);
+#endif
+				}
+			}
+		}
+
+		return;
+			}
+
+	// ----------------------------- //
+	if (type == nse::nse_const3d::axisZ) {
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			for (jc = jcb; jc <= jce; jc++)
+			{
+				cidx = ic * cnyz + jc * cnz + kcb;
+				idx = ic * nyz + jc * nz - gcxy_sh + kb;
+#else
+#pragma omp for nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			chidx = ic * cnyz + jcb * cnz + kcb;
+			shidx = ic * nyz + jcb * nz - gcxy_sh + kb;
+			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += nz)
+			{
+				cidx = chidx;
+				idx = shidx;
+#endif
+				k = kb;
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+				for (kc = kcb; kc <= kce; kc++, k += 2, cidx++, idx += 2)
+				{
+					R_k = rhs[idx] -
+						(x[idx + nyz] - x[idx] - x[idx] + x[idx - nyz]) * dx2i -
+						(x[idx + nz] - x[idx] - x[idx] + x[idx - nz]) * dy2i -
+
+						(x[idx + 1] - x[idx]) * dzp2i[k] +
+						(x[idx] - x[idx - 1]) * dzm2i[k];
+
+					R_kp = rhs[idx + 1] -
+						(x[idx + nyz + 1] - x[idx + 1] - x[idx + 1] + x[idx - nyz + 1]) * dx2i -
+						(x[idx + nz + 1] - x[idx + 1] - x[idx + 1] + x[idx - nz + 1]) * dy2i -
+
+						(x[idx + 2] - x[idx + 1]) * dzp2i[k + 1] +
+						(x[idx + 1] - x[idx]) * dzm2i[k + 1];
+
+#ifndef USE_STRICT_MG
+					coarse[cidx] = (T) 0.5 * (R_k + R_kp);
+#else
+					coarse[cidx] = cdzi[kc] * (R_k * dz[k] + R_kp * dz[k + 1]);
+#endif
+				}
+			}
+		}
+
+		return;
+			}
+
+	// ----------------------------- //
+	if (type == nse::nse_const3d::axisY) {
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			for (jc = jcb; jc <= jce; jc++)
+			{
+				cidx = ic * cnyz + jc * cnz + kcb;
+				idx = ic * nyz + jc * (nz << 1) - gcxy_sh + kb;
+#else
+#pragma omp for nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			chidx = ic * cnyz + jcb * cnz + kcb;
+			shidx = ic * nyz + jcb * (nz << 1) - gcxy_sh + kb;
+			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += (nz << 1))
+			{
+				cidx = chidx;
+				idx = shidx;
+#endif
+				k = kb;
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+				for (kc = kcb; kc <= kce; kc++, k++, cidx++, idx++)
+				{
+					coarse[cidx] = (T) 0.5 * (
+						(rhs[idx] + rhs[idx + nz]) -
+
+						(x[idx + nyz + nz] + x[idx - nyz + nz] + x[idx + nyz] + x[idx - nyz]
+							- x[idx + nz] - x[idx + nz] - x[idx] - x[idx]) * dx2i -
+
+							(x[idx + (nz << 1)] + x[idx - nz] - x[idx + nz] - x[idx]) * dy2i -
+
+						(x[idx + nz + 1] + x[idx + 1] - x[idx + nz] - x[idx]) * dzp2i[k]
+						+
+						(x[idx + nz] + x[idx] - x[idx + nz - 1] - x[idx - 1]) * dzm2i[k]);
+				}
+			}
+		}
+
+		return;
+			}
+
+	// ----------------------------- //
+	if (type == nse::nse_const3d::axisX) {
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			for (jc = jcb; jc <= jce; jc++)
+			{
+				cidx = ic * cnyz + jc * cnz + kcb;
+				idx = ic * (nyz << 1) + jc * nz - gcxy_sh + kb;
+#else
+#pragma omp for nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			chidx = ic * cnyz + jcb * cnz + kcb;
+			shidx = ic * (nyz << 1) + jcb * nz - gcxy_sh + kb;
+			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += nz)
+			{
+				cidx = chidx;
+				idx = shidx;
+#endif
+				k = kb;
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+				for (kc = kcb; kc <= kce; kc++, k++, cidx++, idx++)
+				{
+					coarse[cidx] = (T) 0.5 * (
+						(rhs[idx] + rhs[idx + nyz]) -
+
+						(x[idx + (nyz << 1)] + x[idx - nyz] - x[idx + nyz] - x[idx]) * dx2i -
+
+						(x[idx + nyz + nz] + x[idx + nyz - nz] + x[idx + nz] + x[idx - nz]
+						- x[idx + nyz] - x[idx + nyz] - x[idx] - x[idx]) * dy2i -
+
+						(x[idx + nyz + 1] + x[idx + 1] - x[idx + nyz] - x[idx]) * dzp2i[k]
+						+
+						(x[idx + nyz] + x[idx] - x[idx + nyz - 1] - x[idx - 1]) * dzm2i[k]);
+				}
+			}
+		}
+
+		return;
+	}
+
+	// ----------------------------- //
+	if (type == nse::nse_const3d::axisXY) {
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			for (jc = jcb; jc <= jce; jc++)
+			{
+				cidx = ic * cnyz + jc * cnz + kcb;
+				idx = ic * (nyz << 1) + jc * (nz << 1) - gcxy_sh + kb;
+#else
+#pragma omp for nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			chidx = ic * cnyz + jcb * cnz + kcb;
+			shidx = ic * (nyz << 1) + jcb * (nz << 1) - gcxy_sh + kb;
+			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += (nz << 1))
+			{
+				cidx = chidx;
+				idx = shidx;
+#endif
+				k = kb;
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+				for (kc = kcb; kc <= kce; kc++, k++, cidx++, idx++)
+				{
+					Psum_k = x[idx + nyz + nz] + x[idx + nyz] + x[idx + nz] + x[idx];
+
+					coarse[cidx] = (T) 0.25 * (
+						(rhs[idx] + rhs[idx + nyz] + rhs[idx + nz] + rhs[idx + nyz + nz]) -
+
+						(x[idx + (nyz << 1) + nz] + x[idx + (nyz << 1)] + x[idx - nyz + nz] + x[idx - nyz] - Psum_k) * dx2i +
+						(x[idx + nyz + (nz << 1)] + x[idx + (nz << 1)] + x[idx + nyz - nz] + x[idx - nz] - Psum_k) * dy2i -
+
+						(x[idx + nyz + nz + 1] + x[idx + nyz + 1] + x[idx + nz + 1] + x[idx + 1] - Psum_k) * dzp2i[k]
+						+
+						(Psum_k - x[idx + nyz + nz - 1] - x[idx + nyz - 1] - x[idx + nz - 1] - x[idx - 1]) * dzm2i[k]);
+				}
+			}
+		}
+
+		return;
+			}
+
+	// ----------------------------- //
+	if (type == nse::nse_const3d::axisXZ) {
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			for (jc = jcb; jc <= jce; jc++)
+			{
+				cidx = ic * cnyz + jc * cnz + kcb;
+				idx = ic * (nyz << 1) + jc * nz - gcxy_sh + kb;
+#else
+#pragma omp for nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			chidx = ic * cnyz + jcb * cnz + kcb;
+			shidx = ic * (nyz << 1) + jcb * nz - gcxy_sh + kb;
+			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += nz)
+			{
+				cidx = chidx;
+				idx = shidx;
+#endif
+				k = kb;
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+				for (kc = kcb; kc <= kce; kc++, k += 2, cidx++, idx += 2)
+				{
+					R_k = (rhs[idx] + rhs[idx + nyz]) -
+						(x[idx + (nyz << 1)] + x[idx - nyz] - x[idx + nyz] - x[idx]) * dx2i -
+
+						(x[idx + nyz + nz] + x[idx + nyz - nz] + x[idx + nz] + x[idx - nz]
+							- x[idx + nyz] - x[idx + nyz] - x[idx] - x[idx]) * dy2i -
+
+							(x[idx + nyz + 1] + x[idx + 1] - x[idx + nyz] - x[idx]) * dzp2i[k]
+						+
+						(x[idx + nyz] + x[idx] - x[idx + nyz - 1] - x[idx - 1]) * dzm2i[k];
+
+					R_kp = (rhs[idx + 1] + rhs[idx + nyz + 1]) -
+						(x[idx + (nyz << 1) + 1] + x[idx - nyz + 1] - x[idx + nyz + 1] - x[idx + 1]) * dx2i -
+
+						(x[idx + nyz + nz + 1] + x[idx + nyz - nz + 1] + x[idx + nz + 1] + x[idx - nz + 1]
+							- x[idx + nyz + 1] - x[idx + nyz + 1] - x[idx + 1] - x[idx + 1]) * dy2i -
+
+							(x[idx + nyz + 2] + x[idx + 2] - x[idx + nyz + 1] - x[idx + 1]) * dzp2i[k + 1]
+						+
+						(x[idx + nyz + 1] + x[idx + 1] - x[idx + nyz] - x[idx]) * dzm2i[k + 1];
+
+#ifndef USE_STRICT_MG
+					coarse[cidx] = (T) 0.25 * (R_k + R_kp);
+#else
+					coarse[cidx] = (T) 0.5 * cdzi[kc] * (R_k * dz[k] + R_kp * dz[k + 1]);
+#endif
+				}
+			}
+		}
+
+		return;
+			}
+
+	// ----------------------------- //
+	if (type == nse::nse_const3d::axisYZ) {
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			for (jc = jcb; jc <= jce; jc++)
+			{
+				cidx = ic * cnyz + jc * cnz + kcb;
+				idx = ic * nyz + jc * (nz << 1) - gcxy_sh + kb;
+#else
+#pragma omp for nowait
+		for (ic = icb; ic <= ice; ic++)
+		{
+			chidx = ic * cnyz + jcb * cnz + kcb;
+			shidx = ic * nyz + jcb * (nz << 1) - gcxy_sh + kb;
+			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += (nz << 1))
+			{
+				cidx = chidx;
+				idx = shidx;
+#endif
+				k = kb;
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+				for (kc = kcb; kc <= kce; kc++, k += 2, cidx++, idx += 2)
+				{
+					R_k = (rhs[idx] + rhs[idx + nz]) -
+						(x[idx + nyz + nz] + x[idx - nyz + nz] + x[idx + nyz] + x[idx - nyz]
+							- x[idx + nz] - x[idx + nz] - x[idx] - x[idx]) * dx2i -
+
+							(x[idx + (nz << 1)] + x[idx - nz] - x[idx + nz] - x[idx]) * dy2i -
+
+						(x[idx + nz + 1] + x[idx + 1] - x[idx + nz] - x[idx]) * dzp2i[k]
+						+
+						(x[idx + nz] + x[idx] - x[idx + nz - 1] - x[idx - 1]) * dzm2i[k];
+
+					R_kp = (rhs[idx + 1] + rhs[idx + nz + 1]) -
+						(x[idx + nyz + nz + 1] + x[idx - nyz + nz + 1] + x[idx + nyz + 1] + x[idx - nyz + 1]
+							- x[idx + nz + 1] - x[idx + nz + 1] - x[idx + 1] - x[idx + 1]) * dx2i -
+
+							(x[idx + (nz << 1) + 1] + x[idx - nz + 1] - x[idx + nz + 1] - x[idx + 1]) * dy2i -
+
+						(x[idx + nz + 2] + x[idx + 2] - x[idx + nz + 1] - x[idx + 1]) * dzp2i[k + 1]
+						+
+						(x[idx + nz + 1] + x[idx + 1] - x[idx + nz] - x[idx]) * dzm2i[k + 1];
+
+#ifndef USE_STRICT_MG
+					coarse[cidx] = (T) 0.25 * (R_k + R_kp);
+#else
+					coarse[cidx] = (T) 0.5 * cdzi[kc] * (R_k * dz[k] + R_kp * dz[k + 1]);
+#endif
+				}
+			}
+		}
+
+		return;
+			}
+		}
+// ------------------------------------------------------------------------ //
+
+
// MG prolongation (coarse -> fine) over the coarse-grid index range
// [icb,ice] x [jcb,jce] x [kcb,kce]; the interpolated coarse correction
// is ADDED to the fine grid (fine[...] += ...), it does not overwrite.
//
// 'type' selects which axes are refined (semi-coarsening support):
//  - along a refined axis each coarse cell updates two fine cells,
//  - along a non-refined axis the coarse value is injected as-is.
// For axisXYZ the xy-plane interpolation uses weights 9/16, 3/16, 1/16;
// the z-direction uses 3/4 : 1/4, or -- with USE_STRICT_MG -- weights
// built from the grid steps dz/cdz and the precomputed factors
// cdzpi/cdzmi (presumably to handle non-uniform z spacing; confirm).
//
// NOTE(review): the loops use "#pragma omp for ... nowait" with no
// enclosing "parallel" -- this is intended to run inside an existing
// OpenMP parallel region provided by the caller.
template< typename T >
void poisson3d::mg_prolongate_omp(	// MG Prolongate: Coarse -> Fine
	T* _RESTRICT fine,
	const T* _RESTRICT const coarse,
	const nse::nse_const3d::axisType type,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

	const int icb, const int ice,
	const int jcb, const int jce,
	const int kcb, const int kce
#ifdef USE_STRICT_MG
	,
	const T* _RESTRICT const dz, const T* _RESTRICT const cdz,
	const T* _RESTRICT const cdzpi, const T* _RESTRICT const cdzmi
#endif
)
{
	// yz-plane strides (linear-index step per unit i) on coarse/fine grids //
	const int cnyz = cny * cnz;
	const int nyz = ny * nz;

	// ghost cell shifts //
	// coarse->fine index offset per direction:
	//   refined axis: (2*cgc - gc); injected axis: (cgc - gc) //
	const int gcx_sh = (
		(type == nse::nse_const3d::axisX) || 
		(type == nse::nse_const3d::axisXY) ||
		(type == nse::nse_const3d::axisXZ) || 
		(type == nse::nse_const3d::axisXYZ)) ? ((cgcx << 1) - gcx) : (cgcx - gcx);
	const int gcy_sh = (
		(type == nse::nse_const3d::axisY) || 
		(type == nse::nse_const3d::axisXY) ||
		(type == nse::nse_const3d::axisYZ) || 
		(type == nse::nse_const3d::axisXYZ)) ? ((cgcy << 1) - gcy) : (cgcy - gcy);
	const int gcz_sh = (
		(type == nse::nse_const3d::axisZ) || 
		(type == nse::nse_const3d::axisXZ) ||
		(type == nse::nse_const3d::axisYZ) || 
		(type == nse::nse_const3d::axisXYZ)) ? ((cgcz << 1) - gcz) : (cgcz - gcz);
	const int gcxy_sh = gcx_sh * nyz + gcy_sh * nz;

	// k starting index on fine grid //
	const int kb = (
		(type == nse::nse_const3d::axisZ) || 
		(type == nse::nse_const3d::axisXZ) ||
		(type == nse::nse_const3d::axisYZ) || 
		(type == nse::nse_const3d::axisXYZ)) ? ((kcb << 1) - gcz_sh) : (kcb - gcz_sh);

	int ic, jc, kc, k, idx, cidx;
#ifndef USE_OPENMP_2D_CYCLE
	// running (i,j)-row offsets when the (ic,jc) loops are not collapsed //
	int shidx, chidx;
#endif


	// ----------------------------- //
	// full 3D refinement: each coarse cell updates 8 fine cells //
	if (type == nse::nse_const3d::axisXYZ) {
		const T C1 = (T) 0.5625;	// 9.0 / 16.0
		const T C2 = (T) 0.1875;	// 3.0 / 16.0
		const T C3 = (T) 0.0625;	// 1.0 / 16.0

		// xy-interpolated values at (k-1, k, k+1) for the 4 fine (i,j) targets //
		T C_ijk, C_ijkp, C_ijkm;
		T C_ijpk, C_ijpkp, C_ijpkm;
		T C_ipjk, C_ipjkp, C_ipjkm;
		T C_ipjpk, C_ipjpkp, C_ipjpkm;
#ifdef USE_STRICT_MG
		T alpha_k, beta_k, alpha_kp, beta_kp;
#endif

		// USE_OPENMP_2D_CYCLE: collapse(2) over (ic,jc) and recompute the
		// row offsets per jc; otherwise carry them incrementally in
		// chidx/shidx across the jc loop //
#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse(2) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * (nyz << 1) + jc * (nz << 1) - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * (nyz << 1) + jcb * (nz << 1) - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += (nz << 1))
			{
				cidx = chidx;
				idx = shidx;
#endif
				k = kb;

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, k += 2, cidx++, idx += 2)
				{
					// xy-plane interpolation (9/16, 3/16, 3/16, 1/16) toward
					// each of the 4 fine (i,j) targets, at kc-1, kc, kc+1 //
					C_ijk = C1 * coarse[cidx] + C2 * (coarse[cidx - cnyz] + coarse[cidx - cnz]) + C3 * coarse[cidx - cnyz - cnz];
					C_ijkp = C1 * coarse[cidx + 1] + C2 * (coarse[cidx - cnyz + 1] + coarse[cidx - cnz + 1]) + C3 * coarse[cidx - cnyz - cnz + 1];
					C_ijkm = C1 * coarse[cidx - 1] + C2 * (coarse[cidx - cnyz - 1] + coarse[cidx - cnz - 1]) + C3 * coarse[cidx - cnyz - cnz - 1];

					C_ijpk = C1 * coarse[cidx] + C2 * (coarse[cidx - cnyz] + coarse[cidx + cnz]) + C3 * coarse[cidx - cnyz + cnz];
					C_ijpkp = C1 * coarse[cidx + 1] + C2 * (coarse[cidx - cnyz + 1] + coarse[cidx + cnz + 1]) + C3 * coarse[cidx - cnyz + cnz + 1];
					C_ijpkm = C1 * coarse[cidx - 1] + C2 * (coarse[cidx - cnyz - 1] + coarse[cidx + cnz - 1]) + C3 * coarse[cidx - cnyz + cnz - 1];

					C_ipjk = C1 * coarse[cidx] + C2 * (coarse[cidx + cnyz] + coarse[cidx - cnz]) + C3 * coarse[cidx + cnyz - cnz];
					C_ipjkp = C1 * coarse[cidx + 1] + C2 * (coarse[cidx + cnyz + 1] + coarse[cidx - cnz + 1]) + C3 * coarse[cidx + cnyz - cnz + 1];
					C_ipjkm = C1 * coarse[cidx - 1] + C2 * (coarse[cidx + cnyz - 1] + coarse[cidx - cnz - 1]) + C3 * coarse[cidx + cnyz - cnz - 1];

					C_ipjpk = C1 * coarse[cidx] + C2 * (coarse[cidx + cnyz] + coarse[cidx + cnz]) + C3 * coarse[cidx + cnyz + cnz];
					C_ipjpkp = C1 * coarse[cidx + 1] + C2 * (coarse[cidx + cnyz + 1] + coarse[cidx + cnz + 1]) + C3 * coarse[cidx + cnyz + cnz + 1];
					C_ipjpkm = C1 * coarse[cidx - 1] + C2 * (coarse[cidx + cnyz - 1] + coarse[cidx + cnz - 1]) + C3 * coarse[cidx + cnyz + cnz - 1];

#ifdef USE_STRICT_MG
					// z weights from grid steps: alpha for the cell value,
					// beta for the z-neighbor //
					alpha_k = (cdz[kc - 1] + dz[k]) * cdzmi[kc];
					beta_k = dz[k + 1] * cdzmi[kc];

					alpha_kp = (cdz[kc + 1] + dz[k + 1]) * cdzpi[kc];
					beta_kp = dz[k] * cdzpi[kc];

					fine[idx] += alpha_k * C_ijk + beta_k * C_ijkm;
					fine[idx + 1] += alpha_kp * C_ijk + beta_kp * C_ijkp;
					fine[idx + nz] += alpha_k * C_ijpk + beta_k * C_ijpkm;
					fine[idx + nz + 1] += alpha_kp * C_ijpk + beta_kp * C_ijpkp;
					fine[idx + nyz] += alpha_k * C_ipjk + beta_k * C_ipjkm;
					fine[idx + nyz + 1] += alpha_kp * C_ipjk + beta_kp * C_ipjkp;
					fine[idx + nyz + nz] += alpha_k * C_ipjpk + beta_k * C_ipjpkm;
					fine[idx + nyz + nz + 1] += alpha_kp * C_ipjpk + beta_kp * C_ipjpkp;
#else
					// uniform-grid z weights: 3/4 cell value, 1/4 z-neighbor //
					fine[idx] += (T) 0.75 * C_ijk + (T) 0.25 * C_ijkm;
					fine[idx + 1] += (T) 0.75 * C_ijk + (T) 0.25 * C_ijkp;
					fine[idx + nz] += (T) 0.75 * C_ijpk + (T) 0.25 * C_ijpkm;
					fine[idx + nz + 1] += (T) 0.75 * C_ijpk + (T) 0.25 * C_ijpkp;
					fine[idx + nyz] += (T) 0.75 * C_ipjk + (T) 0.25 * C_ipjkm;
					fine[idx + nyz + 1] += (T) 0.75 * C_ipjk + (T) 0.25 * C_ipjkp;
					fine[idx + nyz + nz] += (T) 0.75 * C_ipjpk + (T) 0.25 * C_ipjpkm;
					fine[idx + nyz + nz + 1] += (T) 0.75 * C_ipjpk + (T) 0.25 * C_ipjpkp;
#endif
				}
			}
		}

		return;
	}

	// ----------------------------- //
	// z-only refinement: 2 fine cells per coarse cell, z-interpolated //
	if (type == nse::nse_const3d::axisZ) {
#ifdef USE_STRICT_MG
		T alpha_k, beta_k, alpha_kp, beta_kp;
#endif

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * nyz + jc * nz - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * nyz + jcb * nz - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += nz)
			{
				cidx = chidx;
				idx = shidx;
#endif
				k = kb;

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, k += 2, cidx++, idx += 2)
				{
#ifdef USE_STRICT_MG
					alpha_k = (cdz[kc - 1] + dz[k]) * cdzmi[kc];
					beta_k = dz[k + 1] * cdzmi[kc];

					alpha_kp = (cdz[kc + 1] + dz[k + 1]) * cdzpi[kc];
					beta_kp = dz[k] * cdzpi[kc];

					fine[idx] += alpha_k * coarse[cidx] + beta_k * coarse[cidx - 1];
					fine[idx + 1] += alpha_kp * coarse[cidx] + beta_kp * coarse[cidx + 1];
#else
					fine[idx] += (T) 0.75 * coarse[cidx] + (T) 0.25 * coarse[cidx - 1];
					fine[idx + 1] += (T) 0.75 * coarse[cidx] + (T) 0.25 * coarse[cidx + 1];
#endif
				}
			}
		}

		return;
	}

	// ----------------------------- //
	// y-only refinement: inject coarse value into 2 fine cells along y //
	if (type == nse::nse_const3d::axisY) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * nyz + jc * (nz << 1) - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * nyz + jcb * (nz << 1) - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += (nz << 1))
			{
				cidx = chidx;
				idx = shidx;
#endif

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, cidx++, idx++)
				{
					fine[idx] += coarse[cidx];
					fine[idx + nz] += coarse[cidx];
				}
			}
		}

		return;
	}

	// ----------------------------- //
	// x-only refinement: inject coarse value into 2 fine cells along x //
	if (type == nse::nse_const3d::axisX) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * (nyz << 1) + jc * nz - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * (nyz << 1) + jcb * nz - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += nz)
			{
				cidx = chidx;
				idx = shidx;
#endif

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, cidx++, idx++)
				{
					fine[idx] += coarse[cidx];
					fine[idx + nyz] += coarse[cidx];
				}
			}
		}

		return;
	}

	// ----------------------------- //
	// xy refinement: inject coarse value into 4 fine cells (x,y pairs) //
	if (type == nse::nse_const3d::axisXY) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * (nyz << 1) + jc * (nz << 1) - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * (nyz << 1) + jcb * (nz << 1) - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += (nz << 1))
			{
				cidx = chidx;
				idx = shidx;
#endif

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, cidx++, idx++)
				{
					fine[idx] += coarse[cidx];
					fine[idx + nz] += coarse[cidx];
					fine[idx + nyz] += coarse[cidx];
					fine[idx + nyz + nz] += coarse[cidx];
				}
			}
		}

		return;
			}

	// ----------------------------- //
	// xz refinement: inject coarse value into 4 fine cells (x,z pairs) //
	if (type == nse::nse_const3d::axisXZ) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * (nyz << 1) + jc * nz - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * (nyz << 1) + jcb * nz - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += nz)
			{
				cidx = chidx;
				idx = shidx;
#endif

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, cidx++, idx += 2)
				{
					fine[idx] += coarse[cidx];
					fine[idx + 1] += coarse[cidx];
					fine[idx + nyz] += coarse[cidx];
					fine[idx + nyz + 1] += coarse[cidx];
				}
			}
		}

		return;
			}

	// ----------------------------- //
	// yz refinement: inject coarse value into 4 fine cells (y,z pairs) //
	if (type == nse::nse_const3d::axisYZ) {

#ifdef USE_OPENMP_2D_CYCLE
#pragma omp for collapse( 2 ) nowait
		for (ic = icb; ic <= ice; ic++)
		{
			for (jc = jcb; jc <= jce; jc++)
			{
				cidx = ic * cnyz + jc * cnz + kcb;
				idx = ic * nyz + jc * (nz << 1) - gcxy_sh + kb;
#else
#pragma omp for nowait
		for (ic = icb; ic <= ice; ic++)
		{
			chidx = ic * cnyz + jcb * cnz + kcb;
			shidx = ic * nyz + jcb * (nz << 1) - gcxy_sh + kb;
			for (jc = jcb; jc <= jce; jc++, chidx += cnz, shidx += (nz << 1))
			{
				cidx = chidx;
				idx = shidx;
#endif

#ifdef USE_OPENMP_SIMD
#pragma omp simd
#endif
#ifdef USE_INTEL_SIMD
#pragma simd
#endif
				for (kc = kcb; kc <= kce; kc++, cidx++, idx += 2)
				{
					fine[idx] += coarse[cidx];
					fine[idx + 1] += coarse[cidx];
					fine[idx + nz] += coarse[cidx];
					fine[idx + nz + 1] += coarse[cidx];
				}
			}
		}

		return;
			}
		}
+// ------------------------------------------------------------------------ //
+
+template< typename T >
+void poisson3d::mg_prolongate_omp(	// MG Prolongate: Coarse -> Fine
+	T* _RESTRICT fine,
+	const T* _RESTRICT const coarse,
+	const nse::nse_const3d::axisType type,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const int cnx, const int cny, const int cnz,
+	const int cgcx, const int cgcy, const int cgcz
+#ifdef USE_STRICT_MG
+	,
+	const T* _RESTRICT const dz, const T* _RESTRICT const cdz,
+	const T* _RESTRICT const cdzpi, const T* _RESTRICT const cdzmi
+#endif
+)
+{
+	mg_prolongate_omp(fine, coarse, type,
+
+		nx, ny, nz, gcx, gcy, gcz,
+		cnx, cny, cnz, cgcx, cgcy, cgcz,
+
+		cgcx, cnx - cgcx - 1,
+		cgcy, cny - cgcy - 1,
+		cgcz, cnz - cgcz - 1
+#ifdef USE_STRICT_MG
+		,
+		dz, cdz,
+		cdzpi, cdzmi
+#endif
+	);
+}
+// ----------------------------------------------------------------- //
+
+
+// * [laplace-restrict-residual] for poisson equation with async exchanges * //
// Restricts the fine-grid residual of the Laplace operator onto the
// coarse grid (res_coarse), overlapping the restriction with MPI halo
// exchanges of x_fine.
//
// Single-rank path: apply boundary conditions and periodic wraps to
// x_fine, then restrict the full coarse interior in one pass.
// Multi-rank path: start non-blocking cross halo exchanges, restrict
// the interior cells that do not need halo data, then finalize the
// x/y/z exchanges one direction at a time and restrict the matching
// boundary strips.
//
// NOTE(review): bc.p_west/p_east/p_south/p_north/p_bottom/p_top look
// like 0/1 flags marking sides with a pending exchange (the strips
// below are one coarse cell wide) -- confirm in poisson_dynamic_bc.
template< typename T >
void poisson3d::laplace_restrict_residual_omp(T* _RESTRICT res_coarse,
	T* _RESTRICT x_fine, const T* _RESTRICT const rhs_fine,
	const nse::nse_const3d::axisType type,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const T dx2i_fine, const T dy2i_fine,
	const T* _RESTRICT const dzp2i_fine, const T* _RESTRICT const dzm2i_fine,

#ifdef USE_STRICT_MG
	const T* _RESTRICT const dz_fine, const T* _RESTRICT const dzi_coarse,
#endif

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc)
{
	// full coarse interior bounds //
	const int icb = cgcx, ice = cnx - cgcx - 1;
	const int jcb = cgcy, jce = cny - cgcy - 1;
	const int kcb = cgcz, kce = cnz - cgcz - 1;

	if (mpi_com.size == 1)	// single MPI processor restriction //
	{
		put_bc_omp(x_fine, nx, ny, nz, gcx, gcy, gcz,
			mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
			mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,
			bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);

#pragma omp barrier

		// wrap periodic directions locally (no MPI neighbors) //
		if (bc.x_periodic) {
			nse::apply_periodic_x_omp(x_fine, nx, ny, nz, gcx, gcy, gcz,
				1, 0, 0);
		}
		if (bc.y_periodic) {
			nse::apply_periodic_y_omp(x_fine, nx, ny, nz, gcx, gcy, gcz,
				0, 1, 0);
		}
		if (bc.z_periodic) {
			nse::apply_periodic_z_omp(x_fine, nx, ny, nz, gcx, gcy, gcz,
				0, 0, 1);
		}

		// ensure all threads see the wrapped halos before restricting //
		if (bc.x_periodic || bc.y_periodic || bc.z_periodic)
		{
#pragma omp barrier
		}

		mg_restrict_residual_omp(res_coarse, x_fine, rhs_fine, type,
			cnx, cny, cnz, cgcx, cgcy, cgcz,
			nx, ny, nz, gcx, gcy, gcz,

			icb, ice,
			jcb, jce,
			kcb, kce,

			dx2i_fine, dy2i_fine, dzp2i_fine, dzm2i_fine
#ifdef USE_STRICT_MG
			,
			dz_fine, dzi_coarse
#endif
		);

		return;
	}

	// MPI-async restriction: 4 requests per direction (x, y, z) //
	MPI_Request mpi_req[12];

	put_bc_omp(x_fine, nx, ny, nz, gcx, gcy, gcz,
		mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
		mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,
		bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);

#pragma omp barrier

	// start all cross halo exchanges (width = 1 in each direction) //
	mpi_com.push_exchange_cross_halo(x_fine, nx, ny, nz, gcx, gcy, gcz,
		1, 1, 1, bc.x_periodic, bc.y_periodic, bc.z_periodic, mpi_req);

	// restrict the interior that is independent of pending halo data:
	// bounds shrunk by the p_* flags on sides with an exchange //
	mg_restrict_residual_omp(res_coarse, x_fine, rhs_fine, type,
		cnx, cny, cnz, cgcx, cgcy, cgcz,
		nx, ny, nz, gcx, gcy, gcz,

		icb + bc.p_west, ice - bc.p_east,
		jcb + bc.p_south, jce - bc.p_north,
		kcb + bc.p_bottom, kce - bc.p_top,

		dx2i_fine, dy2i_fine, dzp2i_fine, dzm2i_fine
#ifdef USE_STRICT_MG
		,
		dz_fine, dzi_coarse
#endif
	);


	// finalize -x cross exchanges //
	mpi_com.pop_exchange_halo_x(x_fine, nx, ny, nz, gcx, gcy, gcz,
		1, 0, 0, bc.x_periodic, &mpi_req[0]);

	// west/east boundary strips (y/z strips still excluded) //
	if (bc.p_west)
		mg_restrict_residual_omp(res_coarse, x_fine, rhs_fine, type,
			cnx, cny, cnz, cgcx, cgcy, cgcz,
			nx, ny, nz, gcx, gcy, gcz,

			icb, icb,
			jcb + bc.p_south, jce - bc.p_north,
			kcb + bc.p_bottom, kce - bc.p_top,

			dx2i_fine, dy2i_fine, dzp2i_fine, dzm2i_fine
#ifdef USE_STRICT_MG
			,
			dz_fine, dzi_coarse
#endif
		);

	if (bc.p_east)
		mg_restrict_residual_omp(res_coarse, x_fine, rhs_fine, type,
			cnx, cny, cnz, cgcx, cgcy, cgcz,
			nx, ny, nz, gcx, gcy, gcz,

			ice, ice,
			jcb + bc.p_south, jce - bc.p_north,
			kcb + bc.p_bottom, kce - bc.p_top,

			dx2i_fine, dy2i_fine, dzp2i_fine, dzm2i_fine
#ifdef USE_STRICT_MG
			,
			dz_fine, dzi_coarse
#endif
		);


	// finalize -y cross exchanges //
	mpi_com.pop_exchange_halo_y(x_fine, nx, ny, nz, gcx, gcy, gcz,
		0, 1, 0, bc.y_periodic, &mpi_req[4]);

	// south/north strips now span the full x range (z strips excluded) //
	if (bc.p_south)
		mg_restrict_residual_omp(res_coarse, x_fine, rhs_fine, type,
			cnx, cny, cnz, cgcx, cgcy, cgcz,
			nx, ny, nz, gcx, gcy, gcz,

			icb, ice,
			jcb, jcb,
			kcb + bc.p_bottom, kce - bc.p_top,

			dx2i_fine, dy2i_fine, dzp2i_fine, dzm2i_fine
#ifdef USE_STRICT_MG
			,
			dz_fine, dzi_coarse
#endif
		);

	if (bc.p_north)
		mg_restrict_residual_omp(res_coarse, x_fine, rhs_fine, type,
			cnx, cny, cnz, cgcx, cgcy, cgcz,
			nx, ny, nz, gcx, gcy, gcz,

			icb, ice,
			jce, jce,
			kcb + bc.p_bottom, kce - bc.p_top,

			dx2i_fine, dy2i_fine, dzp2i_fine, dzm2i_fine
#ifdef USE_STRICT_MG
			,
			dz_fine, dzi_coarse
#endif
		);


	// finalize -z cross exchanges //
	mpi_com.pop_exchange_halo_z(x_fine, nx, ny, nz, gcx, gcy, gcz,
		0, 0, 1, bc.z_periodic, &mpi_req[8]);

	// bottom/top strips span the full x and y ranges //
	if (bc.p_bottom)
		mg_restrict_residual_omp(res_coarse, x_fine, rhs_fine, type,
			cnx, cny, cnz, cgcx, cgcy, cgcz,
			nx, ny, nz, gcx, gcy, gcz,

			icb, ice,
			jcb, jce,
			kcb, kcb,

			dx2i_fine, dy2i_fine, dzp2i_fine, dzm2i_fine
#ifdef USE_STRICT_MG
			,
			dz_fine, dzi_coarse
#endif
		);

	if (bc.p_top)
		mg_restrict_residual_omp(res_coarse, x_fine, rhs_fine, type,
			cnx, cny, cnz, cgcx, cgcy, cgcz,
			nx, ny, nz, gcx, gcy, gcz,

			icb, ice,
			jcb, jce,
			kce, kce,

			dx2i_fine, dy2i_fine, dzp2i_fine, dzm2i_fine
#ifdef USE_STRICT_MG
			,
			dz_fine, dzi_coarse
#endif
		);

}
+// ------------------------------------------------------------------------ //
+
+
+// * [laplace-prolongate] for poisson equation with async exchanges * //
// Prolongates the coarse-grid correction x_coarse onto the fine grid
// x_fine, overlapping the prolongation with MPI halo exchanges of the
// coarse field.
//
// Single-rank path: boundary conditions + periodic wraps on x_coarse,
// then one full-interior prolongation.
// Multi-rank path: push the -x halo exchange, have the master thread
// wait on it, prolongate the interior (shrunk by the p_* flags), then
// complete the remaining exchanges and fill the six boundary strips.
//
// NOTE(review): there is no omp barrier after the "omp master" wait
// before pop_exchange_halo_x -- presumably the pop call provides the
// necessary thread synchronization; confirm against mpiCom3d.
template< typename T >
void poisson3d::laplace_prolongate_omp(T* _RESTRICT x_fine,
	T* _RESTRICT x_coarse,
	const nse::nse_const3d::axisType type,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

#ifdef USE_STRICT_MG
	const T* _RESTRICT const dz_fine, const T* _RESTRICT const dz_coarse,
	const T* _RESTRICT const dzpi_coarse, const T* _RESTRICT const dzmi_coarse,
#endif

	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc)
{
	// full coarse interior bounds //
	const int ib = cgcx, ie = cnx - cgcx - 1;
	const int jb = cgcy, je = cny - cgcy - 1;
	const int kb = cgcz, ke = cnz - cgcz - 1;


	// chx/chy/chz = 1 if the corresponding axis is refined for this
	// 'type' (also used as exchange halo widths below) //
	const int chx = (
		(type == nse::nse_const3d::axisX) || 
		(type == nse::nse_const3d::axisXY) ||
		(type == nse::nse_const3d::axisXZ) || 
		(type == nse::nse_const3d::axisXYZ)) ? 1 : 0;
	const int chy = (
		(type == nse::nse_const3d::axisY) || 
		(type == nse::nse_const3d::axisXY) ||
		(type == nse::nse_const3d::axisYZ) || 
		(type == nse::nse_const3d::axisXYZ)) ? 1 : 0;
	const int chz = (
		(type == nse::nse_const3d::axisZ) || 
		(type == nse::nse_const3d::axisXZ) ||
		(type == nse::nse_const3d::axisYZ) || 
		(type == nse::nse_const3d::axisXYZ)) ? 1 : 0;

	// ----------------------------------------------------------- //
	if (mpi_com.size == 1)	// single MPI processor prolongation //
	{
		put_bc_omp(x_coarse, cnx, cny, cnz, cgcx, cgcy, cgcz,
			mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
			mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,
			bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);

#pragma omp barrier

		// wrap periodic directions locally, only along refined axes //
		if ((bc.x_periodic) && (chx)) {
			nse::apply_periodic_x_omp(x_coarse, cnx, cny, cnz, cgcx, cgcy, cgcz,
				chx, chy, chz);
#pragma omp barrier
		}
		if ((bc.y_periodic) && (chy)) {
			nse::apply_periodic_y_omp(x_coarse, cnx, cny, cnz, cgcx, cgcy, cgcz,
				chx, chy, chz);
#pragma omp barrier
		}
		if ((bc.z_periodic) && (chz)) {
			nse::apply_periodic_z_omp(x_coarse, cnx, cny, cnz, cgcx, cgcy, cgcz,
				chx, chy, chz);
#pragma omp barrier
		}

		mg_prolongate_omp(x_fine, x_coarse, type,
			nx, ny, nz, gcx, gcy, gcz,
			cnx, cny, cnz, cgcx, cgcy, cgcz,

			ib, ie,
			jb, je,
			kb, ke
#ifdef USE_STRICT_MG
			,
			dz_fine, dz_coarse,
			dzpi_coarse, dzmi_coarse
#endif
		);

		return;
	}

	// MPI-Async prolongation //
	// ----------------------------------------------------------- //
	MPI_Request mpi_req[4];

	put_bc_omp(x_coarse, cnx, cny, cnz, cgcx, cgcy, cgcz,
		mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
		mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,
		bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);

#pragma omp barrier

	// push exchange [x_coarse] in -x [width = chx, periodic: bc.x_periodic] //
	mpi_com.push_exchange_halo_x(x_coarse, cnx, cny, cnz, cgcx, cgcy, cgcz,
		chx, chy, chz, bc.x_periodic, mpi_req);

	// master thread completes the -x exchange before the main block //
#pragma omp master
	{
		if (mpi_com.size_x > 1) {
			MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
			for (int k = 0; k < 4; k++)
				mpi_req[k] = MPI_REQUEST_NULL;
		}
	}

	// main block //
	// interior prolongation, excluding strips on sides with exchanges //
	mg_prolongate_omp(x_fine, x_coarse, type,
		nx, ny, nz, gcx, gcy, gcz,
		cnx, cny, cnz, cgcx, cgcy, cgcz,

		ib + bc.p_west, ie - bc.p_east,
		jb + bc.p_south, je - bc.p_north,
		kb + bc.p_bottom, ke - bc.p_top
#ifdef USE_STRICT_MG
		,
		dz_fine, dz_coarse,
		dzpi_coarse, dzmi_coarse
#endif
	);

	// pop exchange [x_coarse] in -x [periodic: bc.x_periodic] //
	mpi_com.pop_exchange_halo_x(x_coarse, cnx, cny, cnz, cgcx, cgcy, cgcz,
		chx, chy, chz, bc.x_periodic, mpi_req);


	// push exchange [x_coarse] in -y [periodic: bc.y_periodic] //
	mpi_com.push_exchange_halo_y(x_coarse, cnx, cny, cnz, cgcx, cgcy, cgcz,
		chx, chy, chz, bc.y_periodic, mpi_req);

	// pop exchange [x_coarse] in -y [periodic: bc.y_periodic] //
	mpi_com.pop_exchange_halo_y(x_coarse, cnx, cny, cnz, cgcx, cgcy, cgcz,
		chx, chy, chz, bc.y_periodic, mpi_req);

	// push exchange [x_coarse] in -z [periodic: bc.z_periodic] //
	mpi_com.push_exchange_halo_z(x_coarse, cnx, cny, cnz, cgcx, cgcy, cgcz,
		chx, chy, chz, bc.z_periodic, mpi_req);

	// pop exchange [x_coarse] in -z [periodic: bc.z_periodic] //
	mpi_com.pop_exchange_halo_z(x_coarse, cnx, cny, cnz, cgcx, cgcy, cgcz,
		chx, chy, chz, bc.z_periodic, mpi_req);

	// west strip //
	if (bc.p_west) {
		mg_prolongate_omp(x_fine, x_coarse, type,
			nx, ny, nz, gcx, gcy, gcz,
			cnx, cny, cnz, cgcx, cgcy, cgcz,
			ib, ib + bc.p_west - 1,
			jb, je,
			kb, ke
#ifdef USE_STRICT_MG
			,
			dz_fine, dz_coarse,
			dzpi_coarse, dzmi_coarse
#endif
		);
	}
	// east strip //
	if (bc.p_east) {
		mg_prolongate_omp(x_fine, x_coarse, type,
			nx, ny, nz, gcx, gcy, gcz,
			cnx, cny, cnz, cgcx, cgcy, cgcz,
			ie - bc.p_east + 1, ie,
			jb, je,
			kb, ke
#ifdef USE_STRICT_MG
			,
			dz_fine, dz_coarse,
			dzpi_coarse, dzmi_coarse
#endif
		);
	}
	// south strip //
	if (bc.p_south) {
		mg_prolongate_omp(x_fine, x_coarse, type,
			nx, ny, nz, gcx, gcy, gcz,
			cnx, cny, cnz, cgcx, cgcy, cgcz,
			ib + bc.p_west, ie - bc.p_east,
			jb, jb + bc.p_south - 1,
			kb, ke
#ifdef USE_STRICT_MG
			,
			dz_fine, dz_coarse,
			dzpi_coarse, dzmi_coarse
#endif
		);
	}
	// north strip //
	if (bc.p_north) {
		mg_prolongate_omp(x_fine, x_coarse, type,
			nx, ny, nz, gcx, gcy, gcz,
			cnx, cny, cnz, cgcx, cgcy, cgcz,
			ib + bc.p_west, ie - bc.p_east,
			je - bc.p_north + 1, je,
			kb, ke
#ifdef USE_STRICT_MG
			,
			dz_fine, dz_coarse,
			dzpi_coarse, dzmi_coarse
#endif
		);
	}
	// bottom strip //
	if (bc.p_bottom) {
		mg_prolongate_omp(x_fine, x_coarse, type,
			nx, ny, nz, gcx, gcy, gcz,
			cnx, cny, cnz, cgcx, cgcy, cgcz,
			ib + bc.p_west, ie - bc.p_east,
			jb + bc.p_south, je - bc.p_north,
			kb, kb + bc.p_bottom - 1
#ifdef USE_STRICT_MG
			,
			dz_fine, dz_coarse,
			dzpi_coarse, dzmi_coarse
#endif
		);
	}
	// top strip //
	if (bc.p_top) {
		mg_prolongate_omp(x_fine, x_coarse, type,
			nx, ny, nz, gcx, gcy, gcz,
			cnx, cny, cnz, cgcx, cgcy, cgcz,
			ib + bc.p_west, ie - bc.p_east,
			jb + bc.p_south, je - bc.p_north,
			ke - bc.p_top + 1, ke
#ifdef USE_STRICT_MG
			,
			dz_fine, dz_coarse,
			dzpi_coarse, dzmi_coarse
#endif
		);
	}
}
+// ------------------------------------------------------------------------ //
+
+
// Initialization //
// Explicit template instantiations (float / double): the template
// definitions live in this translation unit, so each public entry
// point is instantiated here for both supported precisions. //
// ------------------------------------------------------------------------ //

// * initialize: restriction * //
template void poisson3d::mg_restrict_omp(float* _RESTRICT coarse,
	const float* _RESTRICT const fine,
	const nse::nse_const3d::axisType type,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const int icb, const int ice,
	const int jcb, const int jce,
	const int kcb, const int kce
#ifdef USE_STRICT_MG
	,
	const float* _RESTRICT const dz, const float* _RESTRICT const cdzi
#endif
);

template void poisson3d::mg_restrict_omp(double* _RESTRICT coarse,
	const double* _RESTRICT const fine,
	const nse::nse_const3d::axisType type,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const int icb, const int ice,
	const int jcb, const int jce,
	const int kcb, const int kce
#ifdef USE_STRICT_MG
	,
	const double* _RESTRICT const dz, const double* _RESTRICT const cdzi
#endif
);
// ----------------------------------------------------------------- //


// * initialize: residual restriction * //
template void poisson3d::mg_restrict_residual_omp(float* _RESTRICT coarse,
	const float* _RESTRICT const x, const float* _RESTRICT const rhs,
	const nse::nse_const3d::axisType type,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const int icb, const int ice,
	const int jcb, const int jce,
	const int kcb, const int kce,

	const float dx2i, const float dy2i,
	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i
#ifdef USE_STRICT_MG
	,
	const float* _RESTRICT const dz, const float* _RESTRICT const cdzi
#endif
);

template void poisson3d::mg_restrict_residual_omp(double* _RESTRICT coarse,
	const double* _RESTRICT const x, const double* _RESTRICT const rhs,
	const nse::nse_const3d::axisType type,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const int icb, const int ice,
	const int jcb, const int jce,
	const int kcb, const int kce,

	const double dx2i, const double dy2i,
	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i
#ifdef USE_STRICT_MG
	,
	const double* _RESTRICT const dz, const double* _RESTRICT const cdzi
#endif
);
// ----------------------------------------------------------------- //


// * initialize: prolongation * //
template void poisson3d::mg_prolongate_omp(float* _RESTRICT fine,
	const float* _RESTRICT const coarse,
	const nse::nse_const3d::axisType type,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

	const int icb, const int ice,
	const int jcb, const int jce,
	const int kcb, const int kce
#ifdef USE_STRICT_MG
	,
	const float* _RESTRICT const dz, const float* _RESTRICT const cdz,
	const float* _RESTRICT const cdzpi, const float* _RESTRICT const cdzmi
#endif
);

template void poisson3d::mg_prolongate_omp(double* _RESTRICT fine,
	const double* _RESTRICT const coarse,
	const nse::nse_const3d::axisType type,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

	const int icb, const int ice,
	const int jcb, const int jce,
	const int kcb, const int kce
#ifdef USE_STRICT_MG
	,
	const double* _RESTRICT const dz, const double* _RESTRICT const cdz,
	const double* _RESTRICT const cdzpi, const double* _RESTRICT const cdzmi
#endif
);
// ----------------------------------------------------------------- //


// * initialize: domain prolongation * //
template void poisson3d::mg_prolongate_omp(float* _RESTRICT fine,
	const float* _RESTRICT const coarse,
	const nse::nse_const3d::axisType type,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz
#ifdef USE_STRICT_MG
	,
	const float* _RESTRICT const dz, const float* _RESTRICT const cdz,
	const float* _RESTRICT const cdzpi, const float* _RESTRICT const cdzmi
#endif
);

template void poisson3d::mg_prolongate_omp(double* _RESTRICT fine,
	const double* _RESTRICT const coarse,
	const nse::nse_const3d::axisType type,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz
#ifdef USE_STRICT_MG
	,
	const double* _RESTRICT const dz, const double* _RESTRICT const cdz,
	const double* _RESTRICT const cdzpi, const double* _RESTRICT const cdzmi
#endif
);
// ----------------------------------------------------------------- //


// * initialize: Restriction: includes MPI * //
template void poisson3d::laplace_restrict_residual_omp(float* _RESTRICT y_coarse,
	float* _RESTRICT x_fine, const float* _RESTRICT const rhs_fine,
	const nse::nse_const3d::axisType type,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const float dx2i_fine, const float dy2i_fine,
	const float* _RESTRICT const dzp2i_fine, const float* _RESTRICT const dzm2i_fine,

#ifdef USE_STRICT_MG
	const float* _RESTRICT const dz_fine, const float* _RESTRICT const dzi_coarse,
#endif
	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc
);

template void poisson3d::laplace_restrict_residual_omp(double* _RESTRICT y_coarse,
	double* _RESTRICT x_fine, const double* _RESTRICT const rhs_fine,
	const nse::nse_const3d::axisType type,

	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,

	const double dx2i_fine, const double dy2i_fine,
	const double* _RESTRICT const dzp2i_fine, const double* _RESTRICT const dzm2i_fine,

#ifdef USE_STRICT_MG
	const double* _RESTRICT const dz_fine, const double* _RESTRICT const dzi_coarse,
#endif
	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc);
// ------------------------------------------------------------------------ //


// * initialize: Prolongation: includes MPI * //
template void poisson3d::laplace_prolongate_omp(float* _RESTRICT x_fine,
	float* _RESTRICT x_coarse,
	const nse::nse_const3d::axisType type,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

#ifdef USE_STRICT_MG
	const float* _RESTRICT const dz_fine, const float* _RESTRICT const dz_coarse,
	const float* _RESTRICT const dzpi_coarse, const float* _RESTRICT const dzmi_coarse,
#endif
	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc
);
template void poisson3d::laplace_prolongate_omp(double* _RESTRICT x_fine,
	double* _RESTRICT x_coarse,
	const nse::nse_const3d::axisType type,

	const int nx, const int ny, const int nz,
	const int gcx, const int gcy, const int gcz,
	const int cnx, const int cny, const int cnz,
	const int cgcx, const int cgcy, const int cgcz,

#ifdef USE_STRICT_MG
	const double* _RESTRICT const dz_fine, const double* _RESTRICT const dz_coarse,
	const double* _RESTRICT const dzpi_coarse, const double* _RESTRICT const dzmi_coarse,
#endif
	const nse::mpiCom3d& mpi_com,
	const nse::poisson_dynamic_bc& bc
);
// ------------------------------------------------------------------------ //
diff --git a/pois-mg-base3d.h b/pois-mg-base3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a76c9ef2ccec1327131e2006c23044f0b796389
--- /dev/null
+++ b/pois-mg-base3d.h
@@ -0,0 +1,592 @@
+#pragma once
+
+// [pois-mg-base3d.h]: 3D Poisson Multigrid base components
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "grid3d.h"			// using nse_const3d::axisType //
+#include "mpi-com3d.h"
+#include "pois-def3d.h"
+#include "mg-def3d.h"		// using MG definitions //
+
+
+namespace poisson3d
+{
+	// * Restriction * //
+	template< typename T >
+	void mg_restrict(T* _RESTRICT coarse,
+		const T* _RESTRICT const fine,
+		const nse::nse_const3d::axisType type,
+
+		const int cnx, const int cny, const int cnz,		// coarse grid //
+		const int cgcx, const int cgcy, const int cgcz,
+
+		const int nx, const int ny, const int nz,			// fine grid //
+		const int gcx, const int gcy, const int gcz,
+
+		const int icb, const int ice,
+		const int jcb, const int jce,
+		const int kcb, const int kce
+#ifdef USE_STRICT_MG
+		,
+		const T* _RESTRICT const dz, const T* _RESTRICT const cdzi
+#endif
+	);
+	// ----------------------------------------------------------------- //
+
+
+	// * Residual Restriction * //
+	template< typename T >
+	void mg_restrict_residual(T* _RESTRICT coarse,
+		const T* _RESTRICT const x, const T* _RESTRICT const rhs,
+		const nse::nse_const3d::axisType type,
+
+		const int cnx, const int cny, const int cnz,		// coarse grid //
+		const int cgcx, const int cgcy, const int cgcz,
+
+		const int nx, const int ny, const int nz,			// fine grid //
+		const int gcx, const int gcy, const int gcz,
+
+		const int icb, const int ice,
+		const int jcb, const int jce,
+		const int kcb, const int kce,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i
+#ifdef USE_STRICT_MG
+		,
+		const T* _RESTRICT const dz, const T* _RESTRICT const cdzi
+#endif
+	);
+	// ----------------------------------------------------------------- //
+
+
+	// * Prolongation [no extension inside] * //
+	template< typename T >
+	void mg_prolongate(T* _RESTRICT fine,
+		const T* _RESTRICT const coarse,
+		const nse::nse_const3d::axisType type,
+
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const int cnx, const int cny, const int cnz,
+		const int cgcx, const int cgcy, const int cgcz,
+
+		const int icb, const int ice,
+		const int jcb, const int jce,
+		const int kcb, const int kce
+#ifdef USE_STRICT_MG
+		,
+		const T* _RESTRICT const dz, const T* _RESTRICT const cdz,
+		const T* _RESTRICT const cdzpi, const T* _RESTRICT const cdzmi
+#endif
+	);
+
+	template< typename T >
+	void mg_prolongate(T* _RESTRICT fine,
+		const T* _RESTRICT const coarse,
+		const nse::nse_const3d::axisType type,
+
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const int cnx, const int cny, const int cnz,
+		const int cgcx, const int cgcy, const int cgcz
+#ifdef USE_STRICT_MG
+		,
+		const T* _RESTRICT const dz, const T* _RESTRICT const cdz,
+		const T* _RESTRICT const cdzpi, const T* _RESTRICT const cdzmi
+#endif
+	);
+	// ----------------------------------------------------------------- //
+
+
+	// Residual Restriction: includes MPI //
+	template< typename T >
+	void laplace_restrict_residual(T* _RESTRICT y_coarse,
+		T* _RESTRICT x_fine, const T* _RESTRICT const rhs_fine,
+		const nse::nse_const3d::axisType type,
+
+		const int cnx, const int cny, const int cnz,
+		const int cgcx, const int cgcy, const int cgcz,
+
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const T dx2i_fine, const T dy2i_fine,
+		const T* _RESTRICT const dzp2i_fine, const T* _RESTRICT const dzm2i_fine,
+
+#ifdef USE_STRICT_MG
+		const T* _RESTRICT const dz_fine, const T* _RESTRICT const dzi_coarse,
+#endif
+		const nse::mpiCom3d& mpi_com,
+		const nse::poisson_dynamic_bc& bc);
+	// -------------------------------------------------------------------- //
+
+
+	// Solution Prolongation: includes MPI //
+	template< typename T >
+	void laplace_prolongate(T* _RESTRICT x_fine,
+		T* _RESTRICT x_coarse,
+		const nse::nse_const3d::axisType type,
+
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const int cnx, const int cny, const int cnz,
+		const int cgcx, const int cgcy, const int cgcz,
+
+#ifdef USE_STRICT_MG
+		const T* _RESTRICT const dz_fine, const T* _RESTRICT const dz_coarse,
+		const T* _RESTRICT const dzpi_coarse, const T* _RESTRICT const dzmi_coarse,
+#endif
+		const nse::mpiCom3d& mpi_com,
+		const nse::poisson_dynamic_bc& bc);
+	// -------------------------------------------------------------------- //
+}
+
+
+// OpenMP //
+namespace poisson3d
+{
+	// * Restriction * //
+	template< typename T >
+	void mg_restrict_omp(T* _RESTRICT coarse,
+		const T* _RESTRICT const fine,
+		const nse::nse_const3d::axisType type,
+
+		const int cnx, const int cny, const int cnz,		// coarse grid //
+		const int cgcx, const int cgcy, const int cgcz,
+
+		const int nx, const int ny, const int nz,			// fine grid //
+		const int gcx, const int gcy, const int gcz,
+
+		const int icb, const int ice,
+		const int jcb, const int jce,
+		const int kcb, const int kce
+#ifdef USE_STRICT_MG
+		,
+		const T* _RESTRICT const dz, const T* _RESTRICT const cdzi
+#endif
+	);
+	// ----------------------------------------------------------------- //
+
+
+	// * Residual Restriction * //
+	template< typename T >
+	void mg_restrict_residual_omp(T* _RESTRICT coarse,
+		const T* _RESTRICT const x, const T* _RESTRICT const rhs,
+		const nse::nse_const3d::axisType type,
+
+		const int cnx, const int cny, const int cnz,		// coarse grid //
+		const int cgcx, const int cgcy, const int cgcz,
+
+		const int nx, const int ny, const int nz,			// fine grid //
+		const int gcx, const int gcy, const int gcz,
+
+		const int icb, const int ice,
+		const int jcb, const int jce,
+		const int kcb, const int kce,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i
+#ifdef USE_STRICT_MG
+		,
+		const T* _RESTRICT const dz, const T* _RESTRICT const cdzi
+#endif
+	);
+	// ----------------------------------------------------------------- //
+
+
+	// * Prolongation [no extension inside] * //
+	template< typename T >
+	void mg_prolongate_omp(T* _RESTRICT fine,
+		const T* _RESTRICT const coarse,
+		const nse::nse_const3d::axisType type,
+
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const int cnx, const int cny, const int cnz,
+		const int cgcx, const int cgcy, const int cgcz,
+
+		const int icb, const int ice,
+		const int jcb, const int jce,
+		const int kcb, const int kce
+#ifdef USE_STRICT_MG
+		,
+		const T* _RESTRICT const dz, const T* _RESTRICT const cdz,
+		const T* _RESTRICT const cdzpi, const T* _RESTRICT const cdzmi
+#endif
+	);
+
+
+	template< typename T >
+	void mg_prolongate_omp(T* _RESTRICT fine,
+		const T* _RESTRICT const coarse,
+		const nse::nse_const3d::axisType type,
+
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const int cnx, const int cny, const int cnz,
+		const int cgcx, const int cgcy, const int cgcz
+#ifdef USE_STRICT_MG
+		,
+		const T* _RESTRICT const dz, const T* _RESTRICT const cdz,
+		const T* _RESTRICT const cdzpi, const T* _RESTRICT const cdzmi
+#endif
+	);
+	// ----------------------------------------------------------------- //
+
+
+	// Residual Restriction: includes MPI //
+	template< typename T >
+	void laplace_restrict_residual_omp(T* _RESTRICT y_coarse,
+		T* _RESTRICT x_fine, const T* _RESTRICT const rhs_fine,
+		const nse::nse_const3d::axisType type,
+
+		const int cnx, const int cny, const int cnz,
+		const int cgcx, const int cgcy, const int cgcz,
+
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const T dx2i_fine, const T dy2i_fine,
+		const T* _RESTRICT const dzp2i_fine, const T* _RESTRICT const dzm2i_fine,
+
+#ifdef USE_STRICT_MG
+		const T* _RESTRICT const dz_fine, const T* _RESTRICT const dzi_coarse,
+#endif
+		const nse::mpiCom3d& mpi_com,
+		const nse::poisson_dynamic_bc& bc);
+	// -------------------------------------------------------------------- //
+
+
+	// Solution Prolongation: includes MPI //
+	template< typename T >
+	void laplace_prolongate_omp(T* _RESTRICT x_fine,
+		T* _RESTRICT x_coarse,
+		const nse::nse_const3d::axisType type,
+
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const int cnx, const int cny, const int cnz,
+		const int cgcx, const int cgcy, const int cgcz,
+
+#ifdef USE_STRICT_MG
+		const T* _RESTRICT const dz_fine, const T* _RESTRICT const dz_coarse,
+		const T* _RESTRICT const dzpi_coarse, const T* _RESTRICT const dzmi_coarse,
+#endif
+		const nse::mpiCom3d& mpi_com,
+		const nse::poisson_dynamic_bc& bc);
+	// -------------------------------------------------------------------- //
+}
+
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+template< typename T >
+inline void poisson3d::mg_restrict(	// MG Restrict: Fine -> Coarse
+	T* _RESTRICT coarse,
+	const T* _RESTRICT const fine,
+	const nse::nse_const3d::axisType type,
+
+	const int cnx, const int cny, const int cnz,
+	const int cgcx, const int cgcy, const int cgcz,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const int icb, const int ice,
+	const int jcb, const int jce,
+	const int kcb, const int kce
+#ifdef USE_STRICT_MG
+	,
+	const T* _RESTRICT const dz, const T* _RESTRICT const cdzi
+#endif	
+)
+{
+	if (omp_in_parallel()) {
+		mg_restrict_omp(coarse, fine, type,
+			cnx, cny, cnz, cgcx, cgcy, cgcz,
+			nx, ny, nz, gcx, gcy, gcz,
+
+			icb, ice, jcb, jce, kcb, kce
+#ifdef USE_STRICT_MG
+			,
+			dz, cdzi
+#endif
+		);
+	}
+	else
+	{
+#pragma omp parallel shared( coarse )
+		{
+			mg_restrict_omp(coarse, fine, type,
+				cnx, cny, cnz, cgcx, cgcy, cgcz,
+				nx, ny, nz, gcx, gcy, gcz,
+
+				icb, ice, jcb, jce, kcb, kce
+#ifdef USE_STRICT_MG
+				,
+				dz, cdzi
+#endif
+			);
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+template< typename T >
+inline void poisson3d::mg_restrict_residual(	// MG Restrict Residual: Fine -> Coarse
+	T* _RESTRICT coarse,
+	const T* _RESTRICT const x, const T* _RESTRICT const rhs,
+	const nse::nse_const3d::axisType type,
+
+	const int cnx, const int cny, const int cnz,
+	const int cgcx, const int cgcy, const int cgcz,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const int icb, const int ice,
+	const int jcb, const int jce,
+	const int kcb, const int kce,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i
+#ifdef USE_STRICT_MG
+	,
+	const T* _RESTRICT const dz, const T* _RESTRICT const cdzi
+#endif
+)
+{
+	if (omp_in_parallel()) {
+		mg_restrict_residual_omp(coarse, x, rhs, type,
+
+			cnx, cny, cnz, cgcx, cgcy, cgcz,
+			nx, ny, nz, gcx, gcy, gcz,
+
+			icb, ice, jcb, jce, kcb, kce,
+			dx2i, dy2i, dzp2i, dzm2i
+#ifdef USE_STRICT_MG
+			,
+			dz, cdzi
+#endif
+		);
+	}
+	else
+	{
+#pragma omp parallel shared( coarse )
+		{
+			mg_restrict_residual_omp(coarse, x, rhs, type,
+
+				cnx, cny, cnz, cgcx, cgcy, cgcz,
+				nx, ny, nz, gcx, gcy, gcz,
+
+				icb, ice, jcb, jce, kcb, kce,
+				dx2i, dy2i, dzp2i, dzm2i
+#ifdef USE_STRICT_MG
+				,
+				dz, cdzi
+#endif
+			);
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+template< typename T >
+inline void poisson3d::mg_prolongate(	// MG Prolongate: Coarse -> Fine
+	T* _RESTRICT fine,
+	const T* _RESTRICT const coarse,
+	const nse::nse_const3d::axisType type,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const int cnx, const int cny, const int cnz,
+	const int cgcx, const int cgcy, const int cgcz,
+
+	const int icb, const int ice,
+	const int jcb, const int jce,
+	const int kcb, const int kce
+#ifdef USE_STRICT_MG
+	,
+	const T* _RESTRICT const dz, const T* _RESTRICT const cdz,
+	const T* _RESTRICT const cdzpi, const T* _RESTRICT const cdzmi
+#endif
+)
+{
+	if (omp_in_parallel()) {
+		mg_prolongate_omp(fine, coarse, type,
+			nx, ny, nz, gcx, gcy, gcz,
+			cnx, cny, cnz, cgcx, cgcy, cgcz,
+
+			icb, ice, jcb, jce, kcb, kce
+#ifdef USE_STRICT_MG
+			,
+			dz, cdz, cdzpi, cdzmi
+#endif
+		);
+	}
+	else
+	{
+#pragma omp parallel shared( fine )
+		{
+			mg_prolongate_omp(fine, coarse, type,
+				nx, ny, nz, gcx, gcy, gcz,
+				cnx, cny, cnz, cgcx, cgcy, cgcz,
+
+				icb, ice, jcb, jce, kcb, kce
+#ifdef USE_STRICT_MG
+				,
+				dz, cdz, cdzpi, cdzmi
+#endif
+			);
+		}
+	}
+}
+// ------------------------------------------------------------------------ //
+
+template< typename T >
+inline void poisson3d::mg_prolongate(	// MG Prolongate: Coarse -> Fine
+	T* _RESTRICT fine,
+	const T* _RESTRICT const coarse,
+	const nse::nse_const3d::axisType type,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const int cnx, const int cny, const int cnz,
+	const int cgcx, const int cgcy, const int cgcz
+#ifdef USE_STRICT_MG
+	,
+	const T* _RESTRICT const dz, const T* _RESTRICT const cdz,
+	const T* _RESTRICT const cdzpi, const T* _RESTRICT const cdzmi
+#endif
+)
+{
+	mg_prolongate(fine, coarse, type,
+
+		nx, ny, nz, gcx, gcy, gcz,
+		cnx, cny, cnz, cgcx, cgcy, cgcz,
+
+		cgcx, cnx - cgcx - 1,
+		cgcy, cny - cgcy - 1,
+		cgcz, cnz - cgcz - 1
+#ifdef USE_STRICT_MG
+		,
+		dz, cdz,
+		cdzpi, cdzmi
+#endif
+	);
+}
+// ----------------------------------------------------------------- //
+
+template< typename T >
+inline void poisson3d::laplace_restrict_residual(	// Restrict Residual: includes MPI
+	T* _RESTRICT res_coarse,
+	T* _RESTRICT x_fine, const T* _RESTRICT const rhs_fine,
+	const nse::nse_const3d::axisType type,
+
+	const int cnx, const int cny, const int cnz,
+	const int cgcx, const int cgcy, const int cgcz,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const T dx2i_fine, const T dy2i_fine,
+	const T* _RESTRICT const dzp2i_fine, const T* _RESTRICT const dzm2i_fine,
+
+#ifdef USE_STRICT_MG
+	const T* _RESTRICT const dz_fine, const T* _RESTRICT const dzi_coarse,
+#endif
+
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc)
+{
+	if (omp_in_parallel()) {
+		laplace_restrict_residual_omp(res_coarse, x_fine, rhs_fine, type,
+			cnx, cny, cnz, cgcx, cgcy, cgcz,
+			nx, ny, nz, gcx, gcy, gcz,
+
+			dx2i_fine, dy2i_fine, dzp2i_fine, dzm2i_fine,
+
+#ifdef USE_STRICT_MG
+			dz_fine, dzi_coarse,
+#endif
+			mpi_com, bc);
+	}
+	else
+	{
+#pragma omp parallel shared( res_coarse, x_fine ) 
+		{
+			laplace_restrict_residual_omp(res_coarse, x_fine, rhs_fine, type,
+				cnx, cny, cnz, cgcx, cgcy, cgcz,
+				nx, ny, nz, gcx, gcy, gcz,
+
+				dx2i_fine, dy2i_fine, dzp2i_fine, dzm2i_fine,
+
+#ifdef USE_STRICT_MG
+				dz_fine, dzi_coarse,
+#endif
+				mpi_com, bc);
+		}
+	}
+}
+// ----------------------------------------------------------------- //
+
+template< typename T >
+inline void poisson3d::laplace_prolongate(	// Prolongate Solution: includes MPI
+	T* _RESTRICT x_fine,
+	T* _RESTRICT x_coarse,
+	const nse::nse_const3d::axisType type,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const int cnx, const int cny, const int cnz,
+	const int cgcx, const int cgcy, const int cgcz,
+
+#ifdef USE_STRICT_MG
+	const T* _RESTRICT const dz_fine, const T* _RESTRICT const dz_coarse,
+	const T* _RESTRICT const dzpi_coarse, const T* _RESTRICT const dzmi_coarse,
+#endif
+
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc)
+{
+	if (omp_in_parallel()) {
+		laplace_prolongate_omp(x_fine, x_coarse, type,
+			nx, ny, nz, gcx, gcy, gcz,
+			cnx, cny, cnz, cgcx, cgcy, cgcz,
+
+#ifdef USE_STRICT_MG
+			dz_fine, dz_coarse, dzpi_coarse, dzmi_coarse,
+#endif
+
+			mpi_com, bc);
+	}
+	else
+	{
+#pragma omp parallel shared( x_fine, x_coarse ) 
+		{
+			laplace_prolongate_omp(x_fine, x_coarse, type,
+				nx, ny, nz, gcx, gcy, gcz,
+				cnx, cny, cnz, cgcx, cgcy, cgcz,
+
+#ifdef USE_STRICT_MG
+				dz_fine, dz_coarse, dzpi_coarse, dzmi_coarse,
+#endif
+
+				mpi_com, bc);
+		}
+	}
+}
+// -------------------------------------------------------------------- //
diff --git a/pois-mg3d.cpp b/pois-mg3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bbe74db7cae8246eef5fae7602188cef2edfa7af
--- /dev/null
+++ b/pois-mg3d.cpp
@@ -0,0 +1,471 @@
+#include "pois-mg3d.h"
+#include "grid-common3d.h"
+#include "pois-bc3d.h"
+#include "pois-mg-base3d.h"
+#include "pois-gs3d.h"
+#include "pois-sor3d.h"
+
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+// * MG[GS-SOR-RB] preconditioner * //
+template< typename T >
+void poisson3d::mg_sor_redblack_omp(
+	T* x, T* rhs,
+	const int piters,
+	nse::mg_poisson3d_data< T >& mg,
+	const nse::mpiCom3d& mpi_com)
+{
+#ifdef MEASURE_MG_RUN_TIME
+	double mark_time;
+#endif
+
+	const int fine = 0;
+	int i, k;
+
+#pragma omp master
+	{
+		mg.x[fine] = x;
+		mg.rhs[fine] = rhs;
+	}
+
+#pragma omp barrier
+
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+	mark_time = nse::timer_init();
+#endif
+
+	gs_redblack_omp(mg.x[fine], mg.rhs[fine], mg.idg[fine],
+		c_gs_init, mg.sm_color_shift[fine], mg.sm_down_iters[fine],
+		mg.nx[fine], mg.ny[fine], mg.nz[fine],
+		mg.gcx[fine], mg.gcy[fine], mg.gcz[fine],
+		mg.dx2i[fine], mg.dy2i[fine],
+		mg.dzp2i[fine], mg.dzm2i[fine],
+		mpi_com, mg.bc);
+
+#pragma omp barrier
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+	nse::timer_update(mark_time, &mg.run_time[fine], &mg.smooth_time[fine]);
+#endif
+
+	for (k = 0; k < piters; k++)
+	{
+		for (i = fine + 1; i < mg.num_grids; i++) {
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			mark_time = nse::timer_init();
+#endif
+
+			// MPI: cross exchange inside //
+			// MPI: no corner-cell exchanges, which could be needed for odd numbers of grid cells!
+			laplace_restrict_residual_omp(mg.rhs[i], mg.x[i - 1], mg.rhs[i - 1],
+				mg.coarse_type[i],
+
+				mg.nx[i], mg.ny[i], mg.nz[i],
+				mg.gcx[i], mg.gcy[i], mg.gcz[i],
+				mg.nx[i - 1], mg.ny[i - 1], mg.nz[i - 1],
+				mg.gcx[i - 1], mg.gcy[i - 1], mg.gcz[i - 1],
+
+				mg.dx2i[i - 1], mg.dy2i[i - 1],
+				mg.dzp2i[i - 1], mg.dzm2i[i - 1],
+
+#ifdef USE_STRICT_MG
+				mg.dz[i - 1], mg.dzi[i],
+#endif
+				mpi_com, mg.bc);
+
+#pragma omp barrier
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			nse::timer_update(mark_time,
+				&mg.run_time[i - 1], &mg.restrict_time[i - 1]);
+#endif
+
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			mark_time = nse::timer_init();
+#endif
+
+			gs_redblack_omp(mg.x[i], mg.rhs[i], mg.idg[i],
+				c_gs_init, mg.sm_color_shift[i], mg.sm_down_iters[i],
+				mg.nx[i], mg.ny[i], mg.nz[i],
+				mg.gcx[i], mg.gcy[i], mg.gcz[i],
+				mg.dx2i[i], mg.dy2i[i],
+				mg.dzp2i[i], mg.dzm2i[i],
+				mpi_com, mg.bc);
+
+#pragma omp barrier
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			nse::timer_update(mark_time,
+				&mg.run_time[i], &mg.smooth_time[i]);
+#endif
+		}
+
+		for (i = mg.num_grids - 2; i >= fine; i--) {
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			mark_time = nse::timer_init();
+#endif
+
+			// prolongate extension is not included for odd cells //
+			laplace_prolongate_omp(mg.x[i], mg.x[i + 1],
+				mg.coarse_type[i + 1],
+
+				mg.nx[i], mg.ny[i], mg.nz[i],
+				mg.gcx[i], mg.gcy[i], mg.gcz[i],
+				mg.nx[i + 1], mg.ny[i + 1], mg.nz[i + 1],
+				mg.gcx[i + 1], mg.gcy[i + 1], mg.gcz[i + 1],
+
+#ifdef USE_STRICT_MG
+				mg.dz[i], mg.dz[i + 1],
+				mg.dzpi[i + 1], mg.dzmi[i + 1],
+#endif
+				mpi_com, mg.bc);
+
+#pragma omp barrier
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			nse::timer_update(mark_time,
+				&mg.run_time[i + 1], &mg.prolongate_time[i + 1]);
+#endif
+
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			mark_time = nse::timer_init();
+#endif
+
+			sor_redblack_omp(mg.x[i], mg.rhs[i], mg.idg[i], mg.sm_up_omega[i],
+				c_sor_continue, mg.sm_color_shift[i], mg.sm_up_iters[i],
+				mg.nx[i], mg.ny[i], mg.nz[i],
+				mg.gcx[i], mg.gcy[i], mg.gcz[i],
+				mg.dx2i[i], mg.dy2i[i],
+				mg.dzp2i[i], mg.dzm2i[i],
+				mpi_com, mg.bc);
+
+#pragma omp barrier
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			nse::timer_update(mark_time,
+				&mg.run_time[i], &mg.smooth_time[i]);
+#endif
+		}
+	}
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+	mark_time = nse::timer_init();
+#endif
+
+	nse::null_ghost_halo_omp(mg.rhs[fine],
+		mg.nx[fine], mg.ny[fine], mg.nz[fine],
+		mg.gcx[fine], mg.gcy[fine], mg.gcz[fine]);
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+	nse::timer_update(mark_time, &mg.run_time[fine]);
+#endif
+}
+// ------------------------------------------------------------------------ //
+
+
+// * MG-MPI[GS-SOR-RB] preconditioner * //
+template< typename T >
+void poisson3d::mg_sor_redblack_omp(
+	T* x, T* rhs, const int piters,
+	nse::mg_mpi_poisson3d_data< T >& mg)
+{
+#ifdef MEASURE_MG_RUN_TIME
+	double mark_time;
+#endif
+
+	const int fine = 0;
+	int i, k;
+
+#pragma omp master
+	{
+		mg.x[fine] = x;
+		mg.rhs[fine] = rhs;
+	}
+
+#pragma omp barrier
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+	mark_time = nse::timer_init();
+#endif
+
+	// do fine grid on all mpi processors //
+	gs_redblack_omp(mg.x[fine], mg.rhs[fine], mg.idg[fine],
+		c_gs_init, mg.sm_color_shift[fine], mg.sm_down_iters[fine],
+		mg.mpi_nx[fine], mg.mpi_ny[fine], mg.mpi_nz[fine],
+		mg.gcx[fine], mg.gcy[fine], mg.gcz[fine],
+		mg.dx2i[fine], mg.dy2i[fine],
+		mg.dzp2i[fine], mg.dzm2i[fine],
+		mg.mpi_com[fine], mg.bc[fine]);
+
+#pragma omp barrier
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+	nse::timer_update(mark_time,
+		&mg.run_time[fine], &mg.smooth_time[fine]);
+#endif
+
+	for (k = 0; k < piters; k++)
+	{
+		for (i = fine + 1; i < mg.num_grids; i++) {
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			mark_time = nse::timer_init();
+#endif
+			// MPI RUN( on finer grid )
+			if (mg.mpi_run[i - 1]) {
+
+				// MPI: cross exchange inside //
+				// MPI: no corner-cell exchanges, which could be needed for odd numbers of grid cells!
+				laplace_restrict_residual_omp(mg.rhs[i], mg.x[i - 1], mg.rhs[i - 1],
+					mg.coarse_type[i],
+
+					mg.local_nx[i], mg.local_ny[i], mg.local_nz[i],
+					mg.gcx[i], mg.gcy[i], mg.gcz[i],
+					mg.mpi_nx[i - 1], mg.mpi_ny[i - 1], mg.mpi_nz[i - 1],
+					mg.gcx[i - 1], mg.gcy[i - 1], mg.gcz[i - 1],
+
+					mg.dx2i[i - 1], mg.dy2i[i - 1],
+					mg.dzp2i[i - 1], mg.dzm2i[i - 1],
+
+#ifdef USE_STRICT_MG
+					mg.dz[i - 1], mg.dzi[i],
+#endif
+					mg.mpi_com[i - 1], mg.bc[i - 1]);
+
+#pragma omp barrier
+			}
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			nse::timer_update(mark_time,
+				&mg.run_time[i - 1], &mg.restrict_time[i - 1]);
+#endif
+
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			mark_time = nse::timer_init();
+#endif
+
+			// MPI COMBINE( on coarser grid ) - GATHER
+			if (mg.mpi_combine[i]) {
+
+				// use communicator on finer grid: mg.mpi_com[ i - 1 ] //
+				mg.mpi_com[i - 1].gather_subgrid(mg.rhs[i], mg.rhs[i],
+					mg.mpi_nx[i], mg.mpi_ny[i], mg.mpi_nz[i],
+					mg.local_nx[i], mg.local_ny[i], mg.local_nz[i],
+					mg.gcx[i], mg.gcy[i], mg.gcz[i]);
+
+#pragma omp barrier
+			}
+
+			// MPI RUN( on coarser grid )
+			if (mg.mpi_run[i]) {
+				gs_redblack_omp(mg.x[i], mg.rhs[i], mg.idg[i],
+					c_gs_init, mg.sm_color_shift[i], mg.sm_down_iters[i],
+					mg.mpi_nx[i], mg.mpi_ny[i], mg.mpi_nz[i],
+					mg.gcx[i], mg.gcy[i], mg.gcz[i],
+					mg.dx2i[i], mg.dy2i[i],
+					mg.dzp2i[i], mg.dzm2i[i],
+					mg.mpi_com[i], mg.bc[i]);
+
+#pragma omp barrier
+			}
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			nse::timer_update(mark_time,
+				&mg.run_time[i], &mg.smooth_time[i]);
+#endif
+		}
+
+
+		for (i = mg.num_grids - 2; i >= fine; i--) {
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			mark_time = nse::timer_init();
+#endif
+
+			// MPI COMBINE( on coarser grid ) - SCATTER
+			if (mg.mpi_combine[i + 1]) {
+				// MPI COMBINE leads to mpi_run[i] != mpi_run[i+1]
+
+				// MPI RUN( on coarser grid )
+				if (mg.mpi_run[i + 1]) {
+
+					// prolongation requires mpi-exchanges with corners //
+					mg.mpi_com[i + 1].exchange_halo(mg.x[i + 1],
+						mg.mpi_nx[i + 1], mg.mpi_ny[i + 1], mg.mpi_nz[i + 1],
+						mg.gcx[i + 1], mg.gcy[i + 1], mg.gcz[i + 1],
+						1, 1, 1,
+						mg.bc[i + 1].x_periodic, mg.bc[i + 1].y_periodic, mg.bc[i + 1].z_periodic);
+
+					put_bc_omp(mg.x[i + 1],
+						mg.mpi_nx[i + 1], mg.mpi_ny[i + 1], mg.mpi_nz[i + 1],
+						mg.gcx[i + 1], mg.gcy[i + 1], mg.gcz[i + 1],
+						mg.mpi_com[i + 1].rank_x, mg.mpi_com[i + 1].rank_y, mg.mpi_com[i + 1].rank_z,
+						mg.mpi_com[i + 1].size_x, mg.mpi_com[i + 1].size_y, mg.mpi_com[i + 1].size_z,
+						mg.bc[i + 1].type,
+						mg.bc[i + 1].x_periodic, mg.bc[i + 1].y_periodic, mg.bc[i + 1].z_periodic);
+					// -------------------------------------------------- //
+
+#pragma omp barrier
+				}
+
+				// use communicator on finer grid: mg.mpi_com[ i ] //
+				mg.mpi_com[i].scatter_subgrid(mg.x[i + 1], mg.x[i + 1],
+					mg.mpi_nx[i + 1], mg.mpi_ny[i + 1], mg.mpi_nz[i + 1],
+					mg.local_nx[i + 1], mg.local_ny[i + 1], mg.local_nz[i + 1],
+					mg.gcx[i + 1], mg.gcy[i + 1], mg.gcz[i + 1]);
+
+
+#pragma omp barrier
+
+
+				// MPI RUN( on finer grid )
+				if (mg.mpi_run[i]) {
+
+					mg_prolongate_omp(mg.x[i], mg.x[i + 1],
+						mg.coarse_type[i + 1],
+
+						mg.mpi_nx[i], mg.mpi_ny[i], mg.mpi_nz[i],
+						mg.gcx[i], mg.gcy[i], mg.gcz[i],
+						mg.local_nx[i + 1], mg.local_ny[i + 1], mg.local_nz[i + 1],
+						mg.gcx[i + 1], mg.gcy[i + 1], mg.gcz[i + 1]
+
+#ifdef USE_STRICT_MG
+						,
+						mg.dz[i], mg.dz[i + 1],
+						mg.dzpi[i + 1], mg.dzmi[i + 1]
+#endif
+					);
+
+#pragma omp barrier
+				}
+			}
+			else
+			{
+				// !MPI COMBINE leads to:
+				//		mpi_run[i] == mpi_run[i+1]
+				//		mpi_n[x,y,z][i+1] == local_n[x,y,z][i+1]
+
+				// MPI RUN( on both finer and coarser grids )
+				if ((mg.mpi_run[i]) && (mg.mpi_run[i + 1])) {
+
+					laplace_prolongate_omp(mg.x[i], mg.x[i + 1],
+						mg.coarse_type[i + 1],
+
+						mg.mpi_nx[i], mg.mpi_ny[i], mg.mpi_nz[i],
+						mg.gcx[i], mg.gcy[i], mg.gcz[i],
+						mg.local_nx[i + 1], mg.local_ny[i + 1], mg.local_nz[i + 1],
+						mg.gcx[i + 1], mg.gcy[i + 1], mg.gcz[i + 1],
+
+#ifdef USE_STRICT_MG
+						mg.dz[i], mg.dz[i + 1],
+						mg.dzpi[i + 1], mg.dzmi[i + 1],
+#endif
+						mg.mpi_com[i + 1],
+						mg.bc[i + 1]);
+
+#pragma omp barrier
+				}
+			}
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			nse::timer_update(mark_time,
+				&mg.run_time[i + 1], &mg.prolongate_time[i + 1]);
+#endif
+
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			mark_time = nse::timer_init();
+#endif
+
+
+			// MPI RUN( on finer grid )
+			if (mg.mpi_run[i]) {
+
+				sor_redblack_omp(mg.x[i], mg.rhs[i], mg.idg[i], mg.sm_up_omega[i],
+					c_sor_continue, mg.sm_color_shift[i], mg.sm_up_iters[i],
+					mg.mpi_nx[i], mg.mpi_ny[i], mg.mpi_nz[i],
+					mg.gcx[i], mg.gcy[i], mg.gcz[i],
+					mg.dx2i[i], mg.dy2i[i],
+					mg.dzp2i[i], mg.dzm2i[i],
+					mg.mpi_com[i], mg.bc[i]);
+
+#pragma omp barrier
+			}
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+			nse::timer_update(mark_time,
+				&mg.run_time[i], &mg.smooth_time[i]);
+#endif
+		}
+
+	}
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+	mark_time = nse::timer_init();
+#endif
+
+	nse::null_ghost_halo_omp(mg.rhs[fine],
+		mg.mpi_nx[fine], mg.mpi_ny[fine], mg.mpi_nz[fine],
+		mg.gcx[fine], mg.gcy[fine], mg.gcz[fine]);
+
+#ifdef MEASURE_MG_RUN_TIME
+#pragma omp master
+	nse::timer_update(mark_time, &mg.run_time[fine]);
+#endif
+}
+// ------------------------------------------------------------------------ //
+
+
+// Initialization //
+// ------------------------------------------------------------------------ //
+
+// * initialize: MG-preconditioner * //
+template void poisson3d::mg_sor_redblack_omp(
+	float* x, float* rhs, const int piters,
+	nse::mg_poisson3d_data< float >& mg, const nse::mpiCom3d& mpi_com);
+
+template void poisson3d::mg_sor_redblack_omp(
+	double* x, double* rhs, const int piters,
+	nse::mg_poisson3d_data< double >& mg, const nse::mpiCom3d& mpi_com);
+// ------------------------------------------------------------------------ //
+
+// * initialize: MPI-MG-preconditioner * //
+template void poisson3d::mg_sor_redblack_omp(
+	float* x, float* rhs, const int piters,
+	nse::mg_mpi_poisson3d_data< float >& mg);
+
+template void poisson3d::mg_sor_redblack_omp(
+	double* x, double* rhs, const int piters,
+	nse::mg_mpi_poisson3d_data< double >& mg);
+// ------------------------------------------------------------------------ //
diff --git a/pois-mg3d.h b/pois-mg3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..23caa57a0883113186199a425a8a5e16be1399e3
--- /dev/null
+++ b/pois-mg3d.h
@@ -0,0 +1,83 @@
+#pragma once
+
+// [pois-mg3d.h]: 3D Poisson Multigrid
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "mpi-com3d.h"
+#include "pois-def3d.h"
+#include "mg-data3d.h"
+
+
+namespace poisson3d
+{
+	// * MG[GS-SOR-RB] preconditioner * // 
+	template< typename T >
+	void mg_sor_redblack(T* x, T* rhs, const int piters,
+		nse::mg_poisson3d_data< T >& mg_data,
+		const nse::mpiCom3d& mpi_com);
+
+	template< typename T >
+	void mg_sor_redblack(T* x, T* rhs, const int piters,
+		nse::mg_mpi_poisson3d_data< T >& mg_data);
+	// -------------------------------------------------------------------- //
+}
+
+// OpenMP //
+namespace poisson3d
+{
+	// * MG[GS-SOR-RB] preconditioner * // 
+	template< typename T >
+	void mg_sor_redblack_omp(T* x, T* rhs,
+		const int piters,
+		nse::mg_poisson3d_data< T >& mg_data,
+		const nse::mpiCom3d& mpi_com);
+
+	template< typename T >
+	void mg_sor_redblack_omp(T* x, T* rhs,
+		const int piters,
+		nse::mg_mpi_poisson3d_data< T >& mg_data);
+	// -------------------------------------------------------------------- //
+}
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+template< typename T >
+inline void poisson3d::mg_sor_redblack(
+	T* x, T* rhs,
+	const int piters,
+	nse::mg_poisson3d_data< T >& mg_data,
+	const nse::mpiCom3d& mpi_com)
+{
+	if (omp_in_parallel()) {
+		mg_sor_redblack_omp(x, rhs, piters, mg_data, mpi_com);
+	}
+	else
+	{
+#pragma omp parallel shared( x, rhs )
+		{
+			mg_sor_redblack_omp(x, rhs, piters, mg_data, mpi_com);
+		}
+	}
+}
+
+template< typename T >
+inline void poisson3d::mg_sor_redblack(
+	T* x, T* rhs,
+	const int piters,
+	nse::mg_mpi_poisson3d_data< T >& mg_data)
+{
+	if (omp_in_parallel()) {
+		mg_sor_redblack_omp(x, rhs, piters, mg_data);
+	}
+	else
+	{
+#pragma omp parallel shared( x, rhs )
+		{
+			mg_sor_redblack_omp(x, rhs, piters, mg_data);
+		}
+	}
+}
+// -------------------------------------------------------------------- //
diff --git a/pois-setup.h b/pois-setup.h
new file mode 100644
index 0000000000000000000000000000000000000000..e03dfe9bfc13479e1675fee75d298f604300663c
--- /dev/null
+++ b/pois-setup.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// [pois-setup.h]: Poisson solver definitions
+//
+// -------------------------------------------------------------------------------------------- //
+
+namespace nse
+{
+	// * poisson solver setup * //
+	enum poissonInitMode {
+		isInitNull = 0,		// initial x = 0
+		isInitPreset = 1,	// initial x boundary & ghost cells are prescribed at input
+		isInitFree = 2		// no assumptions
+	};
+
+	enum poissonNormMode {	// poisson error norm
+		isNormC = 0,		// C-max norm for error checking
+		isNormL2 = 1		// L2 norm
+	};
+}
+// -------------------------------------------------------------------- //
diff --git a/pois-setup3d.h b/pois-setup3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..16d2ff493f9e873d448c6f77d6032bd4f61eed46
--- /dev/null
+++ b/pois-setup3d.h
@@ -0,0 +1,95 @@
+#pragma once
+
+// [pois-setup3d.h]: 3D Poisson solver definitions & boundary conditions constants
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "pois-setup.h"
+
+
+namespace nse
+{
+	// * boundary condition constants * //
+	// 0-5: one-sided "Ext" variants, named by the extended side;
+	// 6-11: periodicity in two axes (XY, XZ, YZ) or a single axis (X, Y, Z);
+	// 12: neumann on all sides; 13: periodic in y combined with east "Ext".
+	// NOTE(review): the exact action of the *Ext values is implemented by the
+	// bc application routines (pois-bc3d) - confirm semantics there.
+	enum poissonTypeBC {
+		westExt = 0, eastExt = 1,
+		southExt = 2, northExt = 3,
+		bottomExt = 4, topExt = 5,
+
+		periodicXY = 6, periodicXZ = 7, periodicYZ = 8,
+		periodicX = 9, periodicY = 10, periodicZ = 11,
+
+		neumann = 12,
+		periodicY_eastExt = 13
+	};
+
+	// Solver options bundle for the 3D Poisson solver: convergence
+	// tolerances, iteration bounds, init/norm modes and bc type.
+	template< typename T >
+	struct poisOpt3d {
+
+		T retol, abstol;			// relative & absolute tolerances
+		int miniters, maxiters;		// min & max iteration counts
+
+		poissonInitMode init_mode;	// initial guess handling
+		poissonNormMode norm_mode;	// norm used for the error check
+		poissonTypeBC bc_type;		// boundary-condition setup
+
+		int piters;		// preconditioner iterations, optional ?
+
+		// default-construct with deterministic values (zeros and the
+		// zero-valued enumerators) instead of leaving members
+		// uninitialized, so accidental reads are reproducible
+		poisOpt3d() :
+			retol((T)0), abstol((T)0),
+			miniters(0), maxiters(0),
+			init_mode(isInitNull), norm_mode(isNormC),
+			bc_type(westExt),
+			piters(0)
+		{
+		}
+		poisOpt3d(
+			const T _retol, const T _abstol,
+			const int _miniters, const int _maxiters,
+			const poissonInitMode _init_mode,
+			const poissonNormMode _norm_mode,
+			const poissonTypeBC _bc_type,
+			const int _piters) :
+
+			retol(_retol), abstol(_abstol),
+			miniters(_miniters), maxiters(_maxiters),
+			init_mode(_init_mode), norm_mode(_norm_mode),
+			bc_type(_bc_type),
+			piters(_piters)
+		{
+		}
+		~poisOpt3d() {}
+	};
+
+
+	// Runtime boundary-condition descriptor: periodicity flags and
+	// per-side halo-presence flags derived for one MPI rank.
+	struct poisson_dynamic_bc {
+		poissonTypeBC type;
+
+		// 0/1 flags: periodicity along each axis //
+		int x_periodic, y_periodic, z_periodic;
+
+		// 0/1 flags: rank has a neighbor (or periodic wrap) on that side //
+		int p_west, p_east;
+		int p_south, p_north;
+		int p_bottom, p_top;
+
+		// derive all flags from the bc type and this rank's position
+		// (rank_*) within the (size_x, size_y, size_z) process grid
+		void init(const poissonTypeBC _type,
+			const int rank_x, const int rank_y, const int rank_z,
+			const int size_x, const int size_y, const int size_z)
+		{
+			type = _type;
+
+			x_periodic =
+				((type == periodicX) ||
+				(type == periodicXY) ||
+					(type == periodicXZ));
+			y_periodic =
+				((type == periodicY) ||
+				(type == periodicXY) ||
+					(type == periodicYZ) ||
+					(type == periodicY_eastExt));	// y-periodic also for the mixed y/east type //
+			z_periodic =
+				((type == periodicZ) ||
+				(type == periodicXZ) ||
+					(type == periodicYZ));
+
+			// a side has a halo if an interior neighbor exists or the wrap is periodic //
+			p_west = ((rank_x > 0) || x_periodic) ? 1 : 0;
+			p_east = ((rank_x < size_x - 1) || x_periodic) ? 1 : 0;
+			p_south = ((rank_y > 0) || y_periodic) ? 1 : 0;
+			p_north = ((rank_y < size_y - 1) || y_periodic) ? 1 : 0;
+			p_bottom = ((rank_z > 0) || z_periodic) ? 1 : 0;
+			p_top = ((rank_z < size_z - 1) || z_periodic) ? 1 : 0;
+		}
+	};
+}
+// -------------------------------------------------------------------- //
diff --git a/pois-sor-base3d.h b/pois-sor-base3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4076c8d81275b125899cd086ff3aeac18548117
--- /dev/null
+++ b/pois-sor-base3d.h
@@ -0,0 +1,783 @@
+#pragma once
+
+// [pois-sor-base3d.h]: 3D Poisson SOR base components
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+
+
+namespace poisson3d
+{
+	// * SOR init * //
+	template< typename T >
+	void sor_init(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const T omega,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke);
+
+	// * SOR cycle init * //
+	template< typename T >
+	void sor_cycle_init(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const T omega,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
+
+	// * SOR cycle * //
+	template< typename T >
+	void sor_cycle(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const T omega,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
+	// -------------------------------------------------------------------- //
+
+
+	// * SOR init [Halo] * //
+	template< typename T >
+	void sor_init_halo(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const T omega,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const int p_west, const int p_east,
+		const int p_south, const int p_north,
+		const int p_bottom, const int p_top);
+
+	// * SOR cycle init [Halo] * //
+	template< typename T >
+	void sor_cycle_init_halo(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const T omega,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const int p_west, const int p_east,
+		const int p_south, const int p_north,
+		const int p_bottom, const int p_top,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
+
+	// * SOR color cycle [Halo] * //
+	template< typename T >
+	void sor_cycle_halo(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const T omega,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const int p_west, const int p_east,
+		const int p_south, const int p_north,
+		const int p_bottom, const int p_top,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
+	// -------------------------------------------------------------------- //
+}
+
+// OpenMP //
+namespace poisson3d
+{
+	// * SOR init * //
+	template< typename T >
+	void sor_init_omp(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const T omega,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke);
+
+	// * SOR cycle init * //
+	template< typename T >
+	void sor_cycle_init_omp(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const T omega,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
+
+	// * SOR cycle * //
+	template< typename T >
+	void sor_cycle_omp(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const T omega,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
+	// -------------------------------------------------------------------- //
+
+
+	// * SOR init [Halo] * //
+	template< typename T >
+	void sor_init_halo_omp(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const T omega,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const int p_west, const int p_east,
+		const int p_south, const int p_north,
+		const int p_bottom, const int p_top);
+
+	// * SOR cycle init [Halo] * //
+	template< typename T >
+	void sor_cycle_init_halo_omp(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const T omega,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const int p_west, const int p_east,
+		const int p_south, const int p_north,
+		const int p_bottom, const int p_top,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
+
+	// * SOR color cycle [Halo] * //
+	template< typename T >
+	void sor_cycle_halo_omp(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+		const T omega,
+		const int color,
+		const int nx, const int ny, const int nz,
+		const int ib, const int ie,
+		const int jb, const int je,
+		const int kb, const int ke,
+
+		const int p_west, const int p_east,
+		const int p_south, const int p_north,
+		const int p_bottom, const int p_top,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i);
+	// -------------------------------------------------------------------- //
+}
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+// sor_init: public entry point; guarantees an active OpenMP team before
+// delegating to the worksharing worker sor_init_omp.
+template< typename T >
+inline void poisson3d::sor_init(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const T omega,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke)
+{
+	if (!omp_in_parallel())
+	{
+		// open a parallel region for the worksharing loops //
+#pragma omp parallel shared( x )
+		{
+			sor_init_omp(x, rhs, idg, omega, color,
+				nx, ny, nz, ib, ie, jb, je, kb, ke);
+		}
+		return;
+	}
+
+	// already parallel: every thread takes its worksharing slice //
+	sor_init_omp(x, rhs, idg, omega, color,
+		nx, ny, nz, ib, ie, jb, je, kb, ke);
+}
+// -------------------------------------------------------------------- //
+
+// sor_cycle_init: entry point for the first full SOR update of a color
+// (neighbor terms included); opens an OpenMP parallel region if needed.
+template< typename T >
+inline void poisson3d::sor_cycle_init(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const T omega,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	if (omp_in_parallel()) {
+		sor_cycle_init_omp(x, rhs, idg, omega, color,
+			nx, ny, nz, ib, ie, jb, je, kb, ke,
+			dx2i, dy2i, dzp2i, dzm2i);
+	}
+	else
+	{
+#pragma omp parallel shared( x ) 
+		{
+			sor_cycle_init_omp(x, rhs, idg, omega, color,
+				nx, ny, nz, ib, ie, jb, je, kb, ke,
+				dx2i, dy2i, dzp2i, dzm2i);
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// sor_cycle: public entry point for one SOR color sweep; guarantees an
+// active OpenMP team before delegating to sor_cycle_omp.
+template< typename T >
+inline void poisson3d::sor_cycle(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const T omega,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	if (!omp_in_parallel())
+	{
+		// serial caller: create the team here //
+#pragma omp parallel shared( x )
+		{
+			sor_cycle_omp(x, rhs, idg, omega, color,
+				nx, ny, nz, ib, ie, jb, je, kb, ke,
+				dx2i, dy2i, dzp2i, dzm2i);
+		}
+		return;
+	}
+
+	// already parallel: call the worksharing worker directly //
+	sor_cycle_omp(x, rhs, idg, omega, color,
+		nx, ny, nz, ib, ie, jb, je, kb, ke,
+		dx2i, dy2i, dzp2i, dzm2i);
+}
+// -------------------------------------------------------------------- //
+
+// sor_init_halo: entry point for the SOR init sweep over the halo strips
+// around the core block; opens an OpenMP parallel region if needed.
+template< typename T >
+inline void poisson3d::sor_init_halo(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const T omega,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const int p_west, const int p_east,
+	const int p_south, const int p_north,
+	const int p_bottom, const int p_top)
+{
+	if (omp_in_parallel()) {
+		sor_init_halo_omp(x, rhs, idg, omega, color,
+			nx, ny, nz, ib, ie, jb, je, kb, ke,
+			p_west, p_east, p_south, p_north, p_bottom, p_top);
+	}
+	else
+	{
+#pragma omp parallel shared( x ) 
+		{
+			sor_init_halo_omp(x, rhs, idg, omega, color,
+				nx, ny, nz, ib, ie, jb, je, kb, ke,
+				p_west, p_east, p_south, p_north, p_bottom, p_top);
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// sor_cycle_init_halo: entry point for the first full SOR update over the
+// halo strips; opens an OpenMP parallel region if needed.
+template< typename T >
+inline void poisson3d::sor_cycle_init_halo(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const T omega,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const int p_west, const int p_east,
+	const int p_south, const int p_north,
+	const int p_bottom, const int p_top,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	if (omp_in_parallel()) {
+		sor_cycle_init_halo_omp(x, rhs, idg, omega, color,
+			nx, ny, nz, ib, ie, jb, je, kb, ke,
+			p_west, p_east, p_south, p_north, p_bottom, p_top,
+			dx2i, dy2i, dzp2i, dzm2i);
+	}
+	else
+	{
+#pragma omp parallel shared( x ) 
+		{
+			sor_cycle_init_halo_omp(x, rhs, idg, omega, color,
+				nx, ny, nz, ib, ie, jb, je, kb, ke,
+				p_west, p_east, p_south, p_north, p_bottom, p_top,
+				dx2i, dy2i, dzp2i, dzm2i);
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// sor_cycle_halo: entry point for one SOR color sweep over the halo
+// strips; opens an OpenMP parallel region if needed.
+template< typename T >
+inline void poisson3d::sor_cycle_halo(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const T omega,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const int p_west, const int p_east,
+	const int p_south, const int p_north,
+	const int p_bottom, const int p_top,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	if (omp_in_parallel()) {
+		sor_cycle_halo_omp(x, rhs, idg, omega, color,
+			nx, ny, nz, ib, ie, jb, je, kb, ke,
+			p_west, p_east, p_south, p_north, p_bottom, p_top,
+			dx2i, dy2i, dzp2i, dzm2i);
+	}
+	else
+	{
+#pragma omp parallel shared( x ) 
+		{
+			sor_cycle_halo_omp(x, rhs, idg, omega, color,
+				nx, ny, nz, ib, ie, jb, je, kb, ke,
+				p_west, p_east, p_south, p_north, p_bottom, p_top,
+				dx2i, dy2i, dzp2i, dzm2i);
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// * OpenMP versions * // 
+// -------------------------------------------------------------------- //
+// sor_init_omp: first SOR half-sweep specialized for a zero initial x:
+// neighbor contributions vanish, so x[idx] = omega * idg[k] * rhs[idx]
+// on cells of the requested color only (idg presumably holds the inverse
+// diagonal per k-level - confirm against the setup code).
+// Must be called from inside an OpenMP parallel region; uses worksharing
+// with nowait (no barrier on exit).
+// Fix: the closing function brace was mis-indented (two tabs), which made
+// it read as an extra inner-scope brace; reindented to column 0 to match
+// the sibling functions. No code change.
+template< typename T >
+inline void poisson3d::sor_init_omp(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const T omega,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke)
+{
+	const int nyz = ny * nz;
+	int i, j, k, idx, shc;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+	for (i = ib; i <= ie; i++)
+	{
+		for (j = jb; j <= je; j++)
+		{
+			// parity of the first colored cell in this (i,j) column //
+			shc = ((i + j + kb + color) & 1);
+			idx = i * nyz + j * nz + kb + shc;
+#else
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++)
+	{
+		shidx = i * nyz + jb * nz + kb;
+		shc = ((i + jb + kb + color) & 1);
+		for (j = jb; j <= je; j++, shidx += nz)
+		{
+			idx = shidx + shc;
+#endif
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+			// stride-2 over k: touch cells of the requested color only //
+			for (k = kb + shc; k <= ke; k += 2, idx += 2) {
+				x[idx] = omega * idg[k] * rhs[idx];
+			}
+#ifndef USE_OPENMP_2D_CYCLE
+			shc = !shc;	// color parity alternates with j //
+#endif
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// sor_cycle_init_omp: full SOR update of one color including the 7-point
+// neighbor stencil: x[idx] = omega * idg[k] * (rhs - sum of neighbor terms),
+// with x/y coefficients dx2i/dy2i and k-dependent z coefficients
+// dzp2i/dzm2i (non-uniform z grid). Worksharing with nowait; must be
+// called from inside an OpenMP parallel region.
+template< typename T >
+inline void poisson3d::sor_cycle_init_omp(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const T omega,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	const int nyz = ny * nz;
+	int i, j, k, idx, shc;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+	for (i = ib; i <= ie; i++)
+	{
+		for (j = jb; j <= je; j++)
+		{
+			// parity of the first colored cell in this (i,j) column //
+			shc = ((i + j + kb + color) & 1);
+			idx = i * nyz + j * nz + kb + shc;
+#else
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++)
+	{
+		shidx = i * nyz + jb * nz + kb;
+		shc = ((i + jb + kb + color) & 1);
+		for (j = jb; j <= je; j++, shidx += nz)
+		{
+			idx = shidx + shc;
+#endif
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+			for (k = kb + shc; k <= ke; k += 2, idx += 2) {
+				x[idx] = omega * idg[k] * (rhs[idx] - (
+					(x[idx + nyz] + x[idx - nyz]) * dx2i +
+					(x[idx + nz] + x[idx - nz]) * dy2i +
+					(x[idx + 1] * dzp2i[k] + x[idx - 1] * dzm2i[k])));
+			}
+#ifndef USE_OPENMP_2D_CYCLE
+			shc = !shc;	// color parity alternates with j //
+#endif
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// sor_cycle_omp: one relaxed SOR sweep over cells of one color.
+// The update "x += -omega*x + omega*idg[k]*(rhs - neighbors)" is
+// algebraically x = (1 - omega)*x + omega*idg[k]*(rhs - neighbors),
+// i.e. standard over-relaxation. Worksharing with nowait; must be called
+// from inside an OpenMP parallel region.
+template< typename T >
+inline void poisson3d::sor_cycle_omp(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const T omega,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	const int nyz = ny * nz;
+	int i, j, k, idx, shc;
+#ifndef USE_OPENMP_2D_CYCLE
+	int shidx;
+#endif
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 ) nowait
+	for (i = ib; i <= ie; i++)
+	{
+		for (j = jb; j <= je; j++)
+		{
+			// parity of the first colored cell in this (i,j) column //
+			shc = ((i + j + kb + color) & 1);
+			idx = i * nyz + j * nz + kb + shc;
+#else
+#pragma omp for nowait
+	for (i = ib; i <= ie; i++)
+	{
+		shidx = i * nyz + jb * nz + kb;
+		shc = ((i + jb + kb + color) & 1);
+		for (j = jb; j <= je; j++, shidx += nz)
+		{
+			idx = shidx + shc;
+#endif
+
+#ifdef USE_OPENMP_SIMD
+#pragma omp simd
+#endif
+#ifdef USE_INTEL_SIMD
+#pragma simd
+#endif
+			for (k = kb + shc; k <= ke; k += 2, idx += 2) {
+
+				x[idx] += -omega * x[idx] +
+					omega * idg[k] * (rhs[idx] - (
+					(x[idx + nyz] + x[idx - nyz]) * dx2i +
+						(x[idx + nz] + x[idx - nz]) * dy2i +
+						(x[idx + 1] * dzp2i[k] + x[idx - 1] * dzm2i[k])));
+			}
+#ifndef USE_OPENMP_2D_CYCLE
+			shc = !shc;	// color parity alternates with j //
+#endif
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// sor_init_halo_omp: apply the SOR init sweep to the six one-cell-deep
+// halo strips around the [ib..ie] x [jb..je] x [kb..ke] core block.
+// Each p_* flag enables its strip; the strips are processed in the same
+// order as the original explicit sequence (west, east, south, north,
+// bottom, top) with identical bounds.
+template< typename T >
+inline void poisson3d::sor_init_halo_omp(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const T omega,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const int p_west, const int p_east,
+	const int p_south, const int p_north,
+	const int p_bottom, const int p_top)
+{
+	const int flag[6] = { p_west, p_east, p_south, p_north, p_bottom, p_top };
+	const int strip[6][6] = {
+		// { ib, ie, jb, je, kb, ke } for each strip //
+		{ ib - p_west, ib - 1, jb - p_south, je + p_north, kb - p_bottom, ke + p_top },		// west //
+		{ ie + 1, ie + p_east, jb - p_south, je + p_north, kb - p_bottom, ke + p_top },		// east //
+		{ ib, ie, jb - p_south, jb - 1, kb - p_bottom, ke + p_top },	// south //
+		{ ib, ie, je + 1, je + p_north, kb - p_bottom, ke + p_top },	// north //
+		{ ib, ie, jb, je, kb - p_bottom, kb - 1 },	// bottom //
+		{ ib, ie, jb, je, ke + 1, ke + p_top }		// top //
+	};
+
+	for (int m = 0; m < 6; m++) {
+		if (flag[m]) {
+			sor_init_omp(x, rhs, idg, omega, color, nx, ny, nz,
+				strip[m][0], strip[m][1],
+				strip[m][2], strip[m][3],
+				strip[m][4], strip[m][5]);
+		}
+	}
+}
+// -------------------------------------------------------------------- //
+
+// sor_cycle_init_halo_omp: first full SOR update over the six halo strips
+// surrounding the core block (west, east, south, north, bottom, top), each
+// enabled by its p_* flag; delegates to sor_cycle_init_omp per strip.
+// Strip bounds overlap at edges the same way as in sor_init_halo_omp.
+template< typename T >
+inline void poisson3d::sor_cycle_init_halo_omp(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const T omega,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const int p_west, const int p_east,
+	const int p_south, const int p_north,
+	const int p_bottom, const int p_top,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	// west strip //
+	if (p_west) {
+		sor_cycle_init_omp(x, rhs, idg, omega, color, nx, ny, nz,
+			ib - p_west, ib - 1,
+			jb - p_south, je + p_north,
+			kb - p_bottom, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// east strip //
+	if (p_east) {
+		sor_cycle_init_omp(x, rhs, idg, omega, color, nx, ny, nz,
+			ie + 1, ie + p_east,
+			jb - p_south, je + p_north,
+			kb - p_bottom, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// south strip //
+	if (p_south) {
+		sor_cycle_init_omp(x, rhs, idg, omega, color, nx, ny, nz,
+			ib, ie,
+			jb - p_south, jb - 1,
+			kb - p_bottom, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// north strip //
+	if (p_north) {
+		sor_cycle_init_omp(x, rhs, idg, omega, color, nx, ny, nz,
+			ib, ie,
+			je + 1, je + p_north,
+			kb - p_bottom, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// bottom strip //
+	if (p_bottom) {
+		sor_cycle_init_omp(x, rhs, idg, omega, color, nx, ny, nz,
+			ib, ie,
+			jb, je,
+			kb - p_bottom, kb - 1,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// top strip //
+	if (p_top) {
+		sor_cycle_init_omp(x, rhs, idg, omega, color, nx, ny, nz,
+			ib, ie,
+			jb, je,
+			ke + 1, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+}
+// -------------------------------------------------------------------- //
+
+// sor_cycle_halo_omp: one relaxed SOR color sweep over the six halo strips
+// surrounding the core block, each enabled by its p_* flag; delegates to
+// sor_cycle_omp per strip with the same bounds as the init-halo variants.
+template< typename T >
+inline void poisson3d::sor_cycle_halo_omp(T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg,
+	const T omega,
+	const int color,
+	const int nx, const int ny, const int nz,
+	const int ib, const int ie,
+	const int jb, const int je,
+	const int kb, const int ke,
+
+	const int p_west, const int p_east,
+	const int p_south, const int p_north,
+	const int p_bottom, const int p_top,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i)
+{
+	// west strip //
+	if (p_west) {
+		sor_cycle_omp(x, rhs, idg, omega, color, nx, ny, nz,
+			ib - p_west, ib - 1,
+			jb - p_south, je + p_north,
+			kb - p_bottom, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// east strip //
+	if (p_east) {
+		sor_cycle_omp(x, rhs, idg, omega, color, nx, ny, nz,
+			ie + 1, ie + p_east,
+			jb - p_south, je + p_north,
+			kb - p_bottom, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// south strip //
+	if (p_south) {
+		sor_cycle_omp(x, rhs, idg, omega, color, nx, ny, nz,
+			ib, ie,
+			jb - p_south, jb - 1,
+			kb - p_bottom, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// north strip //
+	if (p_north) {
+		sor_cycle_omp(x, rhs, idg, omega, color, nx, ny, nz,
+			ib, ie,
+			je + 1, je + p_north,
+			kb - p_bottom, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// bottom strip //
+	if (p_bottom) {
+		sor_cycle_omp(x, rhs, idg, omega, color, nx, ny, nz,
+			ib, ie,
+			jb, je,
+			kb - p_bottom, kb - 1,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+	// top strip //
+	if (p_top) {
+		sor_cycle_omp(x, rhs, idg, omega, color, nx, ny, nz,
+			ib, ie,
+			jb, je,
+			ke + 1, ke + p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+	}
+}
+// -------------------------------------------------------------------- //
diff --git a/pois-sor3d.cpp b/pois-sor3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..375c387d8a1780c87312d5ab3e7b2671ca5850c7
--- /dev/null
+++ b/pois-sor3d.cpp
@@ -0,0 +1,504 @@
+#include "pois-sor3d.h"
+#include "grid-common3d.h"
+#include "pois-bc3d.h"
+#include "pois-sor-base3d.h"
+
+
+namespace poisson3d
+{
+	// * SOR Red-Black call-partition * //
+	template< typename T >
+	void sor_start_omp(T* _RESTRICT x,
+		T* _RESTRICT rhs, const T* _RESTRICT const idg, const T omega,
+
+		const int color_mode,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+		const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc);
+
+	template< typename T >
+	void sor_run_omp(T* _RESTRICT x,
+		const T* _RESTRICT const rhs, const T* _RESTRICT const idg, const T omega,
+
+		const int type, const int color_mode, const int piters,
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+		const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc);
+	// -------------------------------------------------------------------- //
+}
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+// * SOR Red-Black preconditioner for Poisson equation * //
+// sor_redblack_omp: top-level red-black SOR preconditioner driver.
+// When type == c_sor_init the optimized start phase (zero initial x) runs
+// first; the main iteration loop always follows. Must be called from
+// inside an OpenMP parallel region (the workers use omp worksharing).
+template< typename T >
+void poisson3d::sor_redblack_omp(
+	T* _RESTRICT x,
+	T* _RESTRICT rhs, const T* _RESTRICT const idg, const T omega,
+
+	const int type, const int color_mode, const int piters,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc)
+{
+	if (type == c_sor_init) {
+		// x assumed zero: combined red-black start sweep //
+		sor_start_omp(x, rhs, idg, omega, color_mode,
+			nx, ny, nz, gcx, gcy, gcz,
+			dx2i, dy2i,
+			dzp2i, dzm2i,
+
+			mpi_com, bc);
+	}
+
+	sor_run_omp(x, rhs, idg, omega,
+		type, color_mode, piters,
+		nx, ny, nz, gcx, gcy, gcz,
+		dx2i, dy2i,
+		dzp2i, dzm2i,
+
+		mpi_com, bc);
+}
+// ------------------------------------------------------------------------ //
+
+// * SOR-RB starting routine - some optimizations for case x = 0 * //
+// sor_start_omp: optimized combined red-black start sweep assuming x = 0.
+// Two paths: a simple path for a single MPI rank or small problems, and an
+// MPI path that overlaps the x-, then y-, then z-direction rhs halo
+// exchanges with interior computation. Must be called from inside an
+// OpenMP parallel region.
+template< typename T >
+void poisson3d::sor_start_omp(
+	T* _RESTRICT x,
+	T* _RESTRICT rhs, const T* _RESTRICT const idg, const T omega,
+
+	const int color_mode,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc)
+{
+	// color_mode selects which color is treated as "red" (first) //
+	const int c_red = color_mode;
+	const int c_black = !color_mode;
+
+	// interior (non-ghost) index bounds //
+	const int ib = gcx, ie = nx - gcx - 1;
+	const int jb = gcy, je = ny - gcy - 1;
+	const int kb = gcz, ke = nz - gcz - 1;
+	const int nsize = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1);
+
+	// Special cases //
+	// ----------------------------------------------------------- //
+	if ((mpi_com.size == 1) ||				// single MPI processor GS //
+		(nsize <= c_small_poisson_size))	// small problem size GS //
+	{
+		// zero x over interior + enabled halo strips //
+		nse::null_halo_omp(x, nx, ny, nz,
+			ib - bc.p_west, ie + bc.p_east,
+			jb - bc.p_south, je + bc.p_north,
+			kb - bc.p_bottom, ke + bc.p_top);
+
+		// ghost exchange & periodicity //
+		// only [red] for combined red-black single sweep //
+		if (mpi_com.size == 1) {
+			if (bc.x_periodic) {
+				nse::apply_periodic_x_omp(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+					1, 1, 1);
+#pragma omp barrier
+			}
+			if (bc.y_periodic) {
+				nse::apply_periodic_y_omp(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+					1, 1, 1);
+#pragma omp barrier
+			}
+			if (bc.z_periodic) {
+				nse::apply_periodic_z_omp(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+					1, 1, 1);
+#pragma omp barrier
+			}
+		}
+		else
+		{
+			// small multi-rank problem: plain blocking color exchange //
+			mpi_com.exchange_color_halo(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+				1, 1, 1, bc.x_periodic, bc.y_periodic, bc.z_periodic);
+		}
+
+		// init: [red] with shift = - 1 //
+		sor_init_omp(x, rhs, idg, omega, c_red, nx, ny, nz,
+			ib - bc.p_west, ie + bc.p_east,
+			jb - bc.p_south, je + bc.p_north,
+			kb - bc.p_bottom, ke + bc.p_top);
+
+#pragma omp barrier
+
+		// [black] with shift = 0 //
+		sor_cycle_init_omp(x, rhs, idg, omega, c_black, nx, ny, nz,
+			ib, ie,
+			jb, je,
+			kb, ke,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+
+#pragma omp barrier
+
+		return;
+	}
+	// ----------------------------------------------------------- //
+
+
+	// MPI-Async[x,y] SOR //
+	// ----------------------------------------------------------- //
+	const int num_omp_threads = omp_get_num_threads();
+	MPI_Request mpi_req[4];
+
+	// ghost exchange & periodicity //
+	// only [red] for combined red-black single sweep //
+
+	// -x: push exchange [rhs] [width=1, periodic=yes] //
+	mpi_com.push_exchange_color_halo_x(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+		1, 1, 1, bc.x_periodic, mpi_req)
+
+	// NOTE(review): with several threads the master completes the x-exchange
+	// here (presumably to overlap communication with the interior init below);
+	// no barrier follows, worker threads proceed directly - confirm intended //
+#pragma omp master
+	{
+		if ((mpi_com.size_x > 1) && (num_omp_threads > 1)) {
+			MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
+			for (int k = 0; k < 4; k++)
+				mpi_req[k] = MPI_REQUEST_NULL;
+		}
+	}
+
+	nse::null_halo_omp(x, nx, ny, nz,
+		ib - bc.p_west, ie + bc.p_east,
+		jb - bc.p_south, je + bc.p_north,
+		kb - bc.p_bottom, ke + bc.p_top);
+
+	// [Red] init main block //
+	sor_init_omp(x, rhs, idg, omega, c_red, nx, ny, nz,
+		ib, ie,
+		jb, je,
+		kb, ke);
+
+	// -x: pop exchange [rhs] [width=1, periodic=yes] //
+	mpi_com.pop_exchange_color_halo_x(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+		1, 1, 1, bc.x_periodic, mpi_req);
+
+	// -y: push exchange [rhs] [width=1, periodic=yes] //
+	mpi_com.push_exchange_color_halo_y(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+		1, 1, 1, bc.y_periodic, mpi_req);
+
+#pragma omp master
+	{
+		if ((mpi_com.size_y > 1) && (num_omp_threads > 1)) {
+			MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
+			for (int k = 0; k < 4; k++)
+				mpi_req[k] = MPI_REQUEST_NULL;
+		}
+	}
+
+	// [Black] main block //
+	// interior shrunk by one cell on sides with halos (their values are not ready yet) //
+	sor_cycle_init_omp(x, rhs, idg, omega, c_black, nx, ny, nz,
+		ib + bc.p_west, ie - bc.p_east,
+		jb + bc.p_south, je - bc.p_north,
+		kb + bc.p_bottom, ke - bc.p_top,
+
+		dx2i, dy2i,
+		dzp2i, dzm2i);
+
+	// -y: pop exchange [rhs] [width=1, periodic=yes] //
+	mpi_com.pop_exchange_color_halo_y(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+		1, 1, 1, bc.y_periodic, mpi_req);
+
+	// -z: push exchange [rhs] [width=1, periodic=yes] //
+	mpi_com.push_exchange_color_halo_z(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+		1, 1, 1, bc.z_periodic, mpi_req);
+
+	// -z: pop exchange [rhs] [width=1, periodic=yes] //
+	mpi_com.pop_exchange_color_halo_z(rhs, c_red, nx, ny, nz, gcx, gcy, gcz,
+		1, 1, 1, bc.z_periodic, mpi_req);
+
+	// MPI-halo 
+	// ----------------------------------------------------------- //
+	// [Red] halo //
+	sor_init_halo_omp(x, rhs, idg, omega, c_red, nx, ny, nz,
+		ib, ie,
+		jb, je,
+		kb, ke,
+		bc.p_west, bc.p_east, bc.p_south, bc.p_north, bc.p_bottom, bc.p_top);
+
+#pragma omp barrier
+
+	// [Black] halo //
+	sor_cycle_init_halo_omp(x, rhs, idg, omega, c_black, nx, ny, nz,
+		ib + bc.p_west, ie - bc.p_east,
+		jb + bc.p_south, je - bc.p_north,
+		kb + bc.p_bottom, ke - bc.p_top,
+
+		bc.p_west, bc.p_east, bc.p_south, bc.p_north, bc.p_bottom, bc.p_top,
+
+		dx2i, dy2i,
+		dzp2i, dzm2i);
+
+#pragma omp barrier
+
+	// ----------------------------------------------------------- //
+}
+
+
+// * SOR-RB main cycle iteration * //
+// * SOR-RB main cycle: OpenMP worker body. Must be called by ALL threads of
+//   an active parallel team (relies on omp barriers and an omp-master
+//   section for MPI waits). *
+// NOTE(review): presumably every thread passes identical arguments -- confirm at call sites.
+template< typename T >
+void poisson3d::sor_run_omp(
+	T* _RESTRICT x,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const idg, const T omega,
+
+	const int type, const int color_mode, const int piters,
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc)
+{
+	// in "continue" mode all [piters] sweeps run here; otherwise one sweep
+	// is accounted to the separate init pass and only (piters - 1) remain
+	const int niters = (type == c_sor_continue) ? piters : piters - 1;
+
+	// color_mode selects which checkerboard color is relaxed first
+	const int c_red = color_mode;
+	const int c_black = !color_mode;
+
+	// interior index bounds (inclusive) and local interior size
+	const int ib = gcx, ie = nx - gcx - 1;
+	const int jb = gcy, je = ny - gcy - 1;
+	const int kb = gcz, ke = nz - gcz - 1;
+	const int nsize = (ie - ib + 1) * (je - jb + 1) * (ke - kb + 1);
+
+
+	// Special cases //
+	// ----------------------------------------------------------- //
+	if ((mpi_com.size == 1) ||				// single MPI processor GS //
+		(nsize <= c_small_poisson_size))	// small problem size GS //
+	{
+		for (int m = 0; m < niters; m++)
+		{
+			// boundary conditions [red (previous), black]
+			put_bc_omp(x, nx, ny, nz, gcx, gcy, gcz,
+				mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
+				mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,
+				bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);
+
+#pragma omp barrier
+
+			// ghost exchange & periodicity //
+			// exchange [black] [width=2, periodic=yes] //
+			// exchange [red] [width=1, periodic=yes]	//
+			//		- additional exchange due to SOR-omega dependencies //
+			if (mpi_com.size == 1) {
+				if (bc.x_periodic) {
+					nse::apply_periodic_x_omp(x, nx, ny, nz, gcx, gcy, gcz,
+						2, 2, 2);
+#pragma omp barrier
+				}
+				if (bc.y_periodic) {
+					nse::apply_periodic_y_omp(x, nx, ny, nz, gcx, gcy, gcz,
+						2, 2, 2);
+#pragma omp barrier
+				}
+				if (bc.z_periodic) {
+					nse::apply_periodic_z_omp(x, nx, ny, nz, gcx, gcy, gcz,
+						2, 2, 2);
+#pragma omp barrier
+				}
+			}
+			else
+			{
+				mpi_com.exchange_halo(x, nx, ny, nz, gcx, gcy, gcz,
+					2, 2, 2, bc.x_periodic, bc.y_periodic, bc.z_periodic);
+			}
+
+			// [red] with shift = - 1 //	
+			sor_cycle_omp(x, rhs, idg, omega, c_red, nx, ny, nz,
+				ib - bc.p_west, ie + bc.p_east,
+				jb - bc.p_south, je + bc.p_north,
+				kb - bc.p_bottom, ke + bc.p_top,
+
+				dx2i, dy2i,
+				dzp2i, dzm2i);
+
+#pragma omp barrier
+
+			// [black] with shift = 0 //
+			sor_cycle_omp(x, rhs, idg, omega, c_black, nx, ny, nz,
+				ib, ie,
+				jb, je,
+				kb, ke,
+
+				dx2i, dy2i,
+				dzp2i, dzm2i);
+
+
+			// barrier needed only before the next sweep's boundary update
+			if (m < niters - 1) {
+#pragma omp barrier
+			}
+		}
+
+		return;
+	}
+	// ----------------------------------------------------------- //
+
+
+	// MPI-Async[x,y] GS //
+	// ----------------------------------------------------------- //
+	const int num_omp_threads = omp_get_num_threads();
+	// NOTE(review): assumes push/pop_exchange_halo_* use at most 4 MPI
+	// requests per direction -- confirm against mpiCom3d implementation
+	MPI_Request mpi_req[4];
+
+	for (int m = 0; m < niters; m++)
+	{
+		// boundary conditions [red (previous), black]
+		put_bc_omp(x, nx, ny, nz, gcx, gcy, gcz,
+			mpi_com.rank_x, mpi_com.rank_y, mpi_com.rank_z,
+			mpi_com.size_x, mpi_com.size_y, mpi_com.size_z,
+			bc.type, bc.x_periodic, bc.y_periodic, bc.z_periodic);
+
+#pragma omp barrier
+
+		// ghost exchange & periodicity //
+		// [black] [width=2, periodic=yes]	//
+		// [red] [width=1, periodic=yes]	//
+		//		- additional exchange due to SOR-omega dependencies //
+
+		// -x: push exchange: [black][2], [red][1] [periodic=yes] //
+		mpi_com.push_exchange_halo_x(x, nx, ny, nz, gcx, gcy, gcz,
+			2, 2, 2, bc.x_periodic, mpi_req);
+
+		// master thread completes the -x exchange while the team overlaps
+		// it with the [Red] interior sweep below
+#pragma omp master
+		{
+			if ((mpi_com.size_x > 1) && (num_omp_threads > 1)) {
+				MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
+				for (int k = 0; k < 4; k++)
+					mpi_req[k] = MPI_REQUEST_NULL;
+			}
+		}
+
+		// [Red] main block //
+		sor_cycle_omp(x, rhs, idg, omega, c_red, nx, ny, nz,
+			ib + bc.p_west, ie - bc.p_east,
+			jb + bc.p_south, je - bc.p_north,
+			kb + bc.p_bottom, ke - bc.p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+
+		// -x: pop exchange: [black][2], [red][1] [periodic=yes] //
+		mpi_com.pop_exchange_halo_x(x, nx, ny, nz, gcx, gcy, gcz,
+			2, 2, 2, bc.x_periodic, mpi_req);
+
+		// -y: push exchange: [black][2], [red][1] [periodic=yes] //
+		mpi_com.push_exchange_halo_y(x, nx, ny, nz, gcx, gcy, gcz,
+			2, 2, 2, bc.y_periodic, mpi_req);
+
+#pragma omp master
+		{
+			if ((mpi_com.size_y > 1) && (num_omp_threads > 1)) {
+				MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
+				for (int k = 0; k < 4; k++)
+					mpi_req[k] = MPI_REQUEST_NULL;
+			}
+		}
+
+		// [Black] main block //
+		sor_cycle_omp(x, rhs, idg, omega, c_black, nx, ny, nz,
+			ib + 2 * bc.p_west, ie - 2 * bc.p_east,
+			jb + 2 * bc.p_south, je - 2 * bc.p_north,
+			kb + 2 * bc.p_bottom, ke - 2 * bc.p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+
+		// -y: pop exchange: [black][2], [red][1] [periodic=yes] //
+		mpi_com.pop_exchange_halo_y(x, nx, ny, nz, gcx, gcy, gcz,
+			2, 2, 2, bc.y_periodic, mpi_req);
+
+		// -z: push exchange: [black][2], [red][1] [periodic=yes] //
+		mpi_com.push_exchange_halo_z(x, nx, ny, nz, gcx, gcy, gcz,
+			2, 2, 2, bc.z_periodic, mpi_req);
+
+		// -z: pop exchange: [black][2], [red][1] [periodic=yes] //
+		mpi_com.pop_exchange_halo_z(x, nx, ny, nz, gcx, gcy, gcz,
+			2, 2, 2, bc.z_periodic, mpi_req);
+
+		// MPI-halo 
+		// ----------------------------------------------------------- //
+		// [Red] halo //
+		sor_cycle_halo_omp(x, rhs, idg, omega, c_red, nx, ny, nz,
+			ib + bc.p_west, ie - bc.p_east,
+			jb + bc.p_south, je - bc.p_north,
+			kb + bc.p_bottom, ke - bc.p_top,
+
+			2 * bc.p_west, 2 * bc.p_east,
+			2 * bc.p_south, 2 * bc.p_north,
+			2 * bc.p_bottom, 2 * bc.p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+
+#pragma omp barrier
+
+		// [Black] halo //
+		sor_cycle_halo_omp(x, rhs, idg, omega, c_black, nx, ny, nz,
+			ib + 2 * bc.p_west, ie - 2 * bc.p_east,
+			jb + 2 * bc.p_south, je - 2 * bc.p_north,
+			kb + 2 * bc.p_bottom, ke - 2 * bc.p_top,
+
+			2 * bc.p_west, 2 * bc.p_east,
+			2 * bc.p_south, 2 * bc.p_north,
+			2 * bc.p_bottom, 2 * bc.p_top,
+
+			dx2i, dy2i,
+			dzp2i, dzm2i);
+
+		// barrier needed only before the next sweep's boundary update
+		if (m < niters - 1) {
+#pragma omp barrier
+		}
+
+	}
+	// ----------------------------------------------------------- //
+}
+
+// Initialization //
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiations: SOR-preconditioner [float, double] * //
+template void poisson3d::sor_redblack_omp(
+	float* _RESTRICT x,
+	float* _RESTRICT rhs, const float* _RESTRICT const idg, const float omega,
+
+	const int type, const int color_mode, const int piters,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const float dx2i, const float dy2i,
+	const float* _RESTRICT const dzp2i, const float* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc);
+
+template void poisson3d::sor_redblack_omp(
+	double* _RESTRICT x,
+	double* _RESTRICT rhs, const double* _RESTRICT const idg, const double omega,
+
+	const int type, const int color_mode, const int piters,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+	const double dx2i, const double dy2i,
+	const double* _RESTRICT const dzp2i, const double* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com,
+	const nse::poisson_dynamic_bc& bc);
+// ------------------------------------------------------------------------ //
diff --git a/pois-sor3d.h b/pois-sor3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd5151d1d37d7decce31cd6501bf4a1e00e13480
--- /dev/null
+++ b/pois-sor3d.h
@@ -0,0 +1,86 @@
+#pragma once
+
+// [pois-sor3d.h]: 3D Poisson SOR
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "mpi-com3d.h"
+#include "pois-def3d.h"
+
+
+namespace poisson3d
+{
+	// * SOR modes * //
+	// c_sor_init: first call (includes the initialization sweep);
+	// c_sor_continue: all [piters] sweeps run in the main cycle
+	const int c_sor_init = 0;
+	const int c_sor_continue = 1;
+	// -------------------------------------------------------------------- //
+
+	// * SOR Red-Black preconditioner (serial entry point: joins or spawns
+	//   an OpenMP team as needed; see implementation below) * //
+	template< typename T >
+	void sor_redblack(T* _RESTRICT x,
+		T* _RESTRICT rhs, const T* _RESTRICT const idg, const T omega,
+
+		const int type, const int color_mode, const int piters,
+
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+		const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc);
+	// -------------------------------------------------------------------- //
+}
+
+// OpenMP //
+namespace poisson3d
+{
+	// * SOR Red-Black preconditioner (OpenMP worker: must be called by
+	//   all threads of an active parallel region) * //
+	template< typename T >
+	void sor_redblack_omp(T* _RESTRICT x,
+		T* _RESTRICT rhs, const T* _RESTRICT const idg, const T omega,
+
+		const int type, const int color_mode, const int piters,
+
+		const int nx, const int ny, const int nz,
+		const int gcx, const int gcy, const int gcz,
+
+		const T dx2i, const T dy2i,
+		const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+		const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc);
+	// -------------------------------------------------------------------- //
+}
+
+// Implementation //
+// -------------------------------------------------------------------- //
+// sor_redblack: serial/parallel dispatch wrapper.
+// If the caller is already inside an OpenMP parallel region, the calling
+// thread joins the collective worker directly; otherwise a team is spawned
+// for the duration of the call.
+template< typename T >
+void poisson3d::sor_redblack(T* _RESTRICT x,
+	T* _RESTRICT rhs, const T* _RESTRICT const idg, const T omega,
+
+	const int type, const int color_mode, const int piters,
+
+	const int nx, const int ny, const int nz,
+	const int gcx, const int gcy, const int gcz,
+
+	const T dx2i, const T dy2i,
+	const T* _RESTRICT const dzp2i, const T* _RESTRICT const dzm2i,
+
+	const nse::mpiCom3d& mpi_com, const nse::poisson_dynamic_bc& bc)
+{
+	// guard clause: already a member of a parallel team
+	if (omp_in_parallel()) {
+		sor_redblack_omp(x, rhs, idg, omega, type, color_mode, piters,
+			nx, ny, nz, gcx, gcy, gcz,
+			dx2i, dy2i, dzp2i, dzm2i, mpi_com, bc);
+		return;
+	}
+
+	// serial caller: spawn a team for this call only
+#pragma omp parallel shared( x, rhs )
+	{
+		sor_redblack_omp(x, rhs, idg, omega, type, color_mode, piters,
+			nx, ny, nz, gcx, gcy, gcz,
+			dx2i, dy2i, dzp2i, dzm2i, mpi_com, bc);
+	}
+}
diff --git a/pois3d-x4.cpp b/pois3d-x4.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..06d74129fd4ff84bd8b53c45f3b9c4e276a41a83
--- /dev/null
+++ b/pois3d-x4.cpp
@@ -0,0 +1,252 @@
+#include "pois3d-x4.h"
+
+#include "vecmath.h"
+#include "mpi-vecmath.h"
+#include "pois-base3d-x4.h"
+#include "pois-gs3d.h"
+#include "pois-sor3d.h"
+#include "pois-mg3d.h"
+
+#include <string.h>
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+
+// * BiCGstab MG(MPI) SOR (Red-Black) -X4 * //
+// x: initial guess / solution; rhs: right-hand side;
+// memory[0..5]: residual, residual*, p, q, v, w work vectors (size N).
+// Returns: iteration count (>= 0) on convergence, negative on breakdown
+// (rho == 0 or gamma == 0) or when maxiters is exceeded.
+template< typename T >
+int nse::bicg_mg_sor_redblack_x4(
+	T* x, const T* const rhs, T** memory,
+	const poisOpt3d< T >& pois_opt,
+	const wstGrid3d< T >& grid,
+	mg_mpi_poisson3d_data< T >& mg_data,
+	T* resnorm)
+{
+	// optionally exclude the -x ghost layers from vector reductions/updates;
+	// mem_ptr is rounded down to the allocation alignment boundary
+#ifdef _POIS3D_EXCLUDE_XGHOST_VEC
+#ifdef ALIGN_ALLOCATION
+	const int mem_ptr = (grid.gcx * grid.nyz) -
+		((grid.gcx * grid.nyz) % (ALIGN_ALLOCATION / sizeof(T)));
+#else
+	const int mem_ptr = grid.gcx * grid.nyz;
+#endif
+#else
+	const int mem_ptr = 0;
+#endif
+	const int mem_size = grid.size - 2 * mem_ptr;
+
+	poisson_dynamic_bc bc;
+	bc.init(pois_opt.bc_type,
+		grid.mpi_com.rank_x, grid.mpi_com.rank_y, grid.mpi_com.rank_z,
+		grid.mpi_com.size_x, grid.mpi_com.size_y, grid.mpi_com.size_z);
+
+	T alpha, beta, gamma, delta, epsilon, rho, rho_star;
+	T norm_star, norm_current = (T) 0.0;
+
+	T *residual = memory[0],
+		*residual_star = memory[1],
+		*p = memory[2],
+		*q = memory[3],
+		*v = memory[4],
+		*w = memory[5];
+
+
+	// initial residual: r = rhs (null guess) or r = rhs - A*x
+	if (pois_opt.init_mode == isInitNull)
+		memcpy(&residual[mem_ptr], &rhs[mem_ptr], mem_size * sizeof(T));
+	else
+		if (pois_opt.init_mode == isInitPreset)
+		{
+			poisson3d::resvec_x4(residual, x, rhs, grid.nx, grid.ny, grid.nz,
+				grid.gcx, grid.nx - grid.gcx - 1,
+				grid.gcy, grid.ny - grid.gcy - 1,
+				grid.gcz, grid.nz - grid.gcz - 1,
+
+				grid.dx2i, grid.dy2i,
+				grid.dzp2i, grid.dzm2i);
+		}
+		else
+		{
+			poisson3d::laplace_residual_x4(residual, x, rhs,
+				grid.nx, grid.ny, grid.nz,
+				grid.gcx, grid.gcy, grid.gcz,
+				grid.dx2i, grid.dy2i,
+				grid.dzp2i, grid.dzm2i,
+
+				grid.mpi_com, bc);
+		}
+
+	// fused reduction: initial norm and rho = (r, r)
+	if (pois_opt.norm_mode == isNormL2) {
+		mpi_lnorm_and_sqr_sum(&residual[mem_ptr], mem_size, grid.mpi_com.comm,
+			&norm_star, &rho);
+	}
+	else
+	{
+		mpi_cnorm_and_sqr_sum(&residual[mem_ptr], mem_size, grid.mpi_com.comm,
+			&norm_star, &rho);
+	}
+
+	// FIX: treat an exactly zero initial residual as convergence; previously
+	// (with miniters > 0) it fell through to the rho == 0 check and was
+	// misreported as a breakdown (-1). Matches the MG(MPI) variant in pois3d.cpp.
+	if ((norm_star == (T)0) ||
+		((pois_opt.miniters <= 0) && (norm_star < pois_opt.abstol))) {
+		(*resnorm) = norm_star;
+		return 0;
+	}
+
+	memcpy(&residual_star[mem_ptr], &residual[mem_ptr], mem_size * sizeof(T));
+
+	if (rho == (T)0) {
+		(*resnorm) = norm_star;
+		return -1;
+	}
+
+	// q = M^{-1} r* (first search direction; p is formed at the end of step 1)
+	poisson3d::mg_sor_redblack(q, residual_star, pois_opt.piters,
+		mg_data);
+
+	for (int k = 1; k <= pois_opt.maxiters; k++)
+	{
+#ifndef _POIS3D_LAPLACE_WITH_DP
+		poisson3d::laplace_x4(v, q,
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+
+		gamma = mpi_dot_product(&residual_star[mem_ptr], &v[mem_ptr],
+			mem_size, grid.mpi_com.comm);
+#else
+		// fused: v = A*q and gamma = (r*, v)
+		gamma = poisson3d::laplace_dp_x4(v, q, residual_star,
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+#endif
+		alpha = rho / gamma;
+
+#ifndef _POIS3D_COMBINE_UPDATES
+		update(&x[mem_ptr], alpha, &q[mem_ptr], mem_size);
+		update(&residual[mem_ptr], -alpha, &v[mem_ptr], mem_size);
+#else
+		update(&x[mem_ptr], &residual[mem_ptr],
+			alpha, -alpha, &q[mem_ptr], &v[mem_ptr], mem_size);
+#endif
+
+		// - additional skirmish norm check //
+#ifdef _POIS3D_SKIRMISH_NORM
+
+		if (pois_opt.norm_mode == isNormL2) {
+			norm_current = mpi_lnorm(&residual[mem_ptr],
+				mem_size, grid.mpi_com.comm);
+		}
+		else
+		{
+			norm_current = mpi_cnorm(&residual[mem_ptr],
+				mem_size, grid.mpi_com.comm);
+		}
+
+		if ((k >= pois_opt.miniters) && (norm_current < pois_opt.abstol)) {
+			(*resnorm) = norm_current;
+			return k;
+		}
+#endif
+		// --------------------------------- //
+
+		// stabilization half-step: q = M^{-1} r, w = A*q
+		poisson3d::mg_sor_redblack(q, residual, pois_opt.piters,
+			mg_data);
+
+		poisson3d::laplace_x4(w, q,
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+
+		// (w, w) and (w, residual)
+		mpi_sqr_sum_and_dp(&w[mem_ptr], &residual[mem_ptr],
+			mem_size, grid.mpi_com.comm,
+			&delta, &epsilon);
+
+		gamma = epsilon / delta;
+
+#ifndef _POIS3D_COMBINE_UPDATES
+		update(&x[mem_ptr], gamma, &q[mem_ptr], mem_size);
+		update(&residual[mem_ptr], -gamma, &w[mem_ptr], mem_size);
+#else
+		update(&x[mem_ptr], &residual[mem_ptr],
+			gamma, -gamma, &q[mem_ptr], &w[mem_ptr], mem_size);
+#endif
+
+		if (gamma == (T)0) {
+			(*resnorm) = norm_current;
+			return -k - 1;
+		}
+
+		rho_star = rho;
+
+		// fused reduction: residual norm and rho = (r, r*)
+		if (pois_opt.norm_mode == isNormL2) {
+			mpi_lnorm_and_dp(&residual[mem_ptr], &residual_star[mem_ptr],
+				mem_size, grid.mpi_com.comm,
+				&norm_current, &rho);
+		}
+		else
+		{
+			mpi_cnorm_and_dp(&residual[mem_ptr], &residual_star[mem_ptr],
+				mem_size, grid.mpi_com.comm,
+				&norm_current, &rho);
+		}
+
+		if ((k >= pois_opt.miniters) &&
+			((norm_current < pois_opt.retol * norm_star) ||
+			(norm_current < pois_opt.abstol)))
+		{
+			(*resnorm) = norm_current;
+			return k;
+		}
+
+		if (rho == (T)0) {
+			(*resnorm) = norm_current;
+			return -k - 1;
+		}
+
+		beta = (rho / rho_star) * (alpha / gamma);
+
+		// at k == 1, p has not been formed yet: seed it from r* instead
+		if (k == 1)
+			assign(&p[mem_ptr], (T) 1.0, &residual[mem_ptr], beta, &residual_star[mem_ptr],
+				-gamma * beta, &v[mem_ptr], mem_size);
+		else
+			assign(&p[mem_ptr], (T) 1.0, &residual[mem_ptr], beta, &p[mem_ptr],
+				-gamma * beta, &v[mem_ptr], mem_size);
+
+		poisson3d::mg_sor_redblack(q, p, pois_opt.piters,
+			mg_data);
+	}
+
+	(*resnorm) = norm_current;
+	return -pois_opt.maxiters;
+}
+// ------------------------------------------------------------------------ //
+
+
+// ------------------------------------------------------------------------ //
+// ------------------------------------------------------------------------ //
+
+
+// Initialization //
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiations: BiCGstab Multigrid(MPI) SOR Red-Black -X4
+//   [float, double] * //
+template int nse::bicg_mg_sor_redblack_x4(
+	float* x, const float* const rhs, float** memory,
+	const poisOpt3d< float >& pois_opt,
+	const wstGrid3d< float >& grid,
+	mg_mpi_poisson3d_data< float >& mg_data,
+	float* resnorm);
+
+template int nse::bicg_mg_sor_redblack_x4(
+	double* x, const double* const rhs, double** memory,
+	const poisOpt3d< double >& pois_opt,
+	const wstGrid3d< double >& grid,
+	mg_mpi_poisson3d_data< double >& mg_data,
+	double* resnorm);
+// ------------------------------------------------------------------------ //
diff --git a/pois3d-x4.h b/pois3d-x4.h
new file mode 100644
index 0000000000000000000000000000000000000000..487e3c92f9cf457bc99e89af87a7a2030ae31a3e
--- /dev/null
+++ b/pois3d-x4.h
@@ -0,0 +1,25 @@
+#pragma once
+
+// [pois3d-x4.h]: 3D Poisson[X4] Krylov solvers 
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "wstgrid3d.h"
+#include "pois-def3d.h"
+#include "mg-data3d.h"
+
+
+namespace nse
+{
+	// -------------------------------------------------------------------- //
+	// N - grid size
+
+	// * BiCGstab-Multigrid(MPI) GS-SOR (Red-Black) memory req.: [ 6 * N + MG() ] * // 
+	// memory[0..5]: residual, residual*, p, q, v, w work vectors;
+	// returns iteration count (>= 0) on convergence, negative on failure
+	template< typename T >
+	int bicg_mg_sor_redblack_x4(T* x, const T* const rhs, T** memory,
+		const poisOpt3d< T >& pois_opt,
+		const wstGrid3d< T >& grid,
+		mg_mpi_poisson3d_data< T >& mg_data,
+		T* resnorm);
+	// -------------------------------------------------------------------- //
+}
diff --git a/pois3d.cpp b/pois3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0154741b80dae1d041b1ae3e05b33a9ab585f7d9
--- /dev/null
+++ b/pois3d.cpp
@@ -0,0 +1,693 @@
+#include "pois3d.h"
+
+#include "vecmath.h"
+#include "mpi-vecmath.h"
+#include "pois-base3d.h"
+#include "pois-gs3d.h"
+#include "pois-sor3d.h"
+#include "pois-mg3d.h"
+
+#include <string.h>
+
+// Implementation //
+// -------------------------------------------------------------------- //
+
+
+// * BiCGstab SOR (Red-Black) preconditioned solver * //
+// x: initial guess / solution; rhs: right-hand side;
+// memory[0..5]: residual, residual*, p, q, v, w work vectors (size N);
+// memory[6]: inverse diagonal (size NZ); omega: SOR relaxation parameter.
+// Returns: iteration count (>= 0) on convergence, negative on breakdown
+// (rho == 0 or gamma == 0) or when maxiters is exceeded.
+template< typename T >
+int nse::bicg_sor_redblack(
+	T* x, const T* const rhs, T** memory,
+	const poisOpt3d< T >& pois_opt,
+	const T omega,
+	const wstGrid3d< T >& grid,
+	T* resnorm)
+{
+	poisson_dynamic_bc bc;
+	bc.init(pois_opt.bc_type,
+		grid.mpi_com.rank_x, grid.mpi_com.rank_y, grid.mpi_com.rank_z,
+		grid.mpi_com.size_x, grid.mpi_com.size_y, grid.mpi_com.size_z);
+
+	T alpha, beta, gamma, delta, epsilon, rho, rho_star;
+	T norm_star, norm_current = (T) 0.0;
+
+	// global checkerboard parity of the local subdomain origin:
+	// keeps the red/black coloring consistent across MPI ranks
+	int offset_x = grid.mpi_com.offset_x(grid.nx, grid.gcx) + grid.gcx;
+	int offset_y = grid.mpi_com.offset_y(grid.ny, grid.gcy) + grid.gcy;
+	int offset_z = grid.mpi_com.offset_z(grid.nz, grid.gcz) + grid.gcz;
+	int sgs_mode = ((offset_x + offset_y + offset_z) & 1);
+
+	T *residual = memory[0],
+		*residual_star = memory[1],
+		*p = memory[2],
+		*q = memory[3],
+		*v = memory[4],
+		*w = memory[5],
+		*idg = memory[6];
+
+
+	poisson3d::set_diagonal_inverse(idg,
+		grid.nz,
+		grid.gcz, grid.nz - grid.gcz - 1,
+
+		grid.dx2i, grid.dy2i,
+		grid.dzp2i, grid.dzm2i);
+
+	// initial residual: r = rhs (null guess) or r = rhs - A*x
+	if (pois_opt.init_mode == isInitNull)
+		memcpy(residual, rhs, grid.size * sizeof(T));
+	else
+		if (pois_opt.init_mode == isInitPreset)
+		{
+			poisson3d::resvec(residual, x, rhs, grid.nx, grid.ny, grid.nz,
+				grid.gcx, grid.nx - grid.gcx - 1,
+				grid.gcy, grid.ny - grid.gcy - 1,
+				grid.gcz, grid.nz - grid.gcz - 1,
+
+				grid.dx2i, grid.dy2i,
+				grid.dzp2i, grid.dzm2i);
+		}
+		else
+		{
+			poisson3d::laplace_residual(residual, x, rhs,
+				grid.nx, grid.ny, grid.nz,
+				grid.gcx, grid.gcy, grid.gcz,
+
+				grid.dx2i, grid.dy2i,
+				grid.dzp2i, grid.dzm2i,
+
+				grid.mpi_com, bc);
+		}
+
+	if (pois_opt.norm_mode == isNormL2) {
+		norm_star = mpi_lnorm(residual, grid.size, grid.mpi_com.comm);
+	}
+	else
+	{
+		norm_star = mpi_cnorm(residual, grid.size, grid.mpi_com.comm);
+	}
+
+	// FIX: treat an exactly zero initial residual as convergence; previously
+	// (with miniters > 0) it fell through to the rho == 0 check and was
+	// misreported as a breakdown (-1). Matches the MG(MPI) variant below.
+	if ((norm_star == (T)0) ||
+		((pois_opt.miniters <= 0) && (norm_star < pois_opt.abstol))) {
+		(*resnorm) = norm_star;
+		return 0;
+	}
+	memcpy(residual_star, residual, grid.size * sizeof(T));
+	memcpy(p, residual, grid.size * sizeof(T));
+
+	rho = mpi_dot_product(residual, residual_star,
+		grid.size, grid.mpi_com.comm);
+	if (rho == (T)0) {
+		(*resnorm) = norm_star;
+		return -1;
+	}
+
+	for (int k = 1; k <= pois_opt.maxiters; k++)
+	{
+		// preconditioner: q = M^{-1} p
+		// FIX: use the SOR(omega) red-black preconditioner (pois-sor3d.h) --
+		// the original called gs_redblack() and silently ignored [omega]
+		poisson3d::sor_redblack(q, p, idg, omega,
+			poisson3d::c_sor_init, sgs_mode, pois_opt.piters,
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+
+#ifndef _POIS3D_LAPLACE_WITH_DP
+		poisson3d::laplace(v, q,
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+
+		gamma = mpi_dot_product(residual_star, v,
+			grid.size, grid.mpi_com.comm);
+#else
+		// fused: v = A*q and gamma = (r*, v)
+		gamma = poisson3d::laplace_dp(v, q, residual_star,
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+#endif
+
+		alpha = rho / gamma;
+
+#ifndef _POIS3D_COMBINE_UPDATES
+		update(x, alpha, q, grid.size);
+		update(residual, -alpha, v, grid.size);
+#else
+		update(x, residual, alpha, -alpha, q, v, grid.size);
+#endif
+
+		// - additional skirmish norm check //
+#ifdef _POIS3D_SKIRMISH_NORM
+		if (pois_opt.norm_mode == isNormL2) {
+			norm_current = mpi_lnorm(residual, grid.size, grid.mpi_com.comm);
+		}
+		else
+		{
+			norm_current = mpi_cnorm(residual, grid.size, grid.mpi_com.comm);
+		}
+
+		if ((k >= pois_opt.miniters) && (norm_current < pois_opt.abstol)) {
+			(*resnorm) = norm_current;
+			return k;
+		}
+#endif
+		// --------------------------------- //
+
+		// stabilization half-step: q = M^{-1} r (same FIX as above)
+		poisson3d::sor_redblack(q, residual, idg, omega,
+			poisson3d::c_sor_init, sgs_mode, pois_opt.piters,
+
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+
+		poisson3d::laplace(w, q,
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+
+		// (w, w) and (w, residual)
+		mpi_sqr_sum_and_dp(w, residual, grid.size, grid.mpi_com.comm,
+			&delta, &epsilon);
+
+		gamma = epsilon / delta;
+
+#ifndef _POIS3D_COMBINE_UPDATES
+		update(x, gamma, q, grid.size);
+		update(residual, -gamma, w, grid.size);
+#else
+		update(x, residual, gamma, -gamma, q, w, grid.size);
+#endif
+
+		if (gamma == (T)0) {
+			(*resnorm) = norm_current;
+			return -k - 1;
+		}
+
+		rho_star = rho;
+
+		// fused reduction: residual norm and rho = (r, r*)
+		if (pois_opt.norm_mode == isNormL2) {
+			mpi_lnorm_and_dp(residual, residual_star, grid.size, grid.mpi_com.comm,
+				&norm_current, &rho);
+		}
+		else
+		{
+			mpi_cnorm_and_dp(residual, residual_star, grid.size, grid.mpi_com.comm,
+				&norm_current, &rho);
+		}
+
+		if ((k >= pois_opt.miniters) &&
+			(
+			(norm_current < pois_opt.retol * norm_star) ||
+				(norm_current < pois_opt.abstol)
+				))
+		{
+			(*resnorm) = norm_current;
+			return k;
+		}
+
+		if (rho == (T)0) {
+			(*resnorm) = norm_current;
+			return -k - 1;
+		}
+
+		beta = (rho / rho_star) * (alpha / gamma);
+		assign(p, (T) 1.0, residual, beta, p, -gamma * beta, v, grid.size);
+	}
+
+	(*resnorm) = norm_current;
+	return -pois_opt.maxiters;
+}
+// ------------------------------------------------------------------------ //
+
+// * BiCGstab MG SOR (Red-Black) * //
+// x: initial guess / solution; rhs: right-hand side;
+// memory[0..5]: residual, residual*, p, q, v, w work vectors (size N).
+// Returns: iteration count (>= 0) on convergence, negative on breakdown
+// (rho == 0 or gamma == 0) or when maxiters is exceeded.
+template< typename T >
+int nse::bicg_mg_sor_redblack(
+	T* x, const T* const rhs, T** memory,
+	const poisOpt3d< T >& pois_opt,
+	const wstGrid3d< T >& grid,
+	mg_poisson3d_data< T >& mg_data,
+	T* resnorm)
+{
+	poisson_dynamic_bc bc;
+	bc.init(pois_opt.bc_type,
+		grid.mpi_com.rank_x, grid.mpi_com.rank_y, grid.mpi_com.rank_z,
+		grid.mpi_com.size_x, grid.mpi_com.size_y, grid.mpi_com.size_z);
+
+	T alpha, beta, gamma, delta, epsilon, rho, rho_star;
+	T norm_star, norm_current = (T) 0.0;
+
+	T *residual = memory[0],
+		*residual_star = memory[1],
+		*p = memory[2],
+		*q = memory[3],
+		*v = memory[4],
+		*w = memory[5];
+
+
+	// initial residual: r = rhs (null guess) or r = rhs - A*x
+	if (pois_opt.init_mode == isInitNull)
+		memcpy(residual, rhs, grid.size * sizeof(T));
+	else
+		if (pois_opt.init_mode == isInitPreset)
+		{
+			poisson3d::resvec(residual, x, rhs, grid.nx, grid.ny, grid.nz,
+				grid.gcx, grid.nx - grid.gcx - 1,
+				grid.gcy, grid.ny - grid.gcy - 1,
+				grid.gcz, grid.nz - grid.gcz - 1,
+
+				grid.dx2i, grid.dy2i,
+				grid.dzp2i, grid.dzm2i);
+		}
+		else
+		{
+			poisson3d::laplace_residual(residual, x, rhs,
+				grid.nx, grid.ny, grid.nz,
+				grid.gcx, grid.gcy, grid.gcz,
+				grid.dx2i, grid.dy2i,
+				grid.dzp2i, grid.dzm2i,
+
+				grid.mpi_com, bc);
+		}
+
+	if (pois_opt.norm_mode == isNormL2) {
+		norm_star = mpi_lnorm(residual, grid.size, grid.mpi_com.comm);
+	}
+	else
+	{
+		norm_star = mpi_cnorm(residual, grid.size, grid.mpi_com.comm);
+	}
+
+	if ((pois_opt.miniters <= 0) && (norm_star < pois_opt.abstol)) {
+		(*resnorm) = norm_star;
+		return 0;
+	}
+
+	memcpy(residual_star, residual, grid.size * sizeof(T));
+	memcpy(p, residual, grid.size * sizeof(T));
+
+	// rho = (r, r*) = (r*, r*) since r* == r here
+	rho = mpi_sqr_sum(residual_star,
+		grid.size, grid.mpi_com.comm);
+
+	// NOTE(review): a zero initial residual (norm_star == 0) with
+	// miniters > 0 reaches this point and is reported as breakdown (-1);
+	// the MG(MPI) variant below returns 0 instead -- confirm intended.
+	if (rho == (T)0) {
+		(*resnorm) = norm_star;
+		return -1;
+	}
+
+	for (int k = 1; k <= pois_opt.maxiters; k++)
+	{
+		// preconditioner: q = M^{-1} p (multigrid V-cycle with SOR-RB smoother)
+		poisson3d::mg_sor_redblack(q, p, pois_opt.piters,
+			mg_data, grid.mpi_com);
+
+#ifndef _POIS3D_LAPLACE_WITH_DP
+		poisson3d::laplace(v, q,
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+
+		gamma = mpi_dot_product(residual_star, v,
+			grid.size, grid.mpi_com.comm);
+#else
+		// fused: v = A*q and gamma = (r*, v)
+		gamma = poisson3d::laplace_dp(v, q, residual_star,
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+#endif
+		alpha = rho / gamma;
+
+#ifndef _POIS3D_COMBINE_UPDATES
+		update(x, alpha, q, grid.size);
+		update(residual, -alpha, v, grid.size);
+#else
+		update(x, residual, alpha, -alpha, q, v, grid.size);
+#endif
+
+		// - additional skirmish norm check //
+#ifdef _POIS3D_SKIRMISH_NORM
+
+		if (pois_opt.norm_mode == isNormL2) {
+			norm_current = mpi_lnorm(residual, grid.size, grid.mpi_com.comm);
+		}
+		else
+		{
+			norm_current = mpi_cnorm(residual, grid.size, grid.mpi_com.comm);
+		}
+
+		if ((k >= pois_opt.miniters) && (norm_current < pois_opt.abstol)) {
+			(*resnorm) = norm_current;
+			return k;
+		}
+#endif
+		// --------------------------------- //
+
+		// stabilization half-step: q = M^{-1} r, w = A*q
+		poisson3d::mg_sor_redblack(q, residual, pois_opt.piters,
+			mg_data, grid.mpi_com);
+
+		poisson3d::laplace(w, q,
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+
+		// (w, w) and (w, residual)
+		mpi_sqr_sum_and_dp(w, residual, grid.size, grid.mpi_com.comm,
+			&delta, &epsilon);
+
+		gamma = epsilon / delta;
+
+#ifndef _POIS3D_COMBINE_UPDATES
+		update(x, gamma, q, grid.size);
+		update(residual, -gamma, w, grid.size);
+#else
+		update(x, residual, gamma, -gamma, q, w, grid.size);
+#endif
+
+		if (gamma == (T)0) {
+			(*resnorm) = norm_current;
+			return -k - 1;
+		}
+
+		rho_star = rho;
+
+		// fused reduction: residual norm and rho = (r, r*)
+		if (pois_opt.norm_mode == isNormL2) {
+			mpi_lnorm_and_dp(residual, residual_star, grid.size, grid.mpi_com.comm,
+				&norm_current, &rho);
+		}
+		else
+		{
+			mpi_cnorm_and_dp(residual, residual_star, grid.size, grid.mpi_com.comm,
+				&norm_current, &rho);
+		}
+
+		if ((k >= pois_opt.miniters) &&
+			(
+			(norm_current < pois_opt.retol * norm_star) ||
+				(norm_current < pois_opt.abstol)))
+		{
+
+			(*resnorm) = norm_current;
+			return k;
+		}
+
+		if (rho == (T)0) {
+			(*resnorm) = norm_current;
+			return -k - 1;
+		}
+
+		beta = (rho / rho_star) * (alpha / gamma);
+		assign(p, (T) 1.0, residual, beta, p, -gamma * beta, v, grid.size);
+	}
+
+	(*resnorm) = norm_current;
+	return -pois_opt.maxiters;
+}
+
+// * BiCGstab MG(MPI) SOR (Red-Black) * //
+// x: initial guess / solution; rhs: right-hand side;
+// memory[0..5]: residual, residual*, p, q, v, w work vectors (size N).
+// Returns: iteration count (>= 0) on convergence, negative on breakdown
+// (rho == 0 or gamma == 0) or when maxiters is exceeded.
+template< typename T >
+int nse::bicg_mg_sor_redblack(
+	T* x, const T* const rhs, T** memory,
+	const poisOpt3d< T >& pois_opt,
+	const wstGrid3d< T >& grid,
+	mg_mpi_poisson3d_data< T >& mg_data,
+	T* resnorm)
+{
+	// optionally exclude the -x ghost layers from vector reductions/updates;
+	// mem_ptr is rounded down to the allocation alignment boundary
+#ifdef _POIS3D_EXCLUDE_XGHOST_VEC
+#ifdef ALIGN_ALLOCATION
+	const int mem_ptr = (grid.gcx * grid.nyz) -
+		((grid.gcx * grid.nyz) % (ALIGN_ALLOCATION / sizeof(T)));
+#else
+	const int mem_ptr = grid.gcx * grid.nyz;
+#endif
+#else
+	const int mem_ptr = 0;
+#endif
+	const int mem_size = grid.size - 2 * mem_ptr;
+
+	poisson_dynamic_bc bc;
+	bc.init(pois_opt.bc_type,
+		grid.mpi_com.rank_x, grid.mpi_com.rank_y, grid.mpi_com.rank_z,
+		grid.mpi_com.size_x, grid.mpi_com.size_y, grid.mpi_com.size_z);
+
+	T alpha, beta, gamma, delta, epsilon, rho, rho_star;
+	T norm_star, norm_current = (T) 0.0;
+
+	T *residual = memory[0],
+		*residual_star = memory[1],
+		*p = memory[2],
+		*q = memory[3],
+		*v = memory[4],
+		*w = memory[5];
+
+
+	// initial residual: r = rhs (null guess) or r = rhs - A*x
+	if (pois_opt.init_mode == isInitNull)
+		memcpy(&residual[mem_ptr], &rhs[mem_ptr], mem_size * sizeof(T));
+	else
+		if (pois_opt.init_mode == isInitPreset)
+		{
+			poisson3d::resvec(residual, x, rhs, grid.nx, grid.ny, grid.nz,
+				grid.gcx, grid.nx - grid.gcx - 1,
+				grid.gcy, grid.ny - grid.gcy - 1,
+				grid.gcz, grid.nz - grid.gcz - 1,
+
+				grid.dx2i, grid.dy2i,
+				grid.dzp2i, grid.dzm2i);
+		}
+		else
+		{
+			poisson3d::laplace_residual(residual, x, rhs,
+				grid.nx, grid.ny, grid.nz,
+				grid.gcx, grid.gcy, grid.gcz,
+				grid.dx2i, grid.dy2i,
+				grid.dzp2i, grid.dzm2i,
+
+				grid.mpi_com, bc);
+		}
+
+	// fused reduction: initial norm and rho = (r, r)
+	if (pois_opt.norm_mode == isNormL2) {
+		mpi_lnorm_and_sqr_sum(&residual[mem_ptr], mem_size, grid.mpi_com.comm,
+			&norm_star, &rho);
+	}
+	else
+	{
+		mpi_cnorm_and_sqr_sum(&residual[mem_ptr], mem_size, grid.mpi_com.comm,
+			&norm_star, &rho);
+	}
+
+	// exactly zero residual counts as convergence regardless of miniters
+	if ((norm_star == (T)0) || ((pois_opt.miniters <= 0) && (norm_star < pois_opt.abstol))) {
+		(*resnorm) = norm_star;
+		return 0;
+	}
+
+	memcpy(&residual_star[mem_ptr], &residual[mem_ptr], mem_size * sizeof(T));
+
+	if (rho == (T)0) {
+		(*resnorm) = norm_star;
+		return -1;
+	}
+
+	// q = M^{-1} r* (first search direction; p is formed at the end of step 1)
+	poisson3d::mg_sor_redblack(q, residual_star, pois_opt.piters,
+		mg_data);
+
+	for (int k = 1; k <= pois_opt.maxiters; k++)
+	{
+
+#ifndef _POIS3D_LAPLACE_WITH_DP
+		poisson3d::laplace(v, q,
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+
+		gamma = mpi_dot_product(&residual_star[mem_ptr], &v[mem_ptr],
+			mem_size, grid.mpi_com.comm);
+#else
+		// fused: v = A*q and gamma = (r*, v)
+		gamma = poisson3d::laplace_dp(v, q, residual_star,
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+#endif	
+
+		alpha = rho / gamma;
+
+#ifndef _POIS3D_COMBINE_UPDATES
+		update(&x[mem_ptr], alpha, &q[mem_ptr], mem_size);
+		update(&residual[mem_ptr], -alpha, &v[mem_ptr], mem_size);
+#else
+		update(&x[mem_ptr], &residual[mem_ptr], alpha, -alpha,
+			&q[mem_ptr], &v[mem_ptr], mem_size);
+#endif
+
+		// - additional skirmish norm check //
+#ifdef _POIS3D_SKIRMISH_NORM
+
+		if (pois_opt.norm_mode == isNormL2) {
+			norm_current = mpi_lnorm(&residual[mem_ptr],
+				mem_size, grid.mpi_com.comm);
+		}
+		else
+		{
+			norm_current = mpi_cnorm(&residual[mem_ptr],
+				mem_size, grid.mpi_com.comm);
+		}
+
+		if ((k >= pois_opt.miniters) && (norm_current < pois_opt.abstol)) {
+			(*resnorm) = norm_current;
+			return k;
+		}
+#endif
+		// --------------------------------- //
+
+		// stabilization half-step: q = M^{-1} r, w = A*q
+		poisson3d::mg_sor_redblack(q, residual, pois_opt.piters,
+			mg_data);
+
+		poisson3d::laplace(w, q,
+			grid.nx, grid.ny, grid.nz,
+			grid.gcx, grid.gcy, grid.gcz,
+			grid.dx2i, grid.dy2i,
+			grid.dzp2i, grid.dzm2i,
+
+			grid.mpi_com, bc);
+
+		// (w, w) and (w, residual)
+		mpi_sqr_sum_and_dp(&w[mem_ptr], &residual[mem_ptr],
+			mem_size, grid.mpi_com.comm,
+			&delta, &epsilon);
+
+		gamma = epsilon / delta;
+
+#ifndef _POIS3D_COMBINE_UPDATES
+		update(&x[mem_ptr], gamma, &q[mem_ptr], mem_size);
+		update(&residual[mem_ptr], -gamma, &w[mem_ptr], mem_size);
+#else
+		update(&x[mem_ptr], &residual[mem_ptr], gamma, -gamma,
+			&q[mem_ptr], &w[mem_ptr], mem_size);
+#endif
+
+		if (gamma == (T)0) {
+			(*resnorm) = norm_current;
+			return -k - 1;
+		}
+
+		rho_star = rho;
+
+		// fused reduction: residual norm and rho = (r, r*)
+		if (pois_opt.norm_mode == isNormL2) {
+			mpi_lnorm_and_dp(&residual[mem_ptr], &residual_star[mem_ptr],
+				mem_size, grid.mpi_com.comm,
+				&norm_current, &rho);
+		}
+		else
+		{
+			mpi_cnorm_and_dp(&residual[mem_ptr], &residual_star[mem_ptr],
+				mem_size, grid.mpi_com.comm,
+				&norm_current, &rho);
+		}
+
+		if ((k >= pois_opt.miniters) &&
+			(
+			(norm_current < pois_opt.retol * norm_star) ||
+				(norm_current < pois_opt.abstol)
+				))
+		{
+			(*resnorm) = norm_current;
+			return k;
+		}
+
+		if (rho == (T)0) {
+			(*resnorm) = norm_current;
+			return -k - 1;
+		}
+
+		beta = (rho / rho_star) * (alpha / gamma);
+
+		// at k == 1, p has not been formed yet: seed it from r* instead
+		if (k == 1)
+			assign(&p[mem_ptr], (T) 1.0, &residual[mem_ptr], beta, &residual_star[mem_ptr],
+				-gamma * beta, &v[mem_ptr], mem_size);
+		else
+			assign(&p[mem_ptr], (T) 1.0, &residual[mem_ptr], beta, &p[mem_ptr],
+				-gamma * beta, &v[mem_ptr], mem_size);
+
+		poisson3d::mg_sor_redblack(q, p, pois_opt.piters,
+			mg_data);
+	}
+
+	(*resnorm) = norm_current;
+	return -pois_opt.maxiters;
+}
+// ------------------------------------------------------------------------ //
+
+
+// ------------------------------------------------------------------------ //
+// ------------------------------------------------------------------------ //
+
+
+// Initialization //
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiations: BiCGstab SOR Red-Black [float, double] * //
+template int nse::bicg_sor_redblack(
+	float* x, const float* const rhs, float** memory,
+	const poisOpt3d< float >& pois_opt,
+	const float omega,
+	const wstGrid3d< float >& grid,
+	float* resnorm);
+
+template int nse::bicg_sor_redblack(
+	double* x, const double* const rhs, double** memory,
+	const poisOpt3d< double >& pois_opt,
+	const double omega,
+	const wstGrid3d< double >& grid,
+	double* resnorm);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiations: BiCGstab Multigrid SOR Red-Black [float, double] * //
+template int nse::bicg_mg_sor_redblack(
+	float* x, const float* const rhs, float** memory,
+	const poisOpt3d< float >& pois_opt,
+	const wstGrid3d< float >& grid,
+	mg_poisson3d_data< float >& mg_data,
+	float* resnorm);
+
+template int nse::bicg_mg_sor_redblack(
+	double* x, const double* const rhs, double** memory,
+	const poisOpt3d< double >& pois_opt,
+	const wstGrid3d< double >& grid,
+	mg_poisson3d_data< double >& mg_data,
+	double* resnorm);
+// ------------------------------------------------------------------------ //
+
+// * explicit instantiations: BiCGstab Multigrid(MPI) SOR Red-Black [float, double] * //
+template int nse::bicg_mg_sor_redblack(
+	float* x, const float* const rhs, float** memory,
+	const poisOpt3d< float >& pois_opt,
+	const wstGrid3d< float >& grid,
+	mg_mpi_poisson3d_data< float >& mg_data,
+	float* resnorm);
+
+template int nse::bicg_mg_sor_redblack(
+	double* x, const double* const rhs, double** memory,
+	const poisOpt3d< double >& pois_opt,
+	const wstGrid3d< double >& grid,
+	mg_mpi_poisson3d_data< double >& mg_data,
+	double* resnorm);
+// ------------------------------------------------------------------------ //
diff --git a/pois3d.h b/pois3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b791c6503ae828a942daad0b5d62ac4e39708dd
--- /dev/null
+++ b/pois3d.h
@@ -0,0 +1,45 @@
+#pragma once
+
+// [pois3d.h]: 3D Poisson[X2] Krylov solvers 
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "wstgrid3d.h"
+#include "pois-def3d.h"
+#include "mg-data3d.h"
+
+
+namespace nse
+{
+	// -------------------------------------------------------------------- //
+	// N - grid size
+	//
+	// common arguments:
+	//	x      : solution vector (in: initial guess, out: result)
+	//	rhs    : right-hand side
+	//	memory : caller-preallocated workspace (size per solver note below)
+	//	resnorm: out, residual norm on exit
+	//	return : presumably the number of iterations performed -- confirm
+	//	         against the solver implementation
+
+	// * BiCGstab-SOR (Red-Black) memory req.: [6 * N + NZ] * //
+	template< typename T >
+	int bicg_sor_redblack(T* x, const T* const rhs, T** memory,
+		const poisOpt3d< T >& pois_opt,
+		const T omega,
+		const wstGrid3d< T >& grid,
+		T* resnorm);
+	// -------------------------------------------------------------------- //
+
+
+	// * BiCGstab-Multigrid GS-SOR (Red-Black) memory req.: [ 6 * N + MG() ] * //
+	template< typename T >
+	int bicg_mg_sor_redblack(T* x, const T* const rhs, T** memory,
+		const poisOpt3d< T >& pois_opt,
+		const wstGrid3d< T >& grid,
+		mg_poisson3d_data< T >& mg_data,
+		T* resnorm);
+	// -------------------------------------------------------------------- //
+
+
+	// * BiCGstab-Multigrid(MPI) GS-SOR (Red-Black) memory req.: [ 6 * N + MG() ] * // 
+	template< typename T >
+	int bicg_mg_sor_redblack(T* x, const T* const rhs, T** memory,
+		const poisOpt3d< T >& pois_opt,
+		const wstGrid3d< T >& grid,
+		mg_mpi_poisson3d_data< T >& mg_data,
+		T* resnorm);
+	// -------------------------------------------------------------------- //
+}
diff --git a/ptcl-track-vec3d.cpp b/ptcl-track-vec3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fe83632bcc2e93f17a4d5ec0b826b3a419deb60c
--- /dev/null
+++ b/ptcl-track-vec3d.cpp
@@ -0,0 +1,621 @@
+#define _CRT_SECURE_NO_WARNINGS
+#include "ptcl-track-vec3d.h"
+
+#include "nse-alloc.h"
+#include "vecmath.h"
+#include "mtrand.h"
+#include "str-com.h"
+
+#include <stdio.h>
+
+
+// Implementation
+// ----------------------------------------------------------------------------
+template< typename T >
+nse::ptclTrackVec3d< T >::ptclTrackVec3d()
+	: ptclVec3d< T >()
+{
+	// FIX: keep the tag pointer null until the first resize() allocates it;
+	// previously it was left uninitialized (the mem_size guard in free()
+	// made it "work", but a dangling pointer is fragile)
+	tag = NULL;
+	current_tag = 0;
+}
+
+// copy constructor: base class copies particle arrays, this copies tags
+template< typename T >
+nse::ptclTrackVec3d< T >::ptclTrackVec3d(
+	const ptclTrackVec3d< T >& pvec)
+	: ptclVec3d< T >(pvec)
+{
+	current_tag = pvec.current_tag;
+
+	// FIX: null-init so tag is never a dangling pointer when the source
+	// vector holds no memory (mem_size == 0)
+	tag = NULL;
+	if (mem_size > 0) allocate_vnull(&tag, mem_size);
+	if (n > 0) mcopy(tag, pvec.tag, n);
+}
+
+// destructor: releases tag storage and base-class particle arrays via free()
+template< typename T >
+nse::ptclTrackVec3d< T >::~ptclTrackVec3d()
+{
+	free();
+}
+// ----------------------------------------------------------------------------
+
+
+// add-init particle subroutines
+// ----------------------------------------------------------------------------
+// add a particle with an explicitly prescribed velocity (uval,vval,wval);
+// the particle is stored only on the MPI rank whose local grid contains it,
+// but ALL ranks must call this (collective tag-counter sync at the end)
+template< typename T >
+void nse::ptclTrackVec3d< T >::add(
+	const T xpos, const T ypos, const T zpos,
+	const T uval, const T vval, const T wval, const Grid3d< T >& grid)
+{
+	// adding particle to local MPI process only:
+	// locate_*() returns -1 when the coordinate is outside the local domain
+	int ipos = grid.locate_x(xpos),
+		jpos = grid.locate_y(ypos),
+		kpos = grid.locate_z(zpos);
+
+	if ((ipos != -1) && (jpos != -1) && (kpos != -1))
+	{
+		resize(n + 1);	// adding memory if necessary
+
+		x[n] = xpos;
+		y[n] = ypos;
+		z[n] = zpos;
+
+		up[n] = uval;
+		vp[n] = vval;
+		wp[n] = wval;
+
+		ic[n] = ipos;
+		jc[n] = jpos;
+		kc[n] = kpos;
+
+		// assign a unique tracking id to the new particle
+		tag[n] = current_tag;
+		current_tag++;
+
+		n++;
+	}
+
+	// only the owning rank incremented current_tag; MPI_MAX makes every
+	// rank agree on the next id to assign
+	mpi_allreduce(&current_tag, MPI_MAX, grid.mpi_com.comm);
+}
+
+// add a particle whose initial velocity is interpolated from the staggered
+// velocity fields (U,V,W) at its position; collective -- all ranks must call
+template< typename T >
+void nse::ptclTrackVec3d< T >::add(
+	const T xpos, const T ypos, const T zpos,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const Grid3d< T >& grid)
+{
+	// adding particle to local MPI process only:
+	// locate_*() returns -1 when the coordinate is outside the local domain
+	int ipos = grid.locate_x(xpos),
+		jpos = grid.locate_y(ypos),
+		kpos = grid.locate_z(zpos);
+
+	if ((ipos != -1) && (jpos != -1) && (kpos != -1))
+	{
+		resize(n + 1);	// adding memory if necessary
+
+		x[n] = xpos;
+		y[n] = ypos;
+		z[n] = zpos;
+
+		// initial particle velocity = local fluid velocity at (x,y,z)
+		up[n] = grid.u_interp_local(U, xpos, ypos, zpos, ipos, jpos, kpos);
+		vp[n] = grid.v_interp_local(V, xpos, ypos, zpos, ipos, jpos, kpos);
+		wp[n] = grid.w_interp_local(W, xpos, ypos, zpos, ipos, jpos, kpos);
+
+		ic[n] = ipos;
+		jc[n] = jpos;
+		kc[n] = kpos;
+
+		// assign a unique tracking id to the new particle
+		tag[n] = current_tag;
+		current_tag++;
+
+		n++;
+	}
+
+	// keep the next-id counter consistent across ranks (MPI_MAX sync)
+	mpi_allreduce(&current_tag, MPI_MAX, grid.mpi_com.comm);
+}
+// ----------------------------------------------------------------------------
+
+// clear-free subroutines
+// ----------------------------------------------------------------------------
+// remove all particles (memory is kept); also restarts tag numbering
+template< typename T >
+void nse::ptclTrackVec3d< T >::clear()
+{
+	current_tag = 0;
+	ptclVec3d< T >::clear();
+}
+
+// remove all particles and release memory
+// NOTE: tag must be released BEFORE the base free() -- the base call resets
+// mem_size, which guards the deallocation here
+template< typename T >
+void nse::ptclTrackVec3d< T >::free()
+{
+	if (mem_size > 0) deallocate(tag);
+	current_tag = 0;
+
+	ptclVec3d< T >::free();
+}
+// ----------------------------------------------------------------------------
+
+// PRIVATE:
+// ----------------------------------------------------------------------------
+
+// resize memory for requested number of elements
+// ----------------------------------------------------------------------------
+// grow tag storage to hold at least m elements (existing tags preserved);
+// the other per-particle arrays are grown by the base-class resize
+template< typename T >
+void nse::ptclTrackVec3d< T >::resize(const int m)
+{
+	if (m > mem_size)
+	{
+		int *tag_mem;
+
+		// over-allocate to amortize repeated single-particle additions
+		const int nalloc = max(m, mem_size + mem_alloc);
+
+		allocate_vnull(&tag_mem, nalloc);
+
+		if (n > 0) mcopy(tag_mem, tag, n);
+		if (mem_size > 0) deallocate(tag);
+
+		tag = tag_mem;
+	}
+
+	// base resize updates mem_size, so it must run AFTER the tag copy above
+	ptclVec3d< T >::resize(m);
+}
+// ----------------------------------------------------------------------------
+
+// swap i,j particles pair
+// ----------------------------------------------------------------------------
+// exchange particles i and j: base-class state first, then the tracking tags
+template< typename T >
+void nse::ptclTrackVec3d< T >::swap(const int i, const int j)
+{
+	ptclVec3d< T >::swap(i, j);
+
+	const int tmp = tag[j];
+	tag[j] = tag[i];
+	tag[i] = tmp;
+}
+// ----------------------------------------------------------------------------
+
+// add(get) particle data to(from) linear memory
+// ----------------------------------------------------------------------------
+// append nbuf particle records unpacked from linear buffers:
+// buf holds 6 reals per record (x,y,z,up,vp,wp), ibuf holds one tag each
+template< typename T >
+void nse::ptclTrackVec3d< T >::add_local(
+	const T* _RESTRICT const buf, const int* _RESTRICT const ibuf, const int nbuf)
+{
+	resize(n + nbuf);	// ensure capacity for all incoming records
+
+	for (int i = 0; i < nbuf; i++)
+	{
+		const T* rec = &buf[6 * i];
+
+		x[n] = rec[0];
+		y[n] = rec[1];
+		z[n] = rec[2];
+
+		up[n] = rec[3];
+		vp[n] = rec[4];
+		wp[n] = rec[5];
+
+		tag[n] = ibuf[i];
+
+		n++;
+	}
+}
+
+// pack np particle records starting at index idx into linear buffers:
+// 6 reals per record (x,y,z,up,vp,wp) into buf, one tag each into ibuf
+template< typename T >
+void nse::ptclTrackVec3d< T >::get_local(
+	T* _RESTRICT buf, int* _RESTRICT ibuf, const int idx, const int np) const
+{
+	for (int i = 0; i < np; i++)
+	{
+		const int p = idx + i;
+		T* rec = &buf[6 * i];
+
+		rec[0] = x[p];
+		rec[1] = y[p];
+		rec[2] = z[p];
+
+		rec[3] = up[p];
+		rec[4] = vp[p];
+		rec[5] = wp[p];
+
+		ibuf[i] = tag[p];
+	}
+}
+// ----------------------------------------------------------------------------
+
+
+// mpi-exchanges
+// ----------------------------------------------------------------------------
+// exchange particles that left the local domain with the two neighbouring
+// ranks along one decomposition line; the nexch leaving particles are
+// assumed compacted at the tail [n - nexch, n) with their direction in
+// exch_imem[i]: -1 = send to the "left" neighbour, +1 = to the "right";
+// returns the number of particles received from both neighbours
+template< typename T >
+int nse::ptclTrackVec3d< T >::mpi_exchange_line(
+	const int* exch_imem, const int nexch,
+	const int rank, const int size, const MPI_Comm comm, const int period)
+{
+	T *sbuf_real[2], *rbuf_real[2];			// send-recv buffers
+	int *sbuf_int[2], *rbuf_int[2];
+
+	int sbuf_real_id[2], rbuf_real_id[2];	// buffers id's for memory handling
+	int sbuf_int_id[2], rbuf_int_id[2];
+
+	int nsend[2], nrecv[2];		// number of send-recv elements
+	// request layout: [0,1] count recvs, [2,3] count sends,
+	//                 [4..7] payload sends, [8..11] payload recvs
+	MPI_Request mpi_req[12];
+
+
+	if (size == 1) {	// handling degenerate case
+		if (!period) {
+			n = n - nexch;	// simply removing particles
+			return 0;
+		}
+		// NOTE(review): with period && size == 1 the leaving particles stay
+		// in place and are reported as "received" -- confirm the caller
+		// wraps their coordinates afterwards
+		return nexch;
+	}
+
+
+	const bool is_exch_left = (size > 1) &&
+		((period) || (rank > 0));				// "left" exchange flag
+	const bool is_exch_right = (size > 1) &&
+		((period) || (rank < size - 1));		// "right" exchange flag
+
+	// neighbour ranks with periodic wrap-around
+	const int pidx_left = (rank > 0) ? rank - 1 : size - 1;
+	const int pidx_right = (rank < size - 1) ? rank + 1 : 0;
+
+	for (int i = 0; i < 12; i++)
+		mpi_req[i] = MPI_REQUEST_NULL;
+
+	sbuf_real[0] = NULL; sbuf_real[1] = NULL;
+	rbuf_real[0] = NULL; rbuf_real[1] = NULL;
+
+	sbuf_int[0] = NULL; sbuf_int[1] = NULL;
+	rbuf_int[0] = NULL; rbuf_int[1] = NULL;
+
+	nrecv[0] = 0; nrecv[1] = 0;
+	nsend[0] = 0; nsend[1] = 0;
+
+
+	// stage 1: exchange element counts and pack the send buffers
+	if (is_exch_left)
+	{
+		MPI_Irecv(&nrecv[0], 1, MPI_INT, pidx_left, 0, comm, &mpi_req[0]);
+
+		for (int i = n - nexch; i < n; i++)
+			if (exch_imem[i] == -1) nsend[0]++;
+
+		if (nsend[0] > 0) {
+			// 6 reals (position + velocity) + 1 int (tag) per particle
+			sbuf_real_id[0] = memStx::get_buf(&sbuf_real[0], 6 * nsend[0]);
+			sbuf_int_id[0] = memStx::get_buf(&sbuf_int[0], nsend[0]);
+		}
+
+		MPI_Isend(&nsend[0], 1, MPI_INT, pidx_left, 0, comm, &mpi_req[2]);
+
+		int idx = 0;
+		for (int i = n - nexch; i < n; i++) {
+			if (exch_imem[i] == -1) {
+				get_local(&sbuf_real[0][6 * idx], &sbuf_int[0][idx], i, 1);
+				idx++;
+			}
+		}
+	}
+	if (is_exch_right)
+	{
+		MPI_Irecv(&nrecv[1], 1, MPI_INT, pidx_right, 0, comm, &mpi_req[1]);
+
+		for (int i = n - nexch; i < n; i++)
+			if (exch_imem[i] == 1) nsend[1]++;
+
+		if (nsend[1] > 0) {
+			sbuf_real_id[1] = memStx::get_buf(&sbuf_real[1], 6 * nsend[1]);
+			sbuf_int_id[1] = memStx::get_buf(&sbuf_int[1], nsend[1]);
+		}
+
+		MPI_Isend(&nsend[1], 1, MPI_INT, pidx_right, 0, comm, &mpi_req[3]);
+
+		int idx = 0;
+		for (int i = n - nexch; i < n; i++) {
+			if (exch_imem[i] == 1) {
+				get_local(&sbuf_real[1][6 * idx], &sbuf_int[1][idx], i, 1);
+				idx++;
+			}
+		}
+	}
+
+	// counts must be known before sizing and posting the payload receives
+	MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);
+
+	// stage 2: payload exchange (reals on message tag 1, ids on tag 2)
+	if (nsend[0] > 0) {
+		MPI_Isend(sbuf_real[0], 6 * nsend[0], mpi_type< T >(), pidx_left, 1, comm, &mpi_req[4]);
+		MPI_Isend(sbuf_int[0], nsend[0], mpi_type< int >(), pidx_left, 2, comm, &mpi_req[5]);
+	}
+
+	if (nrecv[0] > 0) {
+		rbuf_real_id[0] = memStx::get_buf(&rbuf_real[0], 6 * nrecv[0]);
+		rbuf_int_id[0] = memStx::get_buf(&rbuf_int[0], nrecv[0]);
+
+		MPI_Irecv(rbuf_real[0], 6 * nrecv[0], mpi_type< T >(), pidx_left, 1, comm, &mpi_req[8]);
+		MPI_Irecv(rbuf_int[0], nrecv[0], mpi_type< int >(), pidx_left, 2, comm, &mpi_req[9]);
+	}
+
+	if (nsend[1] > 0) {
+		MPI_Isend(sbuf_real[1], 6 * nsend[1], mpi_type< T >(), pidx_right, 1, comm, &mpi_req[6]);
+		MPI_Isend(sbuf_int[1], nsend[1], mpi_type< int >(), pidx_right, 2, comm, &mpi_req[7]);
+	}
+
+	if (nrecv[1] > 0) {
+		rbuf_real_id[1] = memStx::get_buf(&rbuf_real[1], 6 * nrecv[1]);
+		rbuf_int_id[1] = memStx::get_buf(&rbuf_int[1], nrecv[1]);
+
+		MPI_Irecv(rbuf_real[1], 6 * nrecv[1], mpi_type< T >(), pidx_right, 1, comm, &mpi_req[10]);
+		MPI_Irecv(rbuf_int[1], nrecv[1], mpi_type< int >(), pidx_right, 2, comm, &mpi_req[11]);
+	}
+
+	MPI_Waitall(8, &mpi_req[4], MPI_STATUSES_IGNORE);
+
+	// drop the nexch leaving particles (tail of the arrays), then append
+	// everything received from the neighbours
+	n = n - nexch;
+	if (nrecv[0] > 0) add_local(rbuf_real[0], rbuf_int[0], nrecv[0]);
+	if (nrecv[1] > 0) add_local(rbuf_real[1], rbuf_int[1], nrecv[1]);
+
+	// return the pooled buffers to the memory stack
+	if (nrecv[0] > 0) {
+		memStx::free_buf(rbuf_real_id[0]);
+		memStx::free_buf(rbuf_int_id[0]);
+	}
+	if (nrecv[1] > 0) {
+		memStx::free_buf(rbuf_real_id[1]);
+		memStx::free_buf(rbuf_int_id[1]);
+	}
+	if (nsend[0] > 0) {
+		memStx::free_buf(sbuf_real_id[0]);
+		memStx::free_buf(sbuf_int_id[0]);
+	}
+	if (nsend[1] > 0) {
+		memStx::free_buf(sbuf_real_id[1]);
+		memStx::free_buf(sbuf_int_id[1]);
+	}
+
+	return nrecv[0] + nrecv[1];
+}
+// ----------------------------------------------------------------------------
+
+// PUBLIC (I/O subroutines):
+// ----------------------------------------------------------------------------
+
+// write binary
+// ----------------------------------------------------------------------------
+// write particle data to a binary file: base-class data first, then the
+// tracking appendix (current tag counter followed by all particle tags);
+// collective -- all ranks must call; returns true on success
+template< typename T >
+bool nse::ptclTrackVec3d< T >::write_binary(
+	const std::string& filename,
+	const Grid3d< T >& grid, const T time) const
+{
+#ifdef _NSE_MPI_IO	// forcing MPI-IO...
+	bool mpi_io_status = mpi_write_binary(filename, _NSE_MPI_IO_DATAREP_DEFAULT, grid, time);
+#ifndef _NSE_MPI_IO_RETRY_SEQ
+	return mpi_io_status;
+#else
+	if (mpi_io_status) return true;	// fall through to sequential retry
+#endif
+#endif
+
+#if (!defined(_NSE_MPI_IO) || \
+	(defined(_NSE_MPI_IO) && defined(_NSE_MPI_IO_RETRY_SEQ)))
+
+	bool base_status = ptclVec3d< T >::write_binary(filename, grid, time);
+	if (!base_status) return false;
+
+	//
+	// adding tracking data to file ...
+	//
+
+	const int host = 0;
+	int nall = mpi_get_num(grid);	// global particle count
+
+	// FIX: null-init -- buf was passed to mpi_gather_vec() uninitialized on
+	// non-host ranks (and on host when nall == 0)
+	int *buf = NULL;
+	if (grid.mpi_com.rank == host)
+		if (nall > 0) allocate_vnull(&buf, nall);
+
+	// gather all tags onto the host rank
+	mpi_gather_vec(tag, n, buf, host);
+
+	int status = 0;
+	if (grid.mpi_com.rank == host)
+	{
+		// append to the file already written by the base class
+		FILE* ptr = fopen(filename.c_str(), "ab");
+		if (ptr != NULL)
+		{
+			// writing current tag value
+			fwrite(&current_tag, sizeof(int), 1, ptr);
+
+			if (nall > 0)
+				fwrite(buf, sizeof(int), nall, ptr);
+
+			fclose(ptr);
+			status = 1;
+		}
+
+		if (nall > 0) deallocate(buf);
+	}
+
+	// propagate the host's I/O status to every rank
+	mpi_broadcast(&status, 1, host, grid.mpi_com.comm);
+	return (status == 1);
+#endif
+}
+
+// indexed variant: appends idx to the file name, then delegates
+template< typename T >
+bool nse::ptclTrackVec3d< T >::write_binary(
+	const std::string& filename, const int idx,
+	const Grid3d< T >& grid, const T time) const
+{
+	const std::string indexed_name = append_index(filename, idx);
+	return write_binary(indexed_name, grid, time);
+}
+// ----------------------------------------------------------------------------
+
+
+// MPI write binary
+// ----------------------------------------------------------------------------
+// MPI-IO variant of write_binary: base-class data first, then the tracking
+// appendix (current tag counter written by the host, followed by all ranks
+// writing their tags collectively at rank-ordered displacements)
+template< typename T >
+bool nse::ptclTrackVec3d< T >::mpi_write_binary(
+	const std::string& filename,
+	const char* mpi_datarep, const Grid3d< T >& grid, const T time) const
+{
+	bool base_status = ptclVec3d< T >::mpi_write_binary(filename, mpi_datarep, grid, time);
+	if (!base_status) return false;
+
+	//
+	// adding tracking data to file ...
+	//
+	const int host = 0;
+
+	MPI_File ptr;
+	int status = MPI_File_open(grid.mpi_com.comm, (char*)filename.c_str(),
+		MPI_MODE_WRONLY, MPI_INFO_NULL, &ptr);
+	if (status != MPI_SUCCESS) return false;	// MPI file open failure
+
+	MPI_Offset base_size;	// getting base file size
+	status = MPI_File_get_size(ptr, &base_size);
+	if (status != MPI_SUCCESS) {
+		MPI_File_close(&ptr);
+		return false;
+	}
+
+	int nstatus = 0;
+	// writing current tag value (host only, appended at end of file)
+	if (grid.mpi_com.rank == host) {
+		MPI_File_seek(ptr, 0, MPI_SEEK_END);
+
+		int ctag = current_tag;
+		status = MPI_File_write(ptr, &ctag, 1, mpi_type< int >(), MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += 1;
+	}
+	MPI_File_sync(ptr);
+	// all ranks adopt the host's success flag (0 or 1)
+	mpi_broadcast(&nstatus, 1, host, grid.mpi_com.comm);
+
+	// main data //
+	// exclusive prefix sum (incdisp - n) of particle counts gives each
+	// rank's slot in the tag array, placed after the counter just written
+	int np = n, incdisp;
+	MPI_Scan(&np, &incdisp, 1, MPI_INT, MPI_SUM, grid.mpi_com.comm);
+	MPI_Offset disp = base_size + sizeof(int) + (incdisp - n) * sizeof(int);
+
+	MPI_File_set_view(ptr, disp, mpi_type< int >(), mpi_type< int >(),
+		(char*)mpi_datarep, MPI_INFO_NULL);
+
+	status = MPI_File_write_all(ptr, (void*)tag, n,
+		mpi_type< int >(), MPI_STATUS_IGNORE);
+	if (status == MPI_SUCCESS) nstatus += n;
+
+	MPI_File_close(&ptr);
+	// success = counter write acknowledged (broadcast 1) + n local tags written
+	return (nstatus == n + 1);
+}
+
+// indexed variant: appends idx to the file name, then delegates
+template< typename T >
+bool nse::ptclTrackVec3d< T >::mpi_write_binary(
+	const std::string& filename, const int idx,
+	const char* mpi_datarep, const Grid3d< T >& grid, const T time) const
+{
+	const std::string indexed_name = append_index(filename, idx);
+	return mpi_write_binary(indexed_name, mpi_datarep, grid, time);
+}
+// ----------------------------------------------------------------------------
+
+// read binary
+// ----------------------------------------------------------------------------
+// read particle data written by write_binary; expected file layout:
+//	header[4] = { 'p'+'n'+'s'+'e', dim = 3, np, sizeof(T) },
+//	time mark (T),
+//	np records of 6 reals (position + velocity),
+//	current tag counter (int),
+//	np tags (int)
+// the host reads block-wise and broadcasts; every rank re-adds each
+// particle via add(), which keeps it only on the owning rank
+template< typename T >
+bool nse::ptclTrackVec3d< T >::read_binary(
+	const std::string& filename,
+	const Grid3d< T >& grid)
+{
+	FILE* ptr;
+	int nsize = -1, nstatus;
+	const int host = 0;
+
+	clear();
+
+	if (grid.mpi_com.rank == host) {	// checking header & number of particles ...
+		ptr = fopen(filename.c_str(), "rb");
+		if (ptr != NULL)
+		{
+			int header[4];
+			nstatus = fread(header, sizeof(int), 4, ptr);
+			if ((nstatus == 4) &&
+				(header[0] == 'p' + 'n' + 's' + 'e') &&
+				(header[1] == 3) &&
+				(header[2] >= 0) &&
+				(header[3] == sizeof(T)))
+			{
+				nsize = header[2];
+
+				T time_mark;
+				nstatus = fread(&time_mark, sizeof(T), 1, ptr);
+			}
+			else
+				fclose(ptr);
+		}
+	}
+	// nsize == -1 signals open/header failure to all ranks
+	mpi_broadcast(&nsize, 1, host, grid.mpi_com.comm);
+	if (nsize == -1) return false;
+
+	// reading particles data ...
+	//
+	const int buf_size = 10 * 1024;	// particles per broadcast block
+
+	T* buf_real;
+	int *buf_int;
+	allocate_vnull(&buf_real, 6 * buf_size);
+	allocate_vnull(&buf_int, buf_size);
+
+	bool status = true;
+	int idx = 0, block_size;
+	while (idx < nsize)
+	{
+		block_size = min(nsize - idx, buf_size);
+		// host reads the next block of 6-real records ...
+		if (grid.mpi_com.rank == host)
+			nstatus = fread(buf_real, sizeof(T), 6 * block_size, ptr);
+
+		// ... and its read status is shared before using the data
+		mpi_broadcast(&nstatus, 1, host, grid.mpi_com.comm);
+		if (nstatus != 6 * block_size) {
+			status = false;
+			break;
+		}
+
+		mpi_broadcast(buf_real, 6 * block_size, host, grid.mpi_com.comm);
+
+		// tags live past the real records: jump over the remaining records
+		// and the tag counter, read this block's tags, then seek back
+		long int fpos_base;
+		if (grid.mpi_com.rank == host)
+		{
+			fpos_base = ftell(ptr);
+			fseek(ptr,
+				6 * (nsize - idx - block_size) * sizeof(T) +
+				sizeof(int) +			// skipping current tag value
+				idx * sizeof(int), SEEK_CUR);
+
+			nstatus = fread(buf_int, sizeof(int), block_size, ptr);
+
+			fseek(ptr, fpos_base, SEEK_SET);
+		}
+
+		mpi_broadcast(&nstatus, 1, host, grid.mpi_com.comm);
+		if (nstatus != block_size) {
+			status = false;
+			break;
+		}
+
+		mpi_broadcast(buf_int, block_size, host, grid.mpi_com.comm);
+
+		for (int i = 0; i < block_size; i++) {
+			T xpos = buf_real[6 * i];
+			T ypos = buf_real[6 * i + 1];
+			T zpos = buf_real[6 * i + 2];
+
+			T uval = buf_real[6 * i + 3];
+			T vval = buf_real[6 * i + 4];
+			T wval = buf_real[6 * i + 5];
+
+			// reuse add(): presetting current_tag makes it assign the
+			// particle its stored id (add() tags with current_tag)
+			current_tag = buf_int[i];
+			add(xpos, ypos, zpos, uval, vval, wval, grid);
+		}
+
+		idx += block_size;
+	}
+	// setting current tag value
+	//	note that file pointer is at current tag value after while loop
+	if (grid.mpi_com.rank == host)
+		nstatus = fread(&current_tag, sizeof(int), 1, ptr);
+	mpi_broadcast(&current_tag, 1, host, grid.mpi_com.comm);
+
+
+	deallocate(buf_real);
+	deallocate(buf_int);
+	if (grid.mpi_com.rank == host) fclose(ptr);
+	return status;
+}
+
+// indexed variant: appends idx to the file name, then delegates
+template< typename T >
+bool nse::ptclTrackVec3d< T >::read_binary(
+	const std::string& filename, const int idx,
+	const Grid3d< T >& grid)
+{
+	const std::string indexed_name = append_index(filename, idx);
+	return read_binary(indexed_name, grid);
+}
+// ----------------------------------------------------------------------------
+
+
+// initialize: explicit instantiations of the tracking particle vector class
+template class nse::ptclTrackVec3d< float >;
+template class nse::ptclTrackVec3d< double >;
+// ------------------------------------------------------------------------ //
diff --git a/ptcl-track-vec3d.h b/ptcl-track-vec3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..2dc5804b52860942feecb975dc9c29b933e06633
--- /dev/null
+++ b/ptcl-track-vec3d.h
@@ -0,0 +1,107 @@
+#pragma once
+
+// [ptcl-track-vec3d.h(cpp)]: 3D particles tracking vector
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "grid3d.h"
+#include "ptcl-vec3d.h"
+
+#include <string>
+
+
+
+namespace nse {
+
+	// forward declarations
+	// -------------------------------------------------------------------------------------------- //
+	template< typename T > class trajAccum3d;
+
+	// 3D particle vector with a unique integer tag per particle for
+	// trajectory tracking; extends ptclVec3d with tag bookkeeping in the
+	// add/exchange/resize paths and a tag appendix in the binary I/O format
+	template< typename T >
+	class ptclTrackVec3d : public ptclVec3d< T > {
+	public:
+
+		friend class nse::trajAccum3d< T >;
+
+
+		// add-init particle subroutines
+		//	NOTE: collective -- all MPI ranks must call (tag counter sync)
+		// ----------------------------------------------------------------------------
+		virtual void add(const T x, const T y, const T z,
+			const T u, const T v, const T w, const Grid3d< T >& grid);
+		virtual void add(const T x, const T y, const T z,
+			const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+			const Grid3d< T >& grid);
+
+		// get subroutines
+		// ----------------------------------------------------------------------------
+		// true while no particle has ever been tagged
+		virtual bool is_empty() const { return (current_tag == 0); }
+
+		// clear-free subroutines
+		// ----------------------------------------------------------------------------
+		virtual void clear();	// remove particles but keep memory
+		virtual void free();	// remove particles and memory
+
+		// binary I/O
+		// ----------------------------------------------------------------------------
+		virtual bool write_binary(const std::string& filename,
+			const Grid3d< T >& grid, const T time) const;
+		virtual bool write_binary(const std::string& filename, const int idx,
+			const Grid3d< T >& grid, const T time) const;
+
+		virtual bool mpi_write_binary(const std::string& filename,
+			const char* mpi_datarep, const Grid3d< T >& grid, const T time) const;
+		virtual bool mpi_write_binary(const std::string& filename, const int idx,
+			const char* mpi_datarep, const Grid3d< T >& grid, const T time) const;
+
+		virtual bool read_binary(const std::string& filename,
+			const Grid3d< T >& grid);
+		virtual bool read_binary(const std::string& filename, const int idx,
+			const Grid3d< T >& grid);
+
+		// ----------------------------------------------------------------------------
+		ptclTrackVec3d();
+		ptclTrackVec3d(const ptclTrackVec3d< T >& pvec);
+		~ptclTrackVec3d();
+
+	public:	// setting public for scatter visualization
+
+		using ptclVec3d<T>::n;
+		using ptclVec3d<T>::x; using ptclVec3d<T>::y; using ptclVec3d<T>::z;
+
+	protected:
+		int *tag;			// per-particle tracking id
+		int current_tag;	// next id to assign (rank-consistent via MPI_MAX)
+
+		using ptclVec3d<T>::up; using ptclVec3d<T>::vp; using ptclVec3d<T>::wp;
+		using ptclVec3d<T>::ic; using ptclVec3d<T>::jc; using ptclVec3d<T>::kc;
+
+		using ptclVec3d<T>::mem_size;
+		using ptclVec3d<T>::mem_alloc;
+
+		using ptclVec3d<T>::mpi_get_num;
+
+	protected:
+
+		// resize memory for requested number of elements
+		// ----------------------------------------------------------------------------
+		virtual void resize(const int n);
+
+		// swap i,j particles pair
+		// ----------------------------------------------------------------------------
+		virtual void swap(const int i, const int j);
+
+
+	private:
+
+		// add(get) particle data to(from) linear memory
+		//	record layout: 6 reals (x,y,z,up,vp,wp) + 1 int (tag)
+		// ----------------------------------------------------------------------------
+		void add_local(const T* _RESTRICT const buf, const int* _RESTRICT const ibuf, const int nbuf);
+		void get_local(T* _RESTRICT buf, int* _RESTRICT ibuf, const int idx, const int np) const;
+
+		// mpi-exchanges
+		// ----------------------------------------------------------------------------
+		virtual int mpi_exchange_line(const int* exch_imem, const int nexch,
+			const int rank, const int size, const MPI_Comm comm, const int period);
+	};
+}
diff --git a/ptcl-vec3d.cpp b/ptcl-vec3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..71f6e1ac1895c05982307d0918aa8b4382285251
--- /dev/null
+++ b/ptcl-vec3d.cpp
@@ -0,0 +1,1257 @@
+#define _CRT_SECURE_NO_WARNINGS
+#include "ptcl-vec3d.h"
+
+#include "nse-alloc.h"
+#include "vecmath.h"
+#include "mtrand.h"
+#include "str-com.h"
+
+#include <stdio.h>
+
+#ifdef USE_PTCL_EXTERNAL_LIB
+#include "ptcl-ext-stub3d.h"
+#endif
+
+
+// Implementation
+// ----------------------------------------------------------------------------
+// default constructor: empty vector with default physical parameters
+// NOTE(review): the per-particle array pointers (x,y,z,...) are left
+// uninitialized until the first resize() -- relies on the mem_size guard;
+// consider null-initializing them
+template< typename T >
+nse::ptclVec3d< T >::ptclVec3d()
+{
+	n = 0;
+	mem_size = 0;
+
+	is_no_particles = true;
+
+#ifdef	MEASURE_PARTICLE3D_TIME
+	// profiling accumulators per pipeline stage
+	time.locate = (double)0;
+	time.interpolate = (double)0;
+	time.update = (double)0;
+	time.mpi_exch = (double)0;
+#endif
+
+	// particles are passive tracers unless set_passive_transport(false)
+	is_passive_transport = true;
+
+	// default fluid and particle parameters (simulation units)
+	fluid_density = (T)1.0;
+	fluid_kinematic_viscosity = (T)1e-5;
+
+	density = (T)10.0;
+	diameter = (T)1e-7;
+
+	gravity_x = (T)0;
+	gravity_y = (T)0;
+	gravity_z = (T)0;
+
+#ifdef USE_PTCL_EXTERNAL_LIB
+	// mirror defaults into the external particle library
+	ptcl_lib::set_fluid_density(fluid_density);
+	ptcl_lib::set_fluid_kinematic_viscosity(fluid_kinematic_viscosity);
+	ptcl_lib::set_gravity(gravity_x, gravity_y, gravity_z);
+#endif
+}
+
+// copy constructor: duplicates particle state; the interpolation scratch
+// arrays (u,v,w) are allocated but intentionally not copied
+template< typename T >
+nse::ptclVec3d< T >::ptclVec3d(
+	const ptclVec3d< T >& pvec) : 
+	n(pvec.n), mem_size(pvec.mem_size), is_no_particles(pvec.is_no_particles),
+	
+	is_passive_transport(pvec.is_passive_transport),
+
+	fluid_density(pvec.fluid_density),
+	fluid_kinematic_viscosity(pvec.fluid_kinematic_viscosity),
+
+	density(pvec.density),
+	diameter(pvec.diameter),
+
+	gravity_x(pvec.gravity_x), gravity_y(pvec.gravity_y), gravity_z(pvec.gravity_z)
+{
+	if (mem_size > 0) {
+		allocate_vnull(&x, &y, &z, mem_size);
+		allocate_vnull(&u, &v, &w, mem_size);
+		allocate_vnull(&up, &vp, &wp, mem_size);
+
+		allocate_vnull(&ic, &jc, &kc, mem_size);
+
+		allocate_vnull(&exch_imem, mem_size);
+	}
+
+	if (n > 0) {
+		// copy: positions (x,y,z), particle velocities (up,vp,wp) and
+		// cell indices (ic,jc,kc); u,v,w are scratch and stay zeroed
+		//
+		mcopy(x, pvec.x, n);
+		mcopy(y, pvec.y, n);
+		mcopy(z, pvec.z, n);
+
+		mcopy(up, pvec.up, n);
+		mcopy(vp, pvec.vp, n);
+		mcopy(wp, pvec.wp, n);
+
+		mcopy(ic, pvec.ic, n);
+		mcopy(jc, pvec.jc, n);
+		mcopy(kc, pvec.kc, n);
+	}
+}
+
+// destructor: releases all particle arrays via free()
+template< typename T >
+nse::ptclVec3d< T >::~ptclVec3d()
+{
+	free();
+}
+// ----------------------------------------------------------------------------
+
+// add-init particle subroutines
+// ----------------------------------------------------------------------------
+// add a particle with an explicitly prescribed velocity (uval,vval,wval);
+// stored only on the owning rank, but ALL ranks must call this
+// (collective allreduce of the is_no_particles flag)
+template< typename T >
+void nse::ptclVec3d< T >::add(
+	const T xpos, const T ypos, const T zpos,
+	const T uval, const T vval, const T wval, const Grid3d< T >& grid)
+{
+	// adding particle to local MPI process only:
+	// locate_*() returns -1 when the coordinate is outside the local domain
+	int ipos = grid.locate_x(xpos),
+		jpos = grid.locate_y(ypos),
+		kpos = grid.locate_z(zpos);
+
+	if ((ipos != -1) && (jpos != -1) && (kpos != - 1))
+	{
+		resize(n + 1);	// adding memory if necessary
+
+		x[n] = xpos;
+		y[n] = ypos;
+		z[n] = zpos;
+
+		up[n] = uval;
+		vp[n] = vval;
+		wp[n] = wval;
+
+		ic[n] = ipos;
+		jc[n] = jpos;
+		kc[n] = kpos;
+
+		n++;
+	}
+
+	// refresh the global "no particles anywhere" flag (collective)
+	if (is_no_particles) {
+		int nall = n;
+		mpi_allreduce(&nall, MPI_SUM, grid.mpi_com.comm);
+		is_no_particles = (nall == 0);
+	}
+}
+
+// add a particle whose initial velocity is interpolated from the staggered
+// velocity fields (U,V,W) at its position; collective -- all ranks must call
+template< typename T >
+void nse::ptclVec3d< T >::add(
+	const T xpos, const T ypos, const T zpos,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W, const Grid3d< T >& grid)
+{
+	// adding particle to local MPI process only:
+	// locate_*() returns -1 when the coordinate is outside the local domain
+	int ipos = grid.locate_x(xpos),
+		jpos = grid.locate_y(ypos),
+		kpos = grid.locate_z(zpos);
+
+	if ((ipos != -1) && (jpos != -1) && (kpos != - 1))
+	{
+		resize(n + 1);	// adding memory if necessary
+
+		x[n] = xpos;
+		y[n] = ypos;
+		z[n] = zpos;
+
+		// initial particle velocity = local fluid velocity at (x,y,z)
+		up[n] = grid.u_interp_local(U, xpos, ypos, zpos, ipos, jpos, kpos);
+		vp[n] = grid.v_interp_local(V, xpos, ypos, zpos, ipos, jpos, kpos);
+		wp[n] = grid.w_interp_local(W, xpos, ypos, zpos, ipos, jpos, kpos);
+
+		ic[n] = ipos;
+		jc[n] = jpos;
+		kc[n] = kpos;
+
+		n++;
+	}
+
+	// refresh the global "no particles anywhere" flag (collective)
+	if (is_no_particles) {
+		int nall = n;
+		mpi_allreduce(&nall, MPI_SUM, grid.mpi_com.comm);
+		is_no_particles = (nall == 0);
+	}
+}
+
+// seed num particles uniformly distributed over the full MPI domain;
+// velocities are interpolated from (U,V,W); collective -- all ranks must
+// call with the same seed so every rank draws the same positions
+template< typename T >
+void nse::ptclVec3d< T >::add_uniform(
+	const int num, const long int rand_seed,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W, const Grid3d< T >& grid)
+{
+	MTRand_closed mt;	// Mersenne Twister uniform distribution in [0,1]
+	mt.seed(rand_seed);
+
+	// FIX: draw the three coordinates in a fixed (x,y,z) order; calling
+	// mt() inside the argument list relied on unspecified argument
+	// evaluation order, making the particle set compiler-dependent
+	for (int m = 0; m < num; m++)
+	{
+		const T xpos = grid.mpi_x + (T)mt() * grid.mpi_length;
+		const T ypos = grid.mpi_y + (T)mt() * grid.mpi_width;
+		const T zpos = grid.mpi_z + (T)mt() * grid.mpi_height;
+
+		add(xpos, ypos, zpos, U, V, W, grid);
+	}
+}
+
+// seed num particles uniformly distributed over the box
+// [xmin,xmax] x [ymin,ymax] x [zmin,zmax]; velocities interpolated from
+// (U,V,W); collective -- all ranks must call with the same seed
+template< typename T >
+void nse::ptclVec3d< T >::add_uniform(
+	const int num, const long int rand_seed,
+	const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
+	const T xmin, const T xmax,
+	const T ymin, const T ymax,
+	const T zmin, const T zmax,
+	const Grid3d< T >& grid)
+{
+	MTRand_closed mt;	// Mersenne Twister uniform distribution in [0,1]
+	mt.seed(rand_seed);
+
+	// FIX: draw the three coordinates in a fixed (x,y,z) order; calling
+	// mt() inside the argument list relied on unspecified argument
+	// evaluation order, making the particle set compiler-dependent
+	for (int m = 0; m < num; m++)
+	{
+		const T xpos = xmin + (T)mt() * (xmax - xmin);
+		const T ypos = ymin + (T)mt() * (ymax - ymin);
+		const T zpos = zmin + (T)mt() * (zmax - zmin);
+
+		add(xpos, ypos, zpos, U, V, W, grid);
+	}
+}
+// ----------------------------------------------------------------------------
+
+// add wall
+// ----------------------------------------------------------------------------
+// register a quadrilateral wall plane (four corner points) with the
+// external particle library; no-op when USE_PTCL_EXTERNAL_LIB is not defined
+template< typename T >
+void nse::ptclVec3d< T >::add_wall_plane(
+	const T x1, const T y1, const T z1,
+	const T x2, const T y2, const T z2,
+	const T x3, const T y3, const T z3,
+	const T x4, const T y4, const T z4)
+{
+#ifdef USE_PTCL_EXTERNAL_LIB
+	ptcl_lib::add_wall_plane(
+		x1, y1, z1,
+		x2, y2, z2,
+		x3, y3, z3,
+		x4, y4, z4);
+#endif
+}
+
+// forward a cell mask and the grid geometry to the external particle
+// library; no-op when USE_PTCL_EXTERNAL_LIB is not defined
+template< typename T >
+void nse::ptclVec3d< T >::add_mask(const int* _RESTRICT c_mask, const Grid3d< T >& grid)
+{
+#ifdef USE_PTCL_EXTERNAL_LIB
+	ptcl_lib::add_mask(c_mask, grid.ex, grid.ey, grid.ez,
+		grid.nx, grid.ny, grid.nz,
+		grid.gcx, grid.gcy, grid.gcz);
+#endif
+}
+// ----------------------------------------------------------------------------
+
+// set calls
+// ----------------------------------------------------------------------------
+// passive (tracer) transport on/off; when off, update() integrates the
+// inertial drag model instead of following the fluid velocity
+template< typename T >
+void nse::ptclVec3d< T >::set_passive_transport(const bool mode)
+{
+	is_passive_transport = mode;
+}
+
+// particle material density (simulation units)
+template< typename T >
+void nse::ptclVec3d< T >::set_particle_density(const T _density)
+{
+	density = _density;
+}
+
+// particle diameter (simulation units)
+template< typename T >
+void nse::ptclVec3d< T >::set_particle_diameter(const T _diameter)
+{
+	diameter = _diameter;
+}
+
+// carrier fluid density; mirrored into the external particle library
+template< typename T >
+void nse::ptclVec3d< T >::set_fluid_density(const T _density)
+{
+	fluid_density = _density;
+
+#ifdef USE_PTCL_EXTERNAL_LIB
+	ptcl_lib::set_fluid_density(fluid_density);
+#endif
+}
+
+// carrier fluid kinematic viscosity; mirrored into the external library
+template< typename T >
+void nse::ptclVec3d< T >::set_fluid_kinematic_viscosity(const T _kinematic_viscosity)
+{
+	fluid_kinematic_viscosity = _kinematic_viscosity;
+
+#ifdef USE_PTCL_EXTERNAL_LIB
+	ptcl_lib::set_fluid_kinematic_viscosity(fluid_kinematic_viscosity);
+#endif
+}
+
+// gravity acceleration vector; mirrored into the external library
+template< typename T >
+void nse::ptclVec3d< T >::set_gravity(const T gx, const T gy, const T gz)
+{
+	gravity_x = gx;
+	gravity_y = gy;
+	gravity_z = gz;
+
+#ifdef USE_PTCL_EXTERNAL_LIB
+	ptcl_lib::set_gravity(gravity_x, gravity_y, gravity_z);
+#endif
+}
+// ----------------------------------------------------------------------------
+
+// get subroutines
+// ----------------------------------------------------------------------------
+// global particle count: sum of local counts over the grid communicator
+// (collective -- all ranks must call)
+template< typename T >
+int nse::ptclVec3d< T >::mpi_get_num(const Grid3d< T >& grid) const
+{
+	const int num_global = mpi_allreduce(n, MPI_SUM, grid.mpi_com.comm);
+	return num_global;
+}
+
+// number concentration field: for every particle, add 1/cell-volume to the
+// cell (ic,jc,kc) currently holding it; C is zeroed first (local data only)
+template< typename T >
+void nse::ptclVec3d< T >::get_number_concentration(T* _RESTRICT C, const Grid3d< T >& grid) const
+{
+	int m, idx;
+	
+	null(C, grid.size);
+
+#pragma omp parallel for private( m, idx ) shared(C)
+	for (m = 0; m < n; m++)
+	{
+		idx = ic[m] * grid.nyz + jc[m] * grid.nz + kc[m];
+
+		// cell volume from the coordinate edge arrays
+		T volume =
+			(grid.ex[ic[m] + 1] - grid.ex[ic[m]]) *
+			(grid.ey[jc[m] + 1] - grid.ey[jc[m]]) *
+			(grid.ez[kc[m] + 1] - grid.ez[kc[m]]);
+
+		// FIX: particles sharing a cell update the same C[idx] from
+		// different threads -- the accumulation must be atomic
+#pragma omp atomic
+		C[idx] += (T)1.0 / volume;
+	}
+}
+// ----------------------------------------------------------------------------
+
+
+// update particle subroutines
+// ----------------------------------------------------------------------------
// advance all particles by one time step of size dt:
//   (1) interpolate grid velocity to particle positions,
//   (2) advect particles (inertial drag model or passive transport),
//   (3) relocate particles on the local grid,
//   (4) run MPI exchanges for particles that left the local domain
template< typename T >
void nse::ptclVec3d< T >::update(
	const T* _RESTRICT const Uf, const T* _RESTRICT const Vf, const T* _RESTRICT const Wf,
	const int x_period, const int y_period, const int z_period,
	const Grid3d< T >& grid, const T dt)
{
	int m;

#ifdef	MEASURE_PARTICLE3D_TIME
	double begin_time, end_time;
#endif

	// velocity interpolation
	// --------------------------------------------------------
#ifdef	MEASURE_PARTICLE3D_TIME
	begin_time = omp_get_wtime();
#endif

	// interpolate grid velocity components to particle positions -> (u, v, w)
	grid.u_interp_local(u, Uf, x, y, z, ic, jc, kc, n);
	grid.v_interp_local(v, Vf, x, y, z, ic, jc, kc, n);
	grid.w_interp_local(w, Wf, x, y, z, ic, jc, kc, n);

#ifdef	MEASURE_PARTICLE3D_TIME
	end_time = omp_get_wtime();
	time.interpolate += end_time - begin_time;
#endif
	// --------------------------------------------------------

	// particle advection
	// --------------------------------------------------------
#ifdef	MEASURE_PARTICLE3D_TIME
	begin_time = omp_get_wtime();
#endif

	if (!is_passive_transport) 
	{
		// inertial particles: drag + gravity (buoyancy-corrected)
#pragma omp parallel for private( m )
		for (m = 0; m < n; m++)
		{

#ifdef USE_PTCL_EXTERNAL_LIB
			ptcl_lib::update_particle(
				&x[m], &y[m], &z[m],
				&up[m], &vp[m], &wp[m],
				u[m], v[m], w[m],
				density, diameter, gravity_x, gravity_y, gravity_z,
				dt);
#else
			// relative particle-fluid speed
			T umod = sqrt(
				(up[m] - u[m]) * (up[m] - u[m]) +
				(vp[m] - v[m]) * (vp[m] - v[m]) +
				(wp[m] - w[m]) * (wp[m] - w[m]));
			// particle Reynolds number (based on diameter)
			T Re = diameter * umod / fluid_kinematic_viscosity;

			// --- f = Cd * Re (empirical nonlinear drag correction)
			T f_drag = ((T)24.0) * ((T)1.0 + (T)0.158 * pow(Re, (T)2.0 / (T)3.0));
			// --- drag time scale
			T tau_drag = ((T)4.0 / (T)3.0) * ((T)1.0 / f_drag) * (density / fluid_density) * diameter * (diameter / fluid_kinematic_viscosity);

			// --- additional forces
			//	--- gravity
			T Bu = gravity_x * ((T)1.0 - (fluid_density / density));
			T Bv = gravity_y * ((T)1.0 - (fluid_density / density));
			T Bw = gravity_z * ((T)1.0 - (fluid_density / density));

			// semi-implicit velocity update; the branch is chosen by the
			// ratio dt / tau_drag -- NOTE(review): presumably to keep the
			// drag term stable for both small and large tau_drag; verify
			T upn, vpn, wpn;
			if (dt < tau_drag) 
			{
				upn = (up[m] + (u[m] * (dt / tau_drag) + Bu * dt)) / ((T)1.0 + (dt / tau_drag));
				vpn = (vp[m] + (v[m] * (dt / tau_drag) + Bv * dt)) / ((T)1.0 + (dt / tau_drag));
				wpn = (wp[m] + (w[m] * (dt / tau_drag) + Bw * dt)) / ((T)1.0 + (dt / tau_drag));
			}
			else
			{
				upn = (u[m] + (up[m] * (tau_drag / dt) + Bu * tau_drag)) / ((T)1.0 + (tau_drag / dt));
				vpn = (v[m] + (vp[m] * (tau_drag / dt) + Bv * tau_drag)) / ((T)1.0 + (tau_drag / dt));
				wpn = (w[m] + (wp[m] * (tau_drag / dt) + Bw * tau_drag)) / ((T)1.0 + (tau_drag / dt));
			}

			// trapezoidal position update using old and new particle velocity
			T xpn = x[m] + (T)0.5 * dt * (upn + up[m]);
			T ypn = y[m] + (T)0.5 * dt * (vpn + vp[m]);
			T zpn = z[m] + (T)0.5 * dt * (wpn + wp[m]);

			// --- domain wall collisions
			//     reflect: keep the old position, invert the velocity component
			if (!x_period) {
				if ((xpn < grid.mpi_x) || (xpn > grid.mpi_x + grid.mpi_length)) { 
					xpn = x[m]; upn = -upn; 
				}
			}
			if (!y_period) {
				if ((ypn < grid.mpi_y) || (ypn > grid.mpi_y + grid.mpi_width)) {
					ypn = y[m]; vpn = -vpn;
				}
			}
			if (!z_period) {
				if ((zpn < grid.mpi_z) || (zpn > grid.mpi_z + grid.mpi_height)) { 
					zpn = z[m]; wpn = -wpn; 
				}
			}

			x[m] = xpn;
			y[m] = ypn;
			z[m] = zpn;

			up[m] = upn;
			vp[m] = vpn;
			wp[m] = wpn;
#endif
		}
	}
	else
	{
		// passive transport: 2nd-order (Adams-Bashforth type) position update
		// using the current (u,v,w) and previous (up,vp,wp) interpolated velocity
#pragma omp parallel for private( m )
		for (m = 0; m < n; m++)
		{
			x[m] += ((T)1.5 * u[m] - (T)0.5 * up[m]) * dt;
			y[m] += ((T)1.5 * v[m] - (T)0.5 * vp[m]) * dt;
			z[m] += ((T)1.5 * w[m] - (T)0.5 * wp[m]) * dt;

			// store the current interpolated velocity as "previous"
			up[m] = u[m];
			vp[m] = v[m];
			wp[m] = w[m];
		}
	}

#ifdef	MEASURE_PARTICLE3D_TIME
	end_time = omp_get_wtime();
	time.update += end_time - begin_time;
#endif
	// --------------------------------------------------------

	// particle location on grid
	// --------------------------------------------------------
#ifdef	MEASURE_PARTICLE3D_TIME
	begin_time = omp_get_wtime();
#endif

	// recompute grid index coordinates (ic, jc, kc); out-of-domain
	// particles are flagged for the MPI exchange step below
	grid.locate_local_x(x, ic, CFL_restriction, n);
	grid.locate_local_y(y, jc, CFL_restriction, n);
	grid.locate_local_z(z, kc, CFL_restriction, n);

#ifdef	MEASURE_PARTICLE3D_TIME
	end_time = omp_get_wtime();
	time.locate += end_time - begin_time;
#endif
	// --------------------------------------------------------

	// MPI exchanges
	// --------------------------------------------------------
#ifdef	MEASURE_PARTICLE3D_TIME
	begin_time = omp_get_wtime();
#endif

	mpi_exchange_x(x_period, grid);
	mpi_exchange_y(y_period, grid);
	mpi_exchange_z(z_period, grid);

#ifdef	MEASURE_PARTICLE3D_TIME
	end_time = omp_get_wtime();
	time.mpi_exch += end_time - begin_time;
#endif
	// --------------------------------------------------------
}
+// ----------------------------------------------------------------------------
+
+// clear-free subroutines
+// ----------------------------------------------------------------------------
+template< typename T >
+void nse::ptclVec3d< T >::clear()
+{
+	n = 0;
+	is_no_particles = true;
+}
+
+template< typename T >
+void nse::ptclVec3d< T >::free()
+{
+	if (mem_size > 0) {
+		deallocate(x, y, z);
+		deallocate(u, v, w);
+		deallocate(up, vp, wp);
+
+		deallocate(ic, jc, kc);
+
+		deallocate(exch_imem);
+		mem_size = 0;
+	}
+
+	n = 0;
+	is_no_particles = true;
+}
+// ----------------------------------------------------------------------------
+
+
+// PRIVATE:
+// ----------------------------------------------------------------------------
+
+// resize memory for requested number of elements
+// ----------------------------------------------------------------------------
// grow storage to hold at least m particles (never shrinks):
//   persistent particle data [0, n) is copied over;
//   work arrays (u, v, w and exch_imem) hold no persistent state
//   and are simply reallocated at the new size
template< typename T >
void nse::ptclVec3d< T >::resize(const int m)
{
	if (m > mem_size)
	{
		T *x_mem, *y_mem, *z_mem;
		T *up_mem, *vp_mem, *wp_mem;
		int *ic_mem, *jc_mem, *kc_mem;

		// grow by at least mem_alloc to amortize reallocations
		const int nalloc = max(m, mem_size + mem_alloc);

		allocate_vnull(&x_mem, &y_mem, &z_mem, nalloc);
		allocate_vnull(&up_mem, &vp_mem, &wp_mem, nalloc);
		allocate_vnull(&ic_mem, &jc_mem, &kc_mem, nalloc);

		// copy persistent per-particle data into the new arrays
		if (n > 0) {
			mcopy(x_mem, x, n);
			mcopy(y_mem, y, n);
			mcopy(z_mem, z, n);

			mcopy(up_mem, up, n);
			mcopy(vp_mem, vp, n);
			mcopy(wp_mem, wp, n);

			mcopy(ic_mem, ic, n);
			mcopy(jc_mem, jc, n);
			mcopy(kc_mem, kc, n);
		}
		// free the old arrays (including the work arrays, reallocated below)
		if (mem_size > 0) {
			deallocate(x, y, z);
			deallocate(u, v, w);
			deallocate(up, vp, wp);

			deallocate(ic, jc, kc);

			deallocate(exch_imem);
		}

		x = x_mem; y = y_mem; z = z_mem;
		up = up_mem; vp = vp_mem; wp = wp_mem;

		ic = ic_mem; jc = jc_mem; kc = kc_mem;

		mem_size = nalloc;

		// fresh work arrays at the new capacity (contents not preserved)
		allocate_vnull(&u, &v, &w, mem_size);
		allocate_vnull(&exch_imem, mem_size);
	}
}
+// ----------------------------------------------------------------------------
+
+// swap i,j particles pair
+// ----------------------------------------------------------------------------
+template< typename T >
+void nse::ptclVec3d< T >::swap(const int i, const int j)
+{
+	T _x = x[i]; x[i] = x[j]; x[j] = _x;
+	T _y = y[i]; y[i] = y[j]; y[j] = _y;
+	T _z = z[i]; z[i] = z[j]; z[j] = _z;
+
+	T _up = up[i]; up[i] = up[j]; up[j] = _up;
+	T _vp = vp[i]; vp[i] = vp[j]; vp[j] = _vp;
+	T _wp = wp[i]; wp[i] = wp[j]; wp[j] = _wp;
+
+	int _ic = ic[i]; ic[i] = ic[j]; ic[j] = _ic;
+	int _jc = jc[i]; jc[i] = jc[j]; jc[j] = _jc;
+	int _kc = kc[i]; kc[i] = kc[j]; kc[j] = _kc;
+}
+// ----------------------------------------------------------------------------
+
+// add(get) particle data to(from) linear memory
+// ----------------------------------------------------------------------------
+template< typename T >
+void nse::ptclVec3d< T >::add_local(
+	const T* _RESTRICT const buf, const int nbuf)
+{
+	resize(n + nbuf);
+
+	for (int i = 0; i < nbuf; i++) {
+
+		x[n] = buf[6 * i];
+		y[n] = buf[6 * i + 1];
+		z[n] = buf[6 * i + 2];
+
+		up[n] = buf[6 * i + 3];
+		vp[n] = buf[6 * i + 4];
+		wp[n] = buf[6 * i + 5];
+
+		n++;
+	}
+}
+
+template< typename T >
+void nse::ptclVec3d< T >::get_local(
+	T* _RESTRICT buf, const int idx, const int np) const
+{
+	for (int i = 0; i < np; i++) 
+	{
+		buf[6 * i] = x[idx + i];
+		buf[6 * i + 1] = y[idx + i];
+		buf[6 * i + 2] = z[idx + i];
+
+		buf[6 * i + 3] = up[idx + i];
+		buf[6 * i + 4] = vp[idx + i];
+		buf[6 * i + 5] = wp[idx + i];
+	}
+}
+// ----------------------------------------------------------------------------
+
+// special call to check if p belongs to segment [a,b]
+//   =  0 : inside 
+//   = -1 : outside (p < a)
+//   =  1 : outside (p > b)
+// ----------------------------------------------------------------------------
+template< typename T >
+int nse::ptclVec3d< T >::is_inside_x(
+	const T px, const Grid3d< T >& grid) const
+{
+	if (px > grid.x + grid.length) return 1;
+	if ((px < grid.x) ||
+		((grid.mpi_com.rank_x > 0) && (px <= grid.x))) return -1;
+
+	return 0;
+}
+
+template< typename T >
+int nse::ptclVec3d< T >::is_inside_y(
+	const T py, const Grid3d< T >& grid) const
+{
+	if (py > grid.y + grid.width) return 1;
+	if ((py < grid.y) ||
+		((grid.mpi_com.rank_y > 0) && (py <= grid.y))) return -1;
+
+	return 0;
+}
+
+template< typename T >
+int nse::ptclVec3d< T >::is_inside_z(
+	const T pz, const Grid3d< T >& grid) const
+{
+	if (pz > grid.z + grid.height) return 1;
+	if ((pz < grid.z) ||
+		((grid.mpi_com.rank_z > 0) && (pz <= grid.z))) return -1;
+
+	return 0;
+}
+// ----------------------------------------------------------------------------
+
+// mpi-exchange particles sorting :
+//   moving particles marked for exchange to the back of the array
+// ----------------------------------------------------------------------------
// two-pointer partition:
//   ptr scans forward for marked (exchange) particles,
//   nptr scans backward for unmarked (resident) particles;
//   swapping such pairs moves all marked particles to the back of the array
template< typename T >
int nse::ptclVec3d< T >::exch_sort(int* exch_mark)
{
	int ptr = 0, nptr = n - 1;
	while (ptr <= nptr) {

		// pointing to swap ...
		while (nptr >= 0) {
			if (exch_mark[nptr] == 0) break;	// last resident particle
			nptr--;
		}
		while (ptr <= nptr) {
			if (exch_mark[ptr] != 0) break;	// first marked particle
			ptr++;
		}

		if (ptr < nptr) {	// swap data ...
			swap(ptr, nptr);

			// keep the mark array consistent with the swapped particle data
			int mark = exch_mark[ptr];
			exch_mark[ptr] = exch_mark[nptr];
			exch_mark[nptr] = mark;
		}
	}

	// marked particles now occupy [nptr + 1, n); return their count
	return n - nptr - 1;
}
+// ----------------------------------------------------------------------------
+
+// mpi-exchanges
+// ----------------------------------------------------------------------------
// exchange particles that left the local domain in the x direction;
// returns false if some out-of-bounds particle could not be attributed
// to a neighboring process
template< typename T >
bool nse::ptclVec3d< T >::mpi_exchange_x(const int x_period, const Grid3d< T >& grid)
{
	// mark -x exchange particles
	int m, error_flag = 0;
#pragma omp parallel for private( m ) reduction( + : error_flag )
	for (m = 0; m < n; m++)
	{
		if (ic[m] == -1) {	// outside particle
			// -1: send to left neighbor, +1: send to right neighbor
			exch_imem[m] = is_inside_x(x[m], grid);
			if (exch_imem[m] == 0) error_flag++;	// signal error
		}
		else
			exch_imem[m] = 0;
	}
	if (error_flag) return false;	// failed to process some out of bounds particles

	// marked particles are moved to the back of the array: [n - nexch, n)
	int nexch = exch_sort(exch_imem);

	if (x_period) {	// periodic b.c.
		// wrap coordinates of outgoing particles across the global domain;
		// using <=, >= to "account" for finite precision effects
		for (m = n - nexch; m < n; m++) {
			if (x[m] <= grid.mpi_x) {
				x[m] += grid.mpi_length; 
				continue;
			}
			if (x[m] >= grid.mpi_x + grid.mpi_length) {
				x[m] -= grid.mpi_length; 
				continue;
			}
		}
	}

	int nadd = mpi_exchange_line(exch_imem, nexch,
		grid.mpi_com.rank_x, grid.mpi_com.size_x, grid.mpi_com.comm_x, x_period);

	// post-processing of particles added to local MPI process [n - nadd, n)
	for (int m = n - nadd; m < n; m++) {
		// define grid coordinates
		ic[m] = grid.locate_x(x[m]);
		jc[m] = grid.locate_y(y[m]);
		kc[m] = grid.locate_z(z[m]);
	}

	return true;
}
+
// exchange particles that left the local domain in the y direction;
// returns false if some out-of-bounds particle could not be attributed
// to a neighboring process
template< typename T >
bool nse::ptclVec3d< T >::mpi_exchange_y(const int y_period, const Grid3d< T >& grid)
{
	// mark -y exchange particles
	int m, error_flag = 0;
#pragma omp parallel for private( m ) reduction( + : error_flag )
	for (m = 0; m < n; m++)
	{
		if (jc[m] == -1) {	// outside particle
			// -1: send to left neighbor, +1: send to right neighbor
			exch_imem[m] = is_inside_y(y[m], grid);
			if (exch_imem[m] == 0) error_flag++;	// signal error
		}
		else
			exch_imem[m] = 0;
	}
	if (error_flag) return false;	// failed to process some out of bounds particles

	// marked particles are moved to the back of the array: [n - nexch, n)
	int nexch = exch_sort(exch_imem);

	if (y_period) {	// periodic b.c.
					// wrap coordinates of outgoing particles across the global domain;
					// using <=, >= to "account" for finite precision effects
		for (int i = n - nexch; i < n; i++) {
			if (y[i] <= grid.mpi_y) {
				y[i] += grid.mpi_width;
				continue;
			}
			if (y[i] >= grid.mpi_y + grid.mpi_width) {
				y[i] -= grid.mpi_width;
				continue;
			}
		}
	}

	int nadd = mpi_exchange_line(exch_imem, nexch,
		grid.mpi_com.rank_y, grid.mpi_com.size_y, grid.mpi_com.comm_y, y_period);

	// post-processing of particles added to local MPI process [n - nadd, n)
	for (int m = n - nadd; m < n; m++) {
		// define grid coordinates
		ic[m] = grid.locate_x(x[m]);
		jc[m] = grid.locate_y(y[m]);
		kc[m] = grid.locate_z(z[m]);
	}

	return true;
}
+
// exchange particles that left the local domain in the z direction;
// returns false if some out-of-bounds particle could not be attributed
// to a neighboring process
template< typename T >
bool nse::ptclVec3d< T >::mpi_exchange_z(const int z_period, const Grid3d< T >& grid)
{
	// mark -z exchange particles
	int m, error_flag = 0;
#pragma omp parallel for private( m ) reduction( + : error_flag )
	for (m = 0; m < n; m++)
	{
		if (kc[m] == -1) {	// outside particle
			// -1: send to left neighbor, +1: send to right neighbor
			exch_imem[m] = is_inside_z(z[m], grid);
			if (exch_imem[m] == 0) error_flag++;	// signal error
		}
		else
			exch_imem[m] = 0;
	}
	if (error_flag) return false;	// failed to process some out of bounds particles

	// marked particles are moved to the back of the array: [n - nexch, n)
	int nexch = exch_sort(exch_imem);

	if (z_period) {	// periodic b.c.
		// wrap coordinates of outgoing particles across the global domain;
		// using <=, >= to "account" for finite precision effects
		for (int i = n - nexch; i < n; i++) {
			if (z[i] <= grid.mpi_z) {
				z[i] += grid.mpi_height;
				continue;
			}
			if (z[i] >= grid.mpi_z + grid.mpi_height) {
				z[i] -= grid.mpi_height;
				continue;
			}
		}
	}

	int nadd = mpi_exchange_line(exch_imem, nexch,
		grid.mpi_com.rank_z, grid.mpi_com.size_z, grid.mpi_com.comm_z, z_period);

	// post-processing of particles added to local MPI process [n - nadd, n)
	for (int m = n - nadd; m < n; m++) {
		// define grid coordinates
		ic[m] = grid.locate_x(x[m]);
		jc[m] = grid.locate_y(y[m]);
		kc[m] = grid.locate_z(z[m]);
	}

	return true;
}
+
// exchange the marked tail particles [n - nexch, n) with the two neighbor
// ranks along one communicator line (x, y or z):
//   phase 1 (tag 0): exchange particle counts,
//   phase 2 (tag 1): exchange packed particle records (6 values each);
// returns the number of particles received and appended at the back
template< typename T >
int nse::ptclVec3d< T >::mpi_exchange_line(
	const int* exch_imem, const int nexch,
	const int rank, const int size, const MPI_Comm comm, const int period)
{
	T *sbuf[2], *rbuf[2];				// send-recv buffers
	// index [0] = left neighbor, [1] = right neighbor

	int sbuf_id[2], rbuf_id[2];		// buffers id's for memory handling

	int nsend[2], nrecv[2];		// number of send-recv elements
	MPI_Request mpi_req[8];


	if (size == 1) {	// handling degenerate case
		if (!period) {
			n = n - nexch;	// simply removing particles
			return 0;
		}
		// periodic single process: particles stay local (coordinates
		// were already wrapped by the caller)
		return nexch;
	}


	const bool is_exch_left = (size > 1) &&
		((period) || (rank > 0));				// "left" exchange flag
	const bool is_exch_right = (size > 1) &&
		((period) || (rank < size - 1));		// "right" exchange flag

	// neighbor ranks with periodic wrap-around
	const int pidx_left = (rank > 0) ? rank - 1 : size - 1;
	const int pidx_right = (rank < size - 1) ? rank + 1 : 0;

	for (int i = 0; i < 8; i++)
		mpi_req[i] = MPI_REQUEST_NULL;

	sbuf[0] = NULL; sbuf[1] = NULL;
	rbuf[0] = NULL; rbuf[1] = NULL;

	nrecv[0] = 0; nrecv[1] = 0;
	nsend[0] = 0; nsend[1] = 0;


	if (is_exch_left)
	{
		// phase 1: exchange counts with the left neighbor
		MPI_Irecv(&nrecv[0], 1, MPI_INT, pidx_left, 0, comm, &mpi_req[0]);

		for (int i = n - nexch; i < n; i++)
			if (exch_imem[i] == -1) nsend[0]++;

		if (nsend[0] > 0) {
			sbuf_id[0] = memStx::get_buf(&sbuf[0], 6 * nsend[0]);
		}

		MPI_Isend(&nsend[0], 1, MPI_INT, pidx_left, 0, comm, &mpi_req[2]);

		// pack left-going particles (loop body runs only if nsend[0] > 0)
		int idx = 0;
		for (int i = n - nexch; i < n; i++) {
			if (exch_imem[i] == -1) {
				get_local(&sbuf[0][6 * idx], i, 1);
				idx++;
			}
		}
	}
	if (is_exch_right)
	{
		// phase 1: exchange counts with the right neighbor
		MPI_Irecv(&nrecv[1], 1, MPI_INT, pidx_right, 0, comm, &mpi_req[1]);

		for (int i = n - nexch; i < n; i++)
			if (exch_imem[i] == 1) nsend[1]++;

		if (nsend[1] > 0) {
			sbuf_id[1] = memStx::get_buf(&sbuf[1], 6 * nsend[1]);
		}

		MPI_Isend(&nsend[1], 1, MPI_INT, pidx_right, 0, comm, &mpi_req[3]);

		// pack right-going particles (loop body runs only if nsend[1] > 0)
		int idx = 0;
		for (int i = n - nexch; i < n; i++) {
			if (exch_imem[i] == 1) {
				get_local(&sbuf[1][6 * idx], i, 1);
				idx++;
			}
		}
	}

	// counts are known after this point
	MPI_Waitall(4, mpi_req, MPI_STATUSES_IGNORE);

	// phase 2: exchange packed particle data (6 values per particle)
	if (nsend[0] > 0)
		MPI_Isend(sbuf[0], 6 * nsend[0], mpi_type< T >(), pidx_left, 1, comm, &mpi_req[4]);

	if (nrecv[0] > 0) {
		rbuf_id[0] = memStx::get_buf(&rbuf[0], 6 * nrecv[0]);
		MPI_Irecv(rbuf[0], 6 * nrecv[0], mpi_type< T >(), pidx_left, 1, comm, &mpi_req[6]);
	}

	if (nsend[1] > 0)
		MPI_Isend(sbuf[1], 6 * nsend[1], mpi_type< T >(), pidx_right, 1, comm, &mpi_req[5]);

	if (nrecv[1] > 0) {
		rbuf_id[1] = memStx::get_buf(&rbuf[1], 6 * nrecv[1]);
		MPI_Irecv(rbuf[1], 6 * nrecv[1], mpi_type< T >(), pidx_right, 1, comm, &mpi_req[7]);
	}

	MPI_Waitall(4, &mpi_req[4], MPI_STATUSES_IGNORE);

	// drop the sent tail, then append received particles
	n = n - nexch;
	if (nrecv[0] > 0) add_local(rbuf[0], nrecv[0]);
	if (nrecv[1] > 0) add_local(rbuf[1], nrecv[1]);

	if (nrecv[0] > 0) memStx::free_buf(rbuf_id[0]);
	if (nrecv[1] > 0) memStx::free_buf(rbuf_id[1]);
	if (nsend[0] > 0) memStx::free_buf(sbuf_id[0]);
	if (nsend[1] > 0) memStx::free_buf(sbuf_id[1]);

	return nrecv[0] + nrecv[1];
}
+// ----------------------------------------------------------------------------
+
+
+// PUBLIC (I/O subroutines):
+// ----------------------------------------------------------------------------
+
+// tecplot output
+// ----------------------------------------------------------------------------
+template< typename T >
+bool nse::ptclVec3d< T >::write_tecplot(
+	const std::string& filename,
+	const Grid3d< T >& grid, const T time) const
+{
+	const int host = 0;
+	int nall = mpi_get_num(grid);
+
+	T *xout, *yout, *zout;
+	if (grid.mpi_com.rank == host)
+		allocate_vnull(&xout, &yout, &zout, nall);
+
+	mpi_gather_vec(x, n, xout, host);
+	mpi_gather_vec(y, n, yout, host);
+	mpi_gather_vec(z, n, zout, host);
+
+
+	int status = 0;
+	if (grid.mpi_com.rank == host)
+	{
+		FILE* ptr = fopen(filename.c_str(), "w");
+		if (ptr != NULL) {
+			fprintf(ptr, " TITLE = \"Particles 3D\"\n");
+			fprintf(ptr, " VARIABLES = \"X\", \"Y\", \"Z\"\n");
+			fprintf(ptr, " ZONE I = %i, DATAPACKING = POINT, SOLUTIONTIME = %f\n", nall, time);
+
+			for (int i = 0; i < nall; i++)
+				fprintf(ptr, "%f %f %f\n", xout[i], yout[i], zout[i]);
+
+			fclose(ptr);
+			status = 1;
+		}
+
+		deallocate(xout, yout, zout);
+	}
+
+	mpi_broadcast(&status, 1, host, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+template< typename T >
+bool nse::ptclVec3d< T >::write_tecplot(
+	const std::string& filename, const int idx,
+	const Grid3d< T >& grid, const T time) const
+{
+	return write_tecplot(append_index(filename, idx), grid, time);
+}
+// ----------------------------------------------------------------------------
+
+// write binary
+// ----------------------------------------------------------------------------
+template< typename T >
+bool nse::ptclVec3d< T >::write_binary(
+	const std::string& filename,
+	const Grid3d< T >& grid, const T time) const
+{
+#ifdef _NSE_MPI_IO	// forcing MPI-IO...
+	bool mpi_io_status = mpi_write_binary(filename, _NSE_MPI_IO_DATAREP_DEFAULT, grid, time);
+#ifndef _NSE_MPI_IO_RETRY_SEQ
+	return mpi_io_status;
+#else
+	if (mpi_io_status) return true;
+#endif
+#endif
+
+#if (!defined(_NSE_MPI_IO) || \
+	(defined(_NSE_MPI_IO) && defined(_NSE_MPI_IO_RETRY_SEQ)))
+
+	T *buf_in, *buf_out;
+
+	if (n > 0) allocate_vnull(&buf_in, 6 * n);
+	get_local(buf_in, 0, n);
+
+	const int host = 0;
+	int nall = mpi_get_num(grid);
+
+	if (grid.mpi_com.rank == host)
+		if (nall > 0) allocate_vnull(&buf_out, 6 * nall);
+
+	mpi_gather_vec(buf_in, 6 * n, buf_out, host);
+	if (n > 0) deallocate(buf_in);
+
+	int status = 0;
+	if (grid.mpi_com.rank == host)
+	{
+		FILE* ptr = fopen(filename.c_str(), "wb");
+		if (ptr != NULL)
+		{
+			int header[4] = {
+				'p' + 'n' + 's' + 'e',	// binary id
+				3,						// number of dimensions
+				nall,					// number of particles
+				sizeof(T)				// data type size
+			};
+			T time_mark = time;
+
+			fwrite(header, sizeof(int), 4, ptr);
+			fwrite(&time_mark, sizeof(T), 1, ptr);
+
+			if (nall > 0)
+				fwrite(buf_out, sizeof(T), 6 * nall, ptr);
+
+			fclose(ptr);
+			status = 1;
+		}
+
+		if (nall > 0) deallocate(buf_out);
+	}
+
+	mpi_broadcast(&status, 1, host, grid.mpi_com.comm);
+	return (status == 1);
+#endif
+}
+
+template< typename T >
+bool nse::ptclVec3d< T >::write_binary(
+	const std::string& filename, const int idx,
+	const Grid3d< T >& grid, const T time) const
+{
+	return write_binary(append_index(filename, idx), grid, time);
+}
+// ----------------------------------------------------------------------------
+
+// MPI write binary
+// ----------------------------------------------------------------------------
+template< typename T >
+bool nse::ptclVec3d< T >::mpi_write_binary(
+	const std::string& filename,
+	const char* mpi_datarep, const Grid3d< T >& grid, const T time) const
+{
+	MPI_File ptr;
+	int status = MPI_File_open(grid.mpi_com.comm, (char*)filename.c_str(),
+		MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &ptr);
+	if (status != MPI_SUCCESS) return false;	// MPI file open failure
+
+	const int host = 0;
+	const int hsize = 4 * sizeof(int) + sizeof(T);
+	int nall = mpi_get_num(grid);
+
+	int nstatus = 0;
+	if (grid.mpi_com.rank == host)
+	{
+		int header[4] = {		// header
+			'p' + 'n' + 's' + 'e',	// binary id
+			3,						// number of dimensions
+			nall,					// number of particles
+			sizeof(T)				// data type size
+		};
+
+		T time_mark = time;
+
+		status = MPI_File_write(ptr, (void*)header, 4, MPI_INT, MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += 4;
+		status = MPI_File_write(ptr, &time_mark, 1, mpi_type< T >(), MPI_STATUS_IGNORE);
+		if (status == MPI_SUCCESS) nstatus += 1;
+	}
+	MPI_File_sync(ptr);
+	mpi_broadcast(&nstatus, 1, host, grid.mpi_com.comm);
+
+	// main data //
+	int np = n, incdisp;
+	MPI_Scan(&np, &incdisp, 1, MPI_INT, MPI_SUM, grid.mpi_com.comm);
+	MPI_Offset disp = hsize + 6 * (incdisp - n) * sizeof(T);
+
+	MPI_File_set_view(ptr, disp, mpi_type< T >(), mpi_type< T >(),
+		(char*)mpi_datarep, MPI_INFO_NULL);
+
+	T *buf;		// linear buffer
+	if (n > 0) allocate_vnull(&buf, 6 * n);
+	get_local(buf, 0, n);
+
+	status = MPI_File_write_all(ptr, (void*)buf, 6 * n,
+		mpi_type< T >(), MPI_STATUS_IGNORE);
+	if (status == MPI_SUCCESS) nstatus += 6 * n;
+
+	if (n > 0) deallocate(buf);
+
+	MPI_File_close(&ptr);
+	return (nstatus == 5 + 6 * n);
+}
+
+template< typename T >
+bool nse::ptclVec3d< T >::mpi_write_binary(
+	const std::string& filename, const int idx,
+	const char* mpi_datarep, const Grid3d< T >& grid, const T time) const
+{
+	return mpi_write_binary(append_index(filename, idx), mpi_datarep, grid, time);
+}
+// ----------------------------------------------------------------------------
+
+// read binary
+// ----------------------------------------------------------------------------
// read particles from a binary file written by write_binary():
//   the host validates the header, then reads the data in fixed-size
//   chunks, broadcasting each chunk so every rank can add() the
//   particles that fall into its local domain;
// returns true on all ranks iff the whole file was read successfully
template< typename T >
bool nse::ptclVec3d< T >::read_binary(
	const std::string& filename,
	const Grid3d< T >& grid)
{
	FILE* ptr;
	int nsize = -1, nstatus;	// nsize == -1 marks a header failure
	const int host = 0;

	// existing particles are discarded (memory is kept)
	clear();

	if (grid.mpi_com.rank == host) {	// checking header & number of particles ...
		ptr = fopen(filename.c_str(), "rb");
		if (ptr != NULL)
		{
			int header[4];
			nstatus = fread(header, sizeof(int), 4, ptr);
			if ((nstatus == 4) &&
				(header[0] == 'p' + 'n' + 's' + 'e') &&
				(header[1] == 3) &&
				(header[2] >= 0) &&
				(header[3] == sizeof(T)))
			{
				nsize = header[2];

				// skip the time mark (value not used on restart)
				T time_mark;
				nstatus = fread(&time_mark, sizeof(T), 1, ptr);
			}
			else
				fclose(ptr);	// invalid header: close here, nsize stays -1
		}
	}
	mpi_broadcast(&nsize, 1, host, grid.mpi_com.comm);
	if (nsize == -1) return false;

	// reading particles data ...
	//
	const int buf_size = 10 * 1024;		// particles per chunk

	T* buf;
	allocate_vnull(&buf, 6 * buf_size);

	bool status = true;
	int idx = 0, block_size;
	while (idx < nsize)
	{
		block_size = min(nsize - idx, buf_size);
		if (grid.mpi_com.rank == host)
			nstatus = fread(buf, sizeof(T), 6 * block_size, ptr);

		// share the host's read status; abort consistently on short read
		mpi_broadcast(&nstatus, 1, host, grid.mpi_com.comm);
		if (nstatus != 6 * block_size) {
			status = false;
			break;
		}

		mpi_broadcast(buf, 6 * block_size, host, grid.mpi_com.comm);

		// record layout per particle: [x, y, z, up, vp, wp];
		// add() is expected to keep only particles in the local domain
		for (int i = 0; i < block_size; i++) {
			T xpos = buf[6 * i];
			T ypos = buf[6 * i + 1];
			T zpos = buf[6 * i + 2];

			T uval = buf[6 * i + 3];
			T vval = buf[6 * i + 4];
			T wval = buf[6 * i + 5];

			add(xpos, ypos, zpos, uval, vval, wval, grid);
		}

		idx += block_size;
	}

	deallocate(buf);
	if (grid.mpi_com.rank == host) fclose(ptr);
	return status;
}
+
+template< typename T >
+bool nse::ptclVec3d< T >::read_binary(
+	const std::string& filename, const int idx,
+	const Grid3d< T >& grid)
+{
+	return read_binary(append_index(filename, idx), grid);
+}
+// ----------------------------------------------------------------------------
+
+// initialize: particle vector class
+template class nse::ptclVec3d< float >;
+template class nse::ptclVec3d< double >;
+// ------------------------------------------------------------------------ //
diff --git a/ptcl-vec3d.h b/ptcl-vec3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c3cd597fd1efbcdb45a9f770e668c340297e2b3
--- /dev/null
+++ b/ptcl-vec3d.h
@@ -0,0 +1,201 @@
+#pragma once
+
+// [ptcl-vec3d.h(cpp)]: 3D particles vector
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "grid3d.h"
+
+#include <string>
+
+#define MEASURE_PARTICLE3D_TIME
+
+// --- key to use external lib
+//#define USE_PTCL_EXTERNAL_LIB
+
+
namespace nse {

	// 3D Lagrangian particle vector, distributed over MPI processes:
	//   per-particle data (coordinates, velocities, grid indices) is stored
	//   in structure-of-arrays layout; transport is either passive or
	//   inertial (drag + gravity) -- see update() in the implementation
	template< typename T >
	class ptclVec3d {
	public:

		// add-init particle subroutines
		// ----------------------------------------------------------------------------
		virtual void add(const T x, const T y, const T z,
			const T u, const T v, const T w, const Grid3d< T >& grid);
		virtual void add(const T x, const T y, const T z,
			const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W, const Grid3d< T >& grid);
		
		void add_uniform(const int num, const long int seed,
			const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W, const Grid3d< T >& grid);
		void add_uniform(const int num, const long int seed,
			const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
			const T xmin, const T xmax,
			const T ymin, const T ymax, 
			const T zmin, const T zmax, const Grid3d< T >& grid);

		// add wall
		// ----------------------------------------------------------------------------
		void add_wall_plane(
			const T x1, const T y1, const T z1,
			const T x2, const T y2, const T z2,
			const T x3, const T y3, const T z3,
			const T x4, const T y4, const T z4);

		void add_mask(const int* _RESTRICT c_mask, const Grid3d< T >& grid);

		// set calls
		// ----------------------------------------------------------------------------
		void set_passive_transport(const bool mode);		// passive particle transport by default = true

		void set_particle_density(const T density);
		void set_particle_diameter(const T diameter);

		void set_fluid_density(const T density);
		void set_fluid_kinematic_viscosity(const T kinematic_viscosity);

		void set_gravity(const T gx, const T gy, const T gz);

		// get subroutines
		// ----------------------------------------------------------------------------
		// total particle count over all MPI processes (collective call)
		int mpi_get_num(const Grid3d< T >& grid) const;

		bool is_empty() const { return is_no_particles; }

		// get number concentration
		// ----------------------------------------------------------------------------
		// C[cell] = (number of particles in cell) / (cell volume)
		void get_number_concentration(T* _RESTRICT C, const Grid3d< T >& grid) const;

		// update particle subroutines
		// ----------------------------------------------------------------------------
		// advance all particles by dt: interpolate (U,V,W) to particle
		// positions, advect, relocate on grid and run MPI exchanges
		void update(const T* _RESTRICT const U, const T* _RESTRICT const V, const T* _RESTRICT const W,
			const int x_period, const int y_period, const int z_period,
			const Grid3d< T >& grid, const T dt);

		// clear-free subroutines
		// ----------------------------------------------------------------------------
		virtual void clear();	// remove particles but keep memory
		virtual void free();	// remove particle and memory

		// tecplot output
		// ----------------------------------------------------------------------------
		bool write_tecplot(const std::string& filename,
			const Grid3d< T >& grid, const T time) const;
		bool write_tecplot(const std::string& filename, const int idx,
			const Grid3d< T >& grid, const T time) const;

		// binary I/O
		// ----------------------------------------------------------------------------
		// the (filename, idx, ...) overloads append idx to the filename
		virtual bool write_binary(const std::string& filename,
			const Grid3d< T >& grid, const T time) const;
		virtual bool write_binary(const std::string& filename, const int idx,
			const Grid3d< T >& grid, const T time) const;

		virtual bool mpi_write_binary(const std::string& filename,
			const char* mpi_datarep, const Grid3d< T >& grid, const T time) const;
		virtual bool mpi_write_binary(const std::string& filename, const int idx,
			const char* mpi_datarep, const Grid3d< T >& grid, const T time) const;

		virtual bool read_binary(const std::string& filename,
			const Grid3d< T >& grid);
		virtual bool read_binary(const std::string& filename, const int idx,
			const Grid3d< T >& grid);


		// ----------------------------------------------------------------------------
		ptclVec3d();
		ptclVec3d(const ptclVec3d< T >& pvec);
		virtual ~ptclVec3d();

	public:

#ifdef MEASURE_PARTICLE3D_TIME
		// cumulative wall-clock timers per update() stage
		struct
		{
			double locate, interpolate;
			double update, mpi_exch;
		} time;
#endif

	public:	// setting public for scatter visualization

		static const int CFL_restriction = 1;	// grid cells advection upper bound

		int n;				// number of particles
		T *x, *y, *z;		// coordinates

	protected:

		T *u, *v, *w;		// working arrays for interpolated velocity
		T *up, *vp, *wp;	// particle velocity at (n - 1) time step

		int *ic, *jc, *kc;	// current grid index coordinates

		bool is_no_particles;	// no particles present - global flag

		int mem_size;						// allocated memory size
		static const int mem_alloc = 128;	// minimum memory allocation block

		int *exch_imem;			// working memory for mpi exchanges


		bool is_passive_transport;		// passive particles key

		T density, diameter;			// particle density & diameter

		T fluid_density;				// density of fluid
		T fluid_kinematic_viscosity;	// kinematic viscosity of fluid

		T gravity_x, gravity_y, gravity_z;		// gravity acceleration vector


	protected:

		// resize memory for requested number of elements
		// ----------------------------------------------------------------------------
		virtual void resize(const int n);

		// swap i,j particles pair
		// ----------------------------------------------------------------------------
		virtual void swap(const int i, const int j);

	private:

		// add(get) particle data to(from) linear memory
		//   record layout per particle: [x, y, z, up, vp, wp]
		// ----------------------------------------------------------------------------
		void add_local(const T* _RESTRICT const buf, const int nbuf);
		void get_local(T* _RESTRICT buf, const int idx, const int np) const;


	protected:

		// special call to check if point belongs to local domain segment
		//   =  0 : inside 
		//   = -1 : outside (p < domain)
		//   =  1 : outside (p > domain)
		// ----------------------------------------------------------------------------
		int is_inside_x(const T x, const Grid3d< T >& grid) const;
		int is_inside_y(const T y, const Grid3d< T >& grid) const;
		int is_inside_z(const T z, const Grid3d< T >& grid) const;

	private:

		// mpi-exchange particles sorting
		//   : moving particles marked for exchange to the back of the array
		//   : returns [number of particles for exchange]
		// ----------------------------------------------------------------------------
		int exch_sort(int* exch_mark);

		// mpi-exchanges
		// ----------------------------------------------------------------------------
		bool mpi_exchange_x(const int x_period, const Grid3d< T >& grid);
		bool mpi_exchange_y(const int y_period, const Grid3d< T >& grid);
		bool mpi_exchange_z(const int z_period, const Grid3d< T >& grid);


		// one-line neighbor exchange used by mpi_exchange_[x|y|z];
		// returns the number of particles received from neighbors
		virtual int mpi_exchange_line(const int* exch_imem, const int nexch,
			const int rank, const int size, const MPI_Comm comm, const int period);
	};
}
diff --git a/stats-data.cpp b/stats-data.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..51d5ecfa0ca3f6f7019646f23d0c048e85789d4c
--- /dev/null
+++ b/stats-data.cpp
@@ -0,0 +1,1095 @@
+#include "stats-data.h"
+
+#include "time-slice3d.h"
+
+using namespace nse::nse_const3d;
+
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
+// statsData holds only self-managing accumulator members, so construction and
+// destruction are trivial; actual sizing/reset happens in init().
+statsData::statsData() {}
+statsData::~statsData() {}
+// -------------------------------------------------------------------------------------------- //
+
+
+// Size (and reset) every statistics accumulator for the current grid:
+//   - z-profile accumulators use grid.nz points with psum_size partial-sum slots;
+//   - optional yz-plane fields (COMPUTE_XT_AVERAGES) use grid.nyz and psum_yz_size;
+//   - temperature statistics are compiled in only under STRATIFICATION.
+// NOTE(review): .set() is assumed to discard any previously accumulated data
+// (read_dump() uses it as the "start fresh" fallback) -- confirm against the
+// accumulator class.
+template< typename T >
+void statsData::init(const Grid3d< T >& grid)
+{
+	// first moments: mean velocity components and pressure
+	U.set(grid.nz, psum_size);
+	V.set(grid.nz, psum_size);
+	W.set(grid.nz, psum_size);
+	P.set(grid.nz, psum_size);
+
+	// velocity variances at native and staggered stencil positions
+	U2_u.set(grid.nz, psum_size);
+	V2_v.set(grid.nz, psum_size);
+	W2_w.set(grid.nz, psum_size);
+
+	U2_uw.set(grid.nz, psum_size);
+	V2_vw.set(grid.nz, psum_size);
+	W2_c.set(grid.nz, psum_size);
+
+	W2_u.set(grid.nz, psum_size);
+	W2_v.set(grid.nz, psum_size);
+	W2_uw.set(grid.nz, psum_size);
+	W2_vw.set(grid.nz, psum_size);
+	
+	// off-diagonal second moments (Reynolds stresses) at several locations
+	UV.set(grid.nz, psum_size);
+	UW.set(grid.nz, psum_size);
+	VW.set(grid.nz, psum_size);
+	UV_uvw.set(grid.nz, psum_size);
+	UW_uvw.set(grid.nz, psum_size);
+	VW_uvw.set(grid.nz, psum_size);
+
+	// pressure-velocity correlations
+	PU.set(grid.nz, psum_size);
+	PV.set(grid.nz, psum_size);
+	PW.set(grid.nz, psum_size);
+
+	// partitioned momentum fluxes (bottom/top contributions, several stencils)
+	UW_bottom.set(grid.nz, psum_size);
+	UW_top.set(grid.nz, psum_size);
+	VW_bottom.set(grid.nz, psum_size);
+	VW_top.set(grid.nz, psum_size);
+	UW_bottom_uv.set(grid.nz, psum_size);
+	UW_top_uv.set(grid.nz, psum_size);
+	VW_bottom_uv.set(grid.nz, psum_size);
+	VW_top_uv.set(grid.nz, psum_size);
+	UW_bottom_uw.set(grid.nz, psum_size);
+	UW_top_uw.set(grid.nz, psum_size);
+	VW_bottom_vw.set(grid.nz, psum_size);
+	VW_top_vw.set(grid.nz, psum_size);
+
+	// advective flux terms
+	UW_adv.set(grid.nz, psum_size);
+	VW_adv.set(grid.nz, psum_size);
+	
+	// third-order moments (turbulent transport)
+	U2W.set(grid.nz, psum_size);
+	V2W.set(grid.nz, psum_size);
+	W2W.set(grid.nz, psum_size);
+
+	UVW.set(grid.nz, psum_size);
+	UWW.set(grid.nz, psum_size);
+	VWW.set(grid.nz, psum_size);
+	
+	// dissipation terms (component-wise, cross, and isotropic variants)
+	U_diss.set(grid.nz, psum_size);
+	V_diss.set(grid.nz, psum_size);
+	W_diss.set(grid.nz, psum_size);
+
+	UV_diss.set(grid.nz, psum_size);
+	UW_diss.set(grid.nz, psum_size);
+	VW_diss.set(grid.nz, psum_size);
+
+	U_iso_diss.set(grid.nz, psum_size);
+	V_iso_diss.set(grid.nz, psum_size);
+	W_iso_diss.set(grid.nz, psum_size);
+
+	UV_iso_diss.set(grid.nz, psum_size);
+	UW_iso_diss.set(grid.nz, psum_size);
+	VW_iso_diss.set(grid.nz, psum_size);
+
+	// pressure-strain correlations
+	PSuu.set(grid.nz, psum_size);
+	PSvv.set(grid.nz, psum_size);
+	PSww.set(grid.nz, psum_size);
+	P2Suv.set(grid.nz, psum_size);
+	P2Suw.set(grid.nz, psum_size);
+	P2Svw.set(grid.nz, psum_size);
+
+#ifdef COMPUTE_XT_AVERAGES
+	// yz-plane (x,t-averaged) statistics
+	Uyz.set(grid.nyz, psum_yz_size);
+	Vyz.set(grid.nyz, psum_yz_size);
+	Wyz.set(grid.nyz, psum_yz_size);
+
+	U2yz.set(grid.nyz, psum_yz_size);
+	V2yz.set(grid.nyz, psum_yz_size);
+	W2yz.set(grid.nyz, psum_yz_size);
+
+	UVyz.set(grid.nyz, psum_yz_size);
+	UWyz.set(grid.nyz, psum_yz_size);
+	VWyz.set(grid.nyz, psum_yz_size);
+#endif
+
+
+#ifdef STRATIFICATION
+	// temperature statistics (means, variances, fluxes, budgets)
+	Tc.set(grid.nz, psum_size);
+	Tsh.set(grid.nz, psum_size);
+
+	T2_c.set(grid.nz, psum_size);
+	T2_w.set(grid.nz, psum_size);
+
+	TU.set(grid.nz, psum_size);
+	TV.set(grid.nz, psum_size);
+	TW.set(grid.nz, psum_size);
+	
+	TU_uw.set(grid.nz, psum_size);
+	TV_vw.set(grid.nz, psum_size);
+
+	TW_uw.set(grid.nz, psum_size);
+	TW_vw.set(grid.nz, psum_size);
+	
+	TP.set(grid.nz, psum_size);
+
+	TW_bottom.set(grid.nz, psum_size);
+	TW_top.set(grid.nz, psum_size);
+	TW_bottom_u.set(grid.nz, psum_size);
+	TW_top_u.set(grid.nz, psum_size);
+	TW_bottom_v.set(grid.nz, psum_size);
+	TW_top_v.set(grid.nz, psum_size);
+	TW_bottom_w.set(grid.nz, psum_size);
+	TW_top_w.set(grid.nz, psum_size);
+
+	TW_adv.set(grid.nz, psum_size);
+
+	T2W.set(grid.nz, psum_size);
+	TUW.set(grid.nz, psum_size);
+	TVW.set(grid.nz, psum_size);
+	TWW.set(grid.nz, psum_size);
+
+	T_diss.set(grid.nz, psum_size);
+	TU_diss.set(grid.nz, psum_size);
+	TV_diss.set(grid.nz, psum_size);
+	TW_diss.set(grid.nz, psum_size);
+
+	T_iso_diss.set(grid.nz, psum_size);
+
+	T_dPdx.set(grid.nz, psum_size);
+	T_dPdy.set(grid.nz, psum_size);
+	T_dPdz.set(grid.nz, psum_size);
+
+#ifdef COMPUTE_XT_AVERAGES
+	Tyz.set(grid.nyz, psum_yz_size);
+	T2yz.set(grid.nyz, psum_yz_size);
+	TWyz.set(grid.nyz, psum_yz_size);
+#endif
+#endif
+}
+// ------------------------------------------------------------------------------------------------ //
+
+
+// Extract the time-average of every accumulator into the matching field of
+// avg (avg is the output; each accumulator's average() fills one field).
+// The member list mirrors init(); #ifdef sections must stay in sync with it.
+template< typename T >
+void statsData::get_averages(nseAvgVec< T >& avg) const
+{
+	U.average(avg.U); V.average(avg.V); W.average(avg.W);
+	P.average(avg.P);
+
+	U2_u.average(avg.U2_u); V2_v.average(avg.V2_v); W2_w.average(avg.W2_w);
+	U2_uw.average(avg.U2_uw); V2_vw.average(avg.V2_vw); W2_c.average(avg.W2_c);
+
+	W2_u.average(avg.W2_u); W2_v.average(avg.W2_v);
+	W2_uw.average(avg.W2_uw); W2_vw.average(avg.W2_vw);
+
+	UV.average(avg.UV); UW.average(avg.UW); VW.average(avg.VW);
+	UV_uvw.average(avg.UV_uvw);
+	UW_uvw.average(avg.UW_uvw);
+	VW_uvw.average(avg.VW_uvw);
+	PU.average(avg.PU); PV.average(avg.PV); PW.average(avg.PW);
+
+	UW_bottom.average(avg.UW_bottom); UW_top.average(avg.UW_top);
+	VW_bottom.average(avg.VW_bottom); VW_top.average(avg.VW_top);
+	UW_bottom_uv.average(avg.UW_bottom_uv); UW_top_uv.average(avg.UW_top_uv);
+	VW_bottom_uv.average(avg.VW_bottom_uv); VW_top_uv.average(avg.VW_top_uv);
+	UW_bottom_uw.average(avg.UW_bottom_uw); UW_top_uw.average(avg.UW_top_uw);
+	VW_bottom_vw.average(avg.VW_bottom_vw); VW_top_vw.average(avg.VW_top_vw);
+	UW_adv.average(avg.UW_adv); VW_adv.average(avg.VW_adv);
+	
+	U2W.average(avg.U2W); V2W.average(avg.V2W); W2W.average(avg.W2W);
+	UVW.average(avg.UVW); UWW.average(avg.UWW); VWW.average(avg.VWW);
+
+	U_diss.average(avg.U_diss); 
+	V_diss.average(avg.V_diss);
+	W_diss.average(avg.W_diss);
+	
+	UV_diss.average(avg.UV_diss);
+	UW_diss.average(avg.UW_diss);
+	VW_diss.average(avg.VW_diss);
+
+	U_iso_diss.average(avg.U_iso_diss);
+	V_iso_diss.average(avg.V_iso_diss);
+	W_iso_diss.average(avg.W_iso_diss);
+
+	UV_iso_diss.average(avg.UV_iso_diss);
+	UW_iso_diss.average(avg.UW_iso_diss);
+	VW_iso_diss.average(avg.VW_iso_diss);
+
+	PSuu.average(avg.PSuu);
+	PSvv.average(avg.PSvv);
+	PSww.average(avg.PSww);
+
+	P2Suv.average(avg.P2Suv);
+	P2Suw.average(avg.P2Suw);
+	P2Svw.average(avg.P2Svw);
+
+
+#ifdef STRATIFICATION
+	// temperature statistics (compiled in only with stratification)
+	Tc.average(avg.Tc);
+	Tsh.average(avg.Tsh);
+
+	T2_c.average(avg.T2_c);
+	T2_w.average(avg.T2_w);
+
+	TU.average(avg.TU); TV.average(avg.TV); TW.average(avg.TW);
+	TU_uw.average(avg.TU_uw);
+	TV_vw.average(avg.TV_vw);
+	TW_uw.average(avg.TW_uw);
+	TW_vw.average(avg.TW_vw);
+
+	TP.average(avg.TP);
+
+	TW_bottom.average(avg.TW_bottom); TW_top.average(avg.TW_top);
+	TW_bottom_u.average(avg.TW_bottom_u); TW_top_u.average(avg.TW_top_u);
+	TW_bottom_v.average(avg.TW_bottom_v); TW_top_v.average(avg.TW_top_v);
+	TW_bottom_w.average(avg.TW_bottom_w); TW_top_w.average(avg.TW_top_w);
+	TW_adv.average(avg.TW_adv);
+
+	T2W.average(avg.T2W);
+	TUW.average(avg.TUW); TVW.average(avg.TVW); TWW.average(avg.TWW);
+	
+	T_diss.average(avg.T_diss);
+	TU_diss.average(avg.TU_diss); 
+	TV_diss.average(avg.TV_diss);
+	TW_diss.average(avg.TW_diss);
+	
+	T_iso_diss.average(avg.T_iso_diss);
+
+	T_dPdx.average(avg.T_dPdx);
+	T_dPdy.average(avg.T_dPdy);
+	T_dPdz.average(avg.T_dPdz);
+#endif
+
+#ifdef COMPUTE_XT_AVERAGES
+	// yz-plane (x,t-averaged) statistics
+	Uyz.average(avg.Uyz); Vyz.average(avg.Vyz); Wyz.average(avg.Wyz);
+	U2yz.average(avg.U2yz); V2yz.average(avg.V2yz); W2yz.average(avg.W2yz);
+	UVyz.average(avg.UVyz); UWyz.average(avg.UWyz); VWyz.average(avg.VWyz);
+
+#ifdef STRATIFICATION
+	Tyz.average(avg.Tyz);
+	T2yz.average(avg.T2yz);
+	TWyz.average(avg.TWyz);
+#endif
+#endif
+}
+// ------------------------------------------------------------------------------------------------ //
+
+// Windowed variant of get_averages(): extracts averages restricted to the
+// [begin_mark, end_mark] range of the accumulated series (marks are forwarded
+// to each accumulator's average() overload). Member list mirrors init().
+template< typename T >
+void statsData::get_averages(nseAvgVec<T>& avg,
+	const T begin_mark, const T end_mark) const
+{
+	U.average(avg.U, begin_mark, end_mark);
+	V.average(avg.V, begin_mark, end_mark);
+	W.average(avg.W, begin_mark, end_mark);
+	P.average(avg.P, begin_mark, end_mark);
+
+	U2_u.average(avg.U2_u, begin_mark, end_mark);
+	V2_v.average(avg.V2_v, begin_mark, end_mark);
+	W2_w.average(avg.W2_w, begin_mark, end_mark);
+	U2_uw.average(avg.U2_uw, begin_mark, end_mark);
+	V2_vw.average(avg.V2_vw, begin_mark, end_mark);
+	W2_c.average(avg.W2_c, begin_mark, end_mark);
+
+	W2_u.average(avg.W2_u, begin_mark, end_mark);
+	W2_v.average(avg.W2_v, begin_mark, end_mark);
+	W2_uw.average(avg.W2_uw, begin_mark, end_mark);
+	W2_vw.average(avg.W2_vw, begin_mark, end_mark);
+
+	UV.average(avg.UV, begin_mark, end_mark);
+	UW.average(avg.UW, begin_mark, end_mark);
+	VW.average(avg.VW, begin_mark, end_mark);
+	UV_uvw.average(avg.UV_uvw, begin_mark, end_mark);
+	UW_uvw.average(avg.UW_uvw, begin_mark, end_mark);
+	VW_uvw.average(avg.VW_uvw, begin_mark, end_mark);
+
+	PU.average(avg.PU, begin_mark, end_mark);
+	PV.average(avg.PV, begin_mark, end_mark);
+	PW.average(avg.PW, begin_mark, end_mark);
+
+	UW_bottom.average(avg.UW_bottom, begin_mark, end_mark);
+	UW_top.average(avg.UW_top, begin_mark, end_mark);
+	VW_bottom.average(avg.VW_bottom, begin_mark, end_mark);
+	VW_top.average(avg.VW_top, begin_mark, end_mark);
+	UW_bottom_uv.average(avg.UW_bottom_uv, begin_mark, end_mark);
+	UW_top_uv.average(avg.UW_top_uv, begin_mark, end_mark);
+	VW_bottom_uv.average(avg.VW_bottom_uv, begin_mark, end_mark);
+	VW_top_uv.average(avg.VW_top_uv, begin_mark, end_mark);
+	UW_bottom_uw.average(avg.UW_bottom_uw, begin_mark, end_mark);
+	UW_top_uw.average(avg.UW_top_uw, begin_mark, end_mark);
+	VW_bottom_vw.average(avg.VW_bottom_vw, begin_mark, end_mark);
+	VW_top_vw.average(avg.VW_top_vw, begin_mark, end_mark);
+	UW_adv.average(avg.UW_adv, begin_mark, end_mark);
+	VW_adv.average(avg.VW_adv, begin_mark, end_mark);
+	
+	U2W.average(avg.U2W, begin_mark, end_mark);
+	V2W.average(avg.V2W, begin_mark, end_mark);
+	W2W.average(avg.W2W, begin_mark, end_mark);
+
+	UVW.average(avg.UVW, begin_mark, end_mark);
+	UWW.average(avg.UWW, begin_mark, end_mark);
+	VWW.average(avg.VWW, begin_mark, end_mark);
+
+	U_diss.average(avg.U_diss, begin_mark, end_mark);
+	V_diss.average(avg.V_diss, begin_mark, end_mark);
+	W_diss.average(avg.W_diss, begin_mark, end_mark);
+
+	UV_diss.average(avg.UV_diss, begin_mark, end_mark);
+	UW_diss.average(avg.UW_diss, begin_mark, end_mark);
+	VW_diss.average(avg.VW_diss, begin_mark, end_mark);
+	
+	U_iso_diss.average(avg.U_iso_diss, begin_mark, end_mark);
+	V_iso_diss.average(avg.V_iso_diss, begin_mark, end_mark);
+	W_iso_diss.average(avg.W_iso_diss, begin_mark, end_mark);
+
+	UV_iso_diss.average(avg.UV_iso_diss, begin_mark, end_mark);
+	UW_iso_diss.average(avg.UW_iso_diss, begin_mark, end_mark);
+	VW_iso_diss.average(avg.VW_iso_diss, begin_mark, end_mark);
+
+	PSuu.average(avg.PSuu, begin_mark, end_mark);
+	PSvv.average(avg.PSvv, begin_mark, end_mark);
+	PSww.average(avg.PSww, begin_mark, end_mark);
+
+	P2Suv.average(avg.P2Suv, begin_mark, end_mark);
+	P2Suw.average(avg.P2Suw, begin_mark, end_mark);
+	P2Svw.average(avg.P2Svw, begin_mark, end_mark);
+
+
+#ifdef STRATIFICATION
+	// temperature statistics (compiled in only with stratification)
+	Tc.average(avg.Tc, begin_mark, end_mark);
+	Tsh.average(avg.Tsh, begin_mark, end_mark);
+
+	T2_c.average(avg.T2_c, begin_mark, end_mark);
+	T2_w.average(avg.T2_w, begin_mark, end_mark);
+
+	TU.average(avg.TU, begin_mark, end_mark);
+	TV.average(avg.TV, begin_mark, end_mark);
+	TW.average(avg.TW, begin_mark, end_mark);
+
+	TU_uw.average(avg.TU_uw, begin_mark, end_mark);
+	TV_vw.average(avg.TV_vw, begin_mark, end_mark);
+
+	TW_uw.average(avg.TW_uw, begin_mark, end_mark);
+	TW_vw.average(avg.TW_vw, begin_mark, end_mark);
+
+	TP.average(avg.TP, begin_mark, end_mark);
+
+	TW_bottom.average(avg.TW_bottom, begin_mark, end_mark);
+	TW_top.average(avg.TW_top, begin_mark, end_mark);
+	TW_bottom_u.average(avg.TW_bottom_u, begin_mark, end_mark);
+	TW_top_u.average(avg.TW_top_u, begin_mark, end_mark);
+	TW_bottom_v.average(avg.TW_bottom_v, begin_mark, end_mark);
+	TW_top_v.average(avg.TW_top_v, begin_mark, end_mark);
+	TW_bottom_w.average(avg.TW_bottom_w, begin_mark, end_mark);
+	TW_top_w.average(avg.TW_top_w, begin_mark, end_mark);
+
+	TW_adv.average(avg.TW_adv, begin_mark, end_mark);
+
+	T2W.average(avg.T2W, begin_mark, end_mark);
+	TUW.average(avg.TUW, begin_mark, end_mark);
+	TVW.average(avg.TVW, begin_mark, end_mark);
+	TWW.average(avg.TWW, begin_mark, end_mark);
+
+	T_diss.average(avg.T_diss, begin_mark, end_mark);
+	TU_diss.average(avg.TU_diss, begin_mark, end_mark);
+	TV_diss.average(avg.TV_diss, begin_mark, end_mark);
+	TW_diss.average(avg.TW_diss, begin_mark, end_mark);
+
+	T_iso_diss.average(avg.T_iso_diss, begin_mark, end_mark);
+
+	T_dPdx.average(avg.T_dPdx, begin_mark, end_mark);
+	T_dPdy.average(avg.T_dPdy, begin_mark, end_mark);
+	T_dPdz.average(avg.T_dPdz, begin_mark, end_mark);
+#endif
+
+#ifdef COMPUTE_XT_AVERAGES
+	// yz-plane (x,t-averaged) statistics
+	Uyz.average(avg.Uyz, begin_mark, end_mark);
+	Vyz.average(avg.Vyz, begin_mark, end_mark);
+	Wyz.average(avg.Wyz, begin_mark, end_mark);
+	U2yz.average(avg.U2yz, begin_mark, end_mark);
+	V2yz.average(avg.V2yz, begin_mark, end_mark);
+	W2yz.average(avg.W2yz, begin_mark, end_mark);
+	UVyz.average(avg.UVyz, begin_mark, end_mark);
+	UWyz.average(avg.UWyz, begin_mark, end_mark);
+	VWyz.average(avg.VWyz, begin_mark, end_mark);
+
+#ifdef STRATIFICATION
+	Tyz.average(avg.Tyz, begin_mark, end_mark);
+	T2yz.average(avg.T2yz, begin_mark, end_mark);
+	TWyz.average(avg.TWyz, begin_mark, end_mark);
+#endif
+#endif
+}
+// ------------------------------------------------------------------------------------------------ //
+
+
+// Restore all statistics accumulators from dump files with the given index.
+// Each failed read_binary() call resets only the affected accumulators to a
+// fresh (empty) state via .set() and clears the status flag, so a partial set
+// of dump files is tolerated; a single warning is printed on rank 0 if any
+// file could not be read. Grouping of fields per file mirrors write_dump().
+template< typename T >
+void statsData::read_dump(const int index, const Grid3d< T >& grid)
+{
+	// false if at least one dump file was missing/unreadable
+	bool status = true;
+
+	if (!read_binary(dump.VELOCITY_PZ_FILE, index, U, V, W, axisZ, grid))
+	{
+		status = false;
+		U.set(grid.nz, psum_size);
+		V.set(grid.nz, psum_size);
+		W.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.PRESSURE_PZ_FILE, index, P, axisZ, grid))
+	{
+		status = false;
+		P.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.UIUI_AT_U_V_C_PZ_FILE, index, U2_u, V2_v, W2_c, axisZ, grid))
+	{
+		status = false;
+		U2_u.set(grid.nz, psum_size);
+		V2_v.set(grid.nz, psum_size);
+		W2_c.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.UIUI_AT_UW_VW_W_PZ_FILE, index, U2_uw, V2_vw, W2_w, axisZ, grid))
+	{
+		status = false;
+		U2_uw.set(grid.nz, psum_size);
+		V2_vw.set(grid.nz, psum_size);
+		W2_w.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.W2_AT_U_UW_PZ_FILE, index, W2_u, W2_uw, axisZ, grid))
+	{
+		status = false;
+		W2_u.set(grid.nz, psum_size);
+		W2_uw.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.W2_AT_V_VW_PZ_FILE, index, W2_v, W2_vw, axisZ, grid))
+	{
+		status = false;
+		W2_v.set(grid.nz, psum_size);
+		W2_vw.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.UIUJ_PZ_FILE, index, UV, UW, VW, axisZ, grid))
+	{
+		status = false;
+		UV.set(grid.nz, psum_size);
+		UW.set(grid.nz, psum_size);
+		VW.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.UIUJ_AT_UVW_PZ_FILE, index, UV_uvw, UW_uvw, VW_uvw, axisZ, grid))
+	{
+		status = false;
+		UV_uvw.set(grid.nz, psum_size);
+		UW_uvw.set(grid.nz, psum_size);
+		VW_uvw.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.PUI_PZ_FILE, index, PU, PV, PW, axisZ, grid))
+	{
+		status = false;
+		PU.set(grid.nz, psum_size);
+		PV.set(grid.nz, psum_size);
+		PW.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.UIW_PARTITION_PZ_FILE, index, 
+		UW_bottom, UW_top, VW_bottom, VW_top, axisZ, grid))
+	{
+		status = false;
+		UW_bottom.set(grid.nz, psum_size); UW_top.set(grid.nz, psum_size);
+		VW_bottom.set(grid.nz, psum_size); VW_top.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.UIW_PARTITION_AT_UV_PZ_FILE, index, 
+		UW_bottom_uv, UW_top_uv, VW_bottom_uv, VW_top_uv, axisZ, grid))
+	{
+		status = false;
+		UW_bottom_uv.set(grid.nz, psum_size); UW_top_uv.set(grid.nz, psum_size);
+		VW_bottom_uv.set(grid.nz, psum_size); VW_top_uv.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.UIW_PARTITION_AT_UIW_PZ_FILE, index,
+		UW_bottom_uw, UW_top_uw, VW_bottom_vw, VW_top_vw, axisZ, grid))
+	{
+		status = false;
+		UW_bottom_uw.set(grid.nz, psum_size); UW_top_uw.set(grid.nz, psum_size);
+		VW_bottom_vw.set(grid.nz, psum_size); VW_top_vw.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.UIW_ADV_PZ_FILE, index, UW_adv, VW_adv, axisZ, grid))
+	{
+		status = false;
+		UW_adv.set(grid.nz, psum_size); VW_adv.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.UIUIW_PZ_FILE, index, U2W, V2W, W2W, axisZ, grid))
+	{
+		status = false;
+		U2W.set(grid.nz, psum_size);
+		V2W.set(grid.nz, psum_size);
+		W2W.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.UIUJW_PZ_FILE, index, UVW, UWW, VWW, axisZ, grid))
+	{
+		status = false;
+		UVW.set(grid.nz, psum_size);
+		UWW.set(grid.nz, psum_size);
+		VWW.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.UIUI_DISSIPATION_PZ_FILE, index,
+		U_diss, V_diss, W_diss, axisZ, grid))
+	{
+		status = false;
+		U_diss.set(grid.nz, psum_size);
+		V_diss.set(grid.nz, psum_size);
+		W_diss.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.UIUJ_DISSIPATION_PZ_FILE, index, 
+		UV_diss, UW_diss, VW_diss, axisZ, grid))
+	{
+		status = false;
+		UV_diss.set(grid.nz, psum_size);
+		UW_diss.set(grid.nz, psum_size);
+		VW_diss.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.UIUI_ISO_DISSIPATION_PZ_FILE, index,
+		U_iso_diss, V_iso_diss, W_iso_diss, axisZ, grid))
+	{
+		status = false;
+		U_iso_diss.set(grid.nz, psum_size);
+		V_iso_diss.set(grid.nz, psum_size);
+		W_iso_diss.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.UIUJ_ISO_DISSIPATION_PZ_FILE, index, 
+		UV_iso_diss, UW_iso_diss, VW_iso_diss, axisZ, grid))
+	{
+		status = false;
+		UV_iso_diss.set(grid.nz, psum_size);
+		UW_iso_diss.set(grid.nz, psum_size);
+		VW_iso_diss.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.P_STRAIN_II_PZ_FILE, index, PSuu, PSvv, PSww, axisZ, grid))
+	{
+		status = false;
+		PSuu.set(grid.nz, psum_size);
+		PSvv.set(grid.nz, psum_size);
+		PSww.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.P_2STRAIN_IJ_PZ_FILE, index, P2Suv, P2Suw, P2Svw, axisZ, grid))
+	{
+		status = false;
+		P2Suv.set(grid.nz, psum_size);
+		P2Suw.set(grid.nz, psum_size);
+		P2Svw.set(grid.nz, psum_size);
+	}
+
+#ifdef COMPUTE_XT_AVERAGES
+	// yz-plane (x,t-averaged) statistics
+	if (!read_binary(dump.VELOCITY_PYZ_FILE, index, Uyz, Vyz, Wyz, axisYZ, grid))
+	{
+		status = false;
+		Uyz.set(grid.nyz, psum_yz_size);
+		Vyz.set(grid.nyz, psum_yz_size);
+		Wyz.set(grid.nyz, psum_yz_size);
+	}
+
+	if (!read_binary(dump.UIUI_PYZ_FILE, index, U2yz, V2yz, W2yz, axisYZ, grid))
+	{
+		status = false;
+		U2yz.set(grid.nyz, psum_yz_size);
+		V2yz.set(grid.nyz, psum_yz_size);
+		W2yz.set(grid.nyz, psum_yz_size);
+	}
+	if (!read_binary(dump.UIUJ_PYZ_FILE, index, UVyz, UWyz, VWyz, axisYZ, grid))
+	{
+		status = false;
+		UVyz.set(grid.nyz, psum_yz_size);
+		UWyz.set(grid.nyz, psum_yz_size);
+		VWyz.set(grid.nyz, psum_yz_size);
+	}
+#endif
+
+
+#ifdef STRATIFICATION
+	// temperature statistics (compiled in only with stratification)
+	if (!read_binary(dump.TEMPERATURE_PZ_FILE, index, Tc, axisZ, grid))
+	{
+		status = false;
+		Tc.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.TEMPERATURE_SHIFT_PZ_FILE, index, Tsh, axisZ, grid))
+	{
+		status = false;
+		Tsh.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.T2_AT_C_W_PZ_FILE, index, T2_c, T2_w, axisZ, grid))
+	{
+		status = false;
+		T2_c.set(grid.nz, psum_size);
+		T2_w.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.TUI_PZ_FILE, index, TU, TV, TW, axisZ, grid))
+	{
+		status = false;
+		TU.set(grid.nz, psum_size);
+		TV.set(grid.nz, psum_size);
+		TW.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.TU_TW_AT_UW_PZ_FILE, index, TU_uw, TW_uw, axisZ, grid))
+	{
+		status = false;
+		TU_uw.set(grid.nz, psum_size);
+		TW_uw.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.TV_TW_AT_VW_PZ_FILE, index, TV_vw, TW_vw, axisZ, grid))
+	{
+		status = false;
+		TV_vw.set(grid.nz, psum_size);
+		TW_vw.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.TP_PZ_FILE, index, TP, axisZ, grid))
+	{
+		status = false;
+		TP.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.TW_PARTITION_PZ_FILE, index, 
+		TW_bottom, TW_top, axisZ, grid))
+	{
+		status = false;
+		TW_bottom.set(grid.nz, psum_size); TW_top.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.TW_PARTITION_AT_U_PZ_FILE, index,
+		TW_bottom_u, TW_top_u, axisZ, grid))
+	{
+		status = false;
+		TW_bottom_u.set(grid.nz, psum_size); TW_top_u.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.TW_PARTITION_AT_V_PZ_FILE, index,
+		TW_bottom_v, TW_top_v, axisZ, grid))
+	{
+		status = false;
+		TW_bottom_v.set(grid.nz, psum_size); TW_top_v.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.TW_PARTITION_AT_W_PZ_FILE, index,
+		TW_bottom_w, TW_top_w, axisZ, grid))
+	{
+		status = false;
+		TW_bottom_w.set(grid.nz, psum_size); TW_top_w.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.TW_ADV_PZ_FILE, index, TW_adv, axisZ, grid))
+	{
+		status = false;
+		TW_adv.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.T2W_PZ_FILE, index, T2W, axisZ, grid))
+	{
+		status = false;
+		T2W.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.TUIW_PZ_FILE, index, TUW, TVW, TWW, axisZ, grid))
+	{
+		status = false;
+		TUW.set(grid.nz, psum_size);
+		TVW.set(grid.nz, psum_size);
+		TWW.set(grid.nz, psum_size);
+	}
+	
+	if (!read_binary(dump.T_DISSIPATION_PZ_FILE, index, T_diss, axisZ, grid))
+	{
+		status = false;
+		T_diss.set(grid.nz, psum_size);
+	}
+	if (!read_binary(dump.TUI_DISSIPATION_PZ_FILE, index, 
+		TU_diss, TV_diss, TW_diss, axisZ, grid))
+	{
+		status = false;
+		TU_diss.set(grid.nz, psum_size);
+		TV_diss.set(grid.nz, psum_size);
+		TW_diss.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.T_ISO_DISSIPATION_PZ_FILE, index, T_iso_diss, axisZ, grid))
+	{
+		status = false;
+		T_iso_diss.set(grid.nz, psum_size);
+	}
+
+	if (!read_binary(dump.T_GRADP_PZ_FILE, index, T_dPdx, T_dPdy, T_dPdz, axisZ, grid))
+	{
+		status = false;
+		T_dPdx.set(grid.nz, psum_size);
+		T_dPdy.set(grid.nz, psum_size);
+		T_dPdz.set(grid.nz, psum_size);
+	}
+
+#ifdef COMPUTE_XT_AVERAGES
+	if (!read_binary(dump.TEMPERATURE_PYZ_FILE, index, Tyz, axisYZ, grid))
+	{
+		status = false;
+		Tyz.set(grid.nyz, psum_yz_size);
+	}
+
+	if (!read_binary(dump.T2_PYZ_FILE, index, T2yz, axisYZ, grid))
+	{
+		status = false;
+		T2yz.set(grid.nyz, psum_yz_size);
+	}
+	if (!read_binary(dump.TW_PYZ_FILE, index, TWyz, axisYZ, grid))
+	{
+		status = false;
+		TWyz.set(grid.nyz, psum_yz_size);
+	}
+#endif
+#endif
+
+	// single aggregated warning on the master rank only
+	if ((!status) && (grid.mpi_com.rank == 0))
+	{
+		printf("\n >> WARNING! >> ** failed to read dump files for some statistics **\n");
+		printf("\t >> -- setting new statistics data\n\n");
+	}
+}
+// ------------------------------------------------------------------------------------------------ //
+
+template< typename T >
+void statsData::write_dump(const int index,
+	const T current_time, const Grid3d< T >& grid) const
+{
+#ifndef RESTRICT_STATS_DUMP
+	bool status;
+
+	status = write_binary(dump.VELOCITY_PZ_FILE, index,
+		U, V, W, "U", "V", "W", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.VELOCITY_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.PRESSURE_PZ_FILE, index,
+		P, "P", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.PRESSURE_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.UIUI_AT_U_V_C_PZ_FILE, index,
+		U2_u, V2_v, W2_c, "U^2 [u]", "V^2 [v]", "W^2 [c]", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIUI_AT_U_V_C_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.UIUI_AT_UW_VW_W_PZ_FILE, index,
+		U2_uw, V2_vw, W2_w, "U^2 [uw]", "V^2 [vw]", "W^2 [w]", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIUI_AT_UW_VW_W_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.W2_AT_U_UW_PZ_FILE, index,
+		W2_u, W2_uw, "W^2 [u]", "W^2 [uw]", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.W2_AT_U_UW_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.W2_AT_V_VW_PZ_FILE, index,
+		W2_v, W2_vw, "W^2 [v]", "W^2 [vw]", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.W2_AT_V_VW_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.UIUJ_PZ_FILE, index,
+		UV, UW, VW, "UV", "UW", "VW", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIUJ_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.UIUJ_AT_UVW_PZ_FILE, index,
+		UV_uvw, UW_uvw, VW_uvw, "UV [uvw]", "UW [uvw]", "VW [uvw]", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIUJ_AT_UVW_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.PUI_PZ_FILE, index,
+		PU, PV, PW, "PU", "PV", "PW", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.PUI_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.UIW_PARTITION_PZ_FILE, index, 
+		UW_bottom, UW_top, VW_bottom, VW_top, 
+		"UW-bottom", "UW-top", "VW-bottom", "VW-top", 
+		axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIW_PARTITION_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.UIW_PARTITION_AT_UV_PZ_FILE, index,
+		UW_bottom_uv, UW_top_uv, VW_bottom_uv, VW_top_uv, 
+		"UW-bottom [uv]", "UW-top [uv]", "VW-bottom [uv]", "VW-top [uv]", 
+		axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIW_PARTITION_AT_UV_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.UIW_PARTITION_AT_UIW_PZ_FILE, index,
+		UW_bottom_uw, UW_top_uw, VW_bottom_vw, VW_top_vw,
+		"UW-bottom [uw]", "UW-top [uw]", "VW-bottom [vw]", "VW-top [vw]",
+		axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIW_PARTITION_AT_UIW_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.UIW_ADV_PZ_FILE, index,
+		UW_adv, VW_adv, "UW-adv", "VW-adv", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIW_ADV_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.UIUIW_PZ_FILE, index,
+		U2W, V2W, W2W, "U^2W", "V^2W", "W^2W", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIUIW_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.UIUJW_PZ_FILE, index,
+		UVW, UWW, VWW, "UVW", "UWW", "VWW", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIUJW_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.UIUI_DISSIPATION_PZ_FILE, index,
+		U_diss, V_diss, W_diss, 
+		"U-diss", "V-diss", "W-diss", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIUI_DISSIPATION_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.UIUJ_DISSIPATION_PZ_FILE, index,
+		UV_diss, UW_diss, VW_diss, 
+		"UV-diss", "UW-diss", "VW-diss", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIUJ_DISSIPATION_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.UIUI_ISO_DISSIPATION_PZ_FILE, index,
+		U_iso_diss, V_iso_diss, W_iso_diss,
+		"U-iso-diss", "V-iso-diss", "W-iso-diss", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIUI_ISO_DISSIPATION_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.UIUJ_ISO_DISSIPATION_PZ_FILE, index,
+		UV_iso_diss, UW_iso_diss, VW_iso_diss, 
+		"UV-iso-diss", "UW-iso-diss", "VW-iso-diss", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIUJ_ISO_DISSIPATION_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.P_STRAIN_II_PZ_FILE, index,
+		PSuu, PSvv, PSww, "PSuu", "PSvv", "PSww", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.P_STRAIN_II_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.P_2STRAIN_IJ_PZ_FILE, index,
+		P2Suv, P2Suw, P2Svw, "P2Suv", "P2Suw", "P2Svw", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.P_2STRAIN_IJ_PZ_FILE.c_str());
+	}
+
+#ifdef COMPUTE_XT_AVERAGES
+	status = write_binary(dump.VELOCITY_PYZ_FILE, index,
+		Uyz, Vyz, Wyz, "U", "V", "W", axisYZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.VELOCITY_PYZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.UIUI_PYZ_FILE, index,
+		U2yz, V2yz, W2yz, "U^2", "V^2", "W^2", axisYZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIUI_PYZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.UIUJ_PYZ_FILE, index,
+		UVyz, UWyz, VWyz, "UV", "UW", "VW", axisYZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.UIUJ_PYZ_FILE.c_str());
+	}
+#endif
+
+
+#ifdef STRATIFICATION
+	status = write_binary(dump.TEMPERATURE_PZ_FILE, index,
+		Tc, "T", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TEMPERATURE_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.TEMPERATURE_SHIFT_PZ_FILE, index,
+		Tsh, "T-shift", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TEMPERATURE_SHIFT_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.T2_AT_C_W_PZ_FILE, index,
+		T2_c, T2_w, "T^2 [c]", "T^2 [w]", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.T2_AT_C_W_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.TUI_PZ_FILE, index,
+		TU, TV, TW, "TU", "TV", "TW", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TUI_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.TU_TW_AT_UW_PZ_FILE, index,
+		TU_uw, TW_uw, "TU [uw]", "TW [uw]", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TU_TW_AT_UW_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.TV_TW_AT_VW_PZ_FILE, index,
+		TV_vw, TW_vw, "TV [vw]", "TW [vw]", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TV_TW_AT_VW_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.TP_PZ_FILE, index, 
+		TP, "TP", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TP_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.TW_PARTITION_PZ_FILE, index,
+		TW_bottom, TW_top, "TW-bottom", "TW-top", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TW_PARTITION_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.TW_PARTITION_AT_U_PZ_FILE, index,
+		TW_bottom_u, TW_top_u, "TW-bottom [u]", "TW-top [u]", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TW_PARTITION_AT_U_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.TW_PARTITION_AT_V_PZ_FILE, index,
+		TW_bottom_v, TW_top_v, "TW-bottom [v]", "TW-top [v]", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TW_PARTITION_AT_V_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.TW_PARTITION_AT_W_PZ_FILE, index,
+		TW_bottom_w, TW_top_w, "TW-bottom [w]", "TW-top [w]", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TW_PARTITION_AT_W_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.TW_ADV_PZ_FILE, index,
+		TW_adv, "TW-adv", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TW_ADV_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.T2W_PZ_FILE, index,
+		T2W, "T^2W", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.T2W_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.TUIW_PZ_FILE, index,
+		TUW, TVW, TWW, "TUW", "TVW", "TWW", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TUIW_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.T_DISSIPATION_PZ_FILE, index,
+		T_diss, "T-diss", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.T_DISSIPATION_PZ_FILE.c_str());
+	}
+	status = write_binary(dump.TUI_DISSIPATION_PZ_FILE, index,
+		TU_diss, TV_diss, TW_diss, 
+		"TU-diss", "TV-diss", "TW-diss", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TUI_DISSIPATION_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.T_ISO_DISSIPATION_PZ_FILE, index,
+		T_iso_diss, "T-iso-diss", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.T_ISO_DISSIPATION_PZ_FILE.c_str());
+	}
+
+	status = write_binary(dump.T_GRADP_PZ_FILE, index,
+		T_dPdx, T_dPdy, T_dPdz, "T*dP/dx", "T*dP/dy", "T*dP/dz", axisZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.T_GRADP_PZ_FILE.c_str());
+	}
+
+#ifdef COMPUTE_XT_AVERAGES
+	status = write_binary(dump.TEMPERATURE_PYZ_FILE, index,
+		Tyz, "T", axisYZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TEMPERATURE_PYZ_FILE.c_str());
+	}
+	status = write_binary(dump.T2_PYZ_FILE, index,
+		T2yz, "T^2", axisYZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.T2_PYZ_FILE.c_str());
+	}
+	status = write_binary(dump.TW_PYZ_FILE, index,
+		TWyz, "TW", axisYZ, grid, current_time);
+	if ((!status) && (grid.mpi_com.rank == 0)) {
+		printf("\n >> WARNING! >> ** failed to write stat dump[%i]: ""%s""\n",
+			index, dump.TW_PYZ_FILE.c_str());
+	}
+#endif
+#endif
+
+#endif
+}
+// ------------------------------------------------------------------------------------------------ //
+
// Explicit instantiations of the statsData member templates for single (float)
// and double precision, so the template definitions may stay in this
// translation unit only.
template void statsData::init(const Grid3d< float >&);
template void statsData::init(const Grid3d< double >&);
// ------------------------------------------------------------------------------------------------ //

template void statsData::get_averages(nseAvgVec< float >&) const;
template void statsData::get_averages(nseAvgVec< double >&) const;
// ------------------------------------------------------------------------------------------------ //

template void statsData::get_averages(nseAvgVec< float >&,
	const float, const float) const;
template void statsData::get_averages(nseAvgVec< double >&,
	const double, const double) const;
// ------------------------------------------------------------------------------------------------ //

template void statsData::read_dump(const int, const Grid3d< float >&);
template void statsData::read_dump(const int, const Grid3d< double >&);
// ------------------------------------------------------------------------------------------------ //

template void statsData::write_dump(const int index,
	const float current_time, const Grid3d< float >& grid) const;
template void statsData::write_dump(const int index,
	const double current_time, const Grid3d< double >& grid) const;
// ------------------------------------------------------------------------------------------------ //
diff --git a/stats-data.h b/stats-data.h
new file mode 100644
index 0000000000000000000000000000000000000000..b83f0631367f2ad39d972a72703ed8fd83236be0
--- /dev/null
+++ b/stats-data.h
@@ -0,0 +1,130 @@
+#pragma once
+
+// [stats-data.h]: statistics data block
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-defines.h"
+
+#include "grid3d.h"
+#include "time-slice.h"
+
+#include "stats-output.h"
+#include "stats-dump.h"
+
+#include "nse-avg-vec.h"
+
+using namespace nse;
+
+
// Container of accumulated single-point statistics for the flow model:
// time slices of means and higher moments, plus the text-output and
// binary-dump file-name registries. Optional member groups are compiled
// in via COMPUTE_XT_AVERAGES and STRATIFICATION.
// Bracketed tags in member comments ([C], [W], [V], ...) denote the grid
// location (staggering) of each quantity.
struct statsData
{
	// ------------------------------------------------------------------------------------------------ //
	static const int psum_size = 160;		// partial sum size for -z time-slices
#ifdef COMPUTE_XT_AVERAGES
	static const int psum_yz_size = 160;	// partial sum size for -yz time-slices
#endif


	timeSlice U, V, W, P;					// velocity-pressure averages: [C, C, W, C]

	timeSlice U2_u, V2_v, W2_w;				// Ui^2: [C, C, W]
	timeSlice U2_uw, V2_vw, W2_c;			// Ui^2: [W, W, C]
	timeSlice W2_u, W2_v, W2_uw, W2_vw;		// W^2: [C, C, W, W]

	timeSlice UV, UW, VW;					// Ui*Uj: [C, W, W]
	timeSlice UV_uvw, UW_uvw, VW_uvw;		// Ui*Uj: [W, W, W]
	timeSlice PU, PV, PW;					// P*Ui: [C, C, W]

	timeSlice UW_bottom, UW_top;			// special approximations to U*W: [C (W -- C), C (W -- C)]
	timeSlice VW_bottom, VW_top;			// special approximations to V*W: [C (W -- C), C (W -- C)]
	timeSlice UW_bottom_uv, UW_top_uv;		// special approximations to U*W: [C (W -- C), C (W -- C)]
	timeSlice VW_bottom_uv, VW_top_uv;		// special approximations to V*W: [C (W -- C), C (W -- C)]
	timeSlice UW_bottom_uw, UW_top_uw;		// special approximations to U*W: [W (C -- W), W (C -- W)]
	timeSlice VW_bottom_vw, VW_top_vw;		// special approximations to V*W: [W (C -- W), W (C -- W)]
	timeSlice UW_adv, VW_adv;				// W*dU/dz, W*dV/dz: [W, W]

	timeSlice U2W, V2W, W2W;				// Ui*Ui*W: [W, W, C]
	timeSlice UVW, UWW, VWW;				// Ui*Uj*W: [W, C, C]

	timeSlice U_diss, V_diss, W_diss;		// Ui*div(grad(Ui)): [C, C, W]
	timeSlice UV_diss, UW_diss, VW_diss;	// Ui*Uj dissipation: [C, W, W]

	timeSlice U_iso_diss, V_iso_diss, W_iso_diss;		// grad(Ui)*grad(Ui): [C, C, W]
	timeSlice UV_iso_diss, UW_iso_diss, VW_iso_diss;	// Ui*Uj iso-dissipation: [C, W, W]

	timeSlice PSuu, PSvv, PSww;				// P*dUi/dxi = P*Sii: [C, C, C]
	timeSlice P2Suv, P2Suw, P2Svw;			// P*2*Sij: [C, W, W]

#ifdef COMPUTE_XT_AVERAGES
	timeSlice Uyz, Vyz, Wyz;		// velocity -x, -time averages: [C, V, W]
	timeSlice U2yz, V2yz, W2yz;		// (y,z) squares: [C, V, W]
	timeSlice UVyz, UWyz, VWyz;		// products(2nd order): [V, W, VW]
#endif


#ifdef STRATIFICATION
	timeSlice Tc;					// average: [C]
	timeSlice Tsh;					// average with removed linear profile: [C]
									// used to extend precision in calculation of dissipation

	timeSlice T2_c, T2_w;			// T*T: [C, W]

	timeSlice TU, TV, TW;			// T*Ui: [C, C, W]
	timeSlice TU_uw, TV_vw;			// T*U, T*V: [W, W]
	timeSlice TW_uw, TW_vw;			// T*W: [W, W]
	timeSlice TP;					// T*P: [C]

	timeSlice TW_bottom, TW_top,	// special approximations to T*W: [C (W -- C), C (W -- C)]
		TW_bottom_u, TW_top_u,		// special approximations to T*W: [C (W -- C), C (W -- C)]
		TW_bottom_v, TW_top_v,		// special approximations to T*W: [C (W -- C), C (W -- C)]
		TW_bottom_w, TW_top_w;		// special approximations to T*W: [W (C -- W), W (C -- W)]
	timeSlice TW_adv;				// W*dT/dz: [W]

	timeSlice T2W;					// T*T*W: [W]
	timeSlice TUW, TVW, TWW;		// T*Ui*W: [W, W, C]

	timeSlice T_diss;						// T*div(grad(T)): [C]
	timeSlice TU_diss, TV_diss, TW_diss;	// T*Ui dissipation: [C, C, W]

	timeSlice T_iso_diss;					// grad(T)*grad(T): [C]

	timeSlice T_dPdx, T_dPdy, T_dPdz;		// T*grad(P): [C, C, W]

#ifdef COMPUTE_XT_AVERAGES
	timeSlice Tyz;					// temperature -x, -time average: [C]
	timeSlice T2yz;					// square: [C]
	timeSlice TWyz;					// products: [W]
#endif
#endif


	Real begin, dt;					// NOTE(review): presumably averaging start time and
									// sampling step -- confirm against the stats driver
	int time_mod,					// stats gather at mod steps //
		time_index;

	statsOutput output;				// ".plt" text-output file-name registry
	statsDump dump;					// ".slx" binary-dump file-name registry
	// ------------------------------------------------------------------------------------------------ //


	// set up statistics storage for the given grid
	// (instantiated for float and double, see the stats source file)
	template< typename T >
	void init(const Grid3d< T >& grid);

	// retrieve accumulated averages; the second overload presumably
	// restricts the averaging window to [begin_mark, end_mark] -- confirm
	template< typename T >
	void get_averages(nseAvgVec< T >& avg) const;
	template< typename T >
	void get_averages(nseAvgVec< T >& avg,
		const T begin_mark, const T end_mark) const;

	// binary dump I/O: write_dump() writes every slice to the files
	// registered in [dump] (read_dump presumably mirrors it)
	template< typename T >
	void read_dump(const int index,
		const Grid3d< T >& grid);
	template< typename T >
	void write_dump(const int index,
		const T current_time, const Grid3d< T >& grid) const;

	statsData();
	~statsData();
};
+// ------------------------------------------------------------------------------------------------ //
diff --git a/stats-dump.h b/stats-dump.h
new file mode 100644
index 0000000000000000000000000000000000000000..9cf6d158168a2ae7df7795bef7febddac8bf242f
--- /dev/null
+++ b/stats-dump.h
@@ -0,0 +1,193 @@
+#pragma once
+
+// [stats-dump.h]: dump data structure for statistics
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-defines.h"
+#include "str-com.h"
+
+#include <string>
+
+
// File-name registry for binary statistics dumps (".slx" files):
// one std::string per dumped quantity, all rooted at DIR.
// Names are composed by set_filenames(); optional groups are compiled in
// via COMPUTE_XT_AVERAGES and STRATIFICATION.
struct statsDump
{
	std::string DIR;	// output directory, set by set_filenames()


	// ------------------------------------------------------------------------------------------------ //
	std::string VELOCITY_PZ_FILE, PRESSURE_PZ_FILE;

	std::string UIUI_AT_U_V_C_PZ_FILE;
	std::string	UIUI_AT_UW_VW_W_PZ_FILE;

	std::string W2_AT_U_UW_PZ_FILE;
	std::string	W2_AT_V_VW_PZ_FILE;

	std::string UIUJ_PZ_FILE;
	std::string	UIUJ_AT_UVW_PZ_FILE;

	std::string PUI_PZ_FILE;

	std::string UIW_PARTITION_PZ_FILE;
	std::string	UIW_PARTITION_AT_UV_PZ_FILE;
	std::string	UIW_PARTITION_AT_UIW_PZ_FILE;

	std::string UIW_ADV_PZ_FILE;

	std::string UIUIW_PZ_FILE, UIUJW_PZ_FILE;

	std::string UIUI_DISSIPATION_PZ_FILE;
	std::string	UIUJ_DISSIPATION_PZ_FILE;

	std::string	UIUI_ISO_DISSIPATION_PZ_FILE;
	std::string UIUJ_ISO_DISSIPATION_PZ_FILE;

	std::string P_STRAIN_II_PZ_FILE;
	std::string P_2STRAIN_IJ_PZ_FILE;

#ifdef COMPUTE_XT_AVERAGES
	std::string VELOCITY_PYZ_FILE;

	std::string UIUI_PYZ_FILE;
	std::string UIUJ_PYZ_FILE;
#endif


#ifdef STRATIFICATION
	std::string TEMPERATURE_PZ_FILE;
	std::string TEMPERATURE_SHIFT_PZ_FILE;

	std::string T2_AT_C_W_PZ_FILE;

	std::string TUI_PZ_FILE;
	std::string TU_TW_AT_UW_PZ_FILE;
	std::string TV_TW_AT_VW_PZ_FILE;

	std::string TP_PZ_FILE;

	std::string TW_PARTITION_PZ_FILE;
	std::string TW_PARTITION_AT_U_PZ_FILE;
	std::string TW_PARTITION_AT_V_PZ_FILE;
	std::string TW_PARTITION_AT_W_PZ_FILE;

	std::string TW_ADV_PZ_FILE;

	std::string T2W_PZ_FILE;
	std::string TUIW_PZ_FILE;

	std::string T_DISSIPATION_PZ_FILE;
	std::string TUI_DISSIPATION_PZ_FILE;

	std::string	T_ISO_DISSIPATION_PZ_FILE;

	std::string T_GRADP_PZ_FILE;

#ifdef COMPUTE_XT_AVERAGES
	std::string TEMPERATURE_PYZ_FILE;

	std::string T2_PYZ_FILE;
	std::string TW_PYZ_FILE;
#endif
#endif
	// ------------------------------------------------------------------------------------------------ //


	// create directory [_DIR] and compose all dump file names in it;
	// returns false if the directory could not be created
	bool set_filenames(const std::string& _DIR);

	statsDump();
	~statsDump();
};
+// -------------------------------------------------------------------------------------------- //
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
// trivial construction/destruction: every member is a std::string;
// the actual file names are assigned later by set_filenames()
inline statsDump::statsDump() {}
inline statsDump::~statsDump() {}
+// -------------------------------------------------------------------------------------------- //
+
// Create the dump output directory and compose every dump file name as
// DIR + fixed tag + ".slx"; returns false -- leaving the name members
// unset -- if the directory could not be created.
inline bool statsDump::set_filenames(const std::string& _DIR)
{
	if (!nse::create_dir(_DIR)) return false;
	DIR = _DIR;

	// ------------------------------------------------------------------------------------------------ //
	VELOCITY_PZ_FILE = DIR + "Ui-stat(z)-.slx";
	PRESSURE_PZ_FILE = DIR + "P-stat(z)-.slx";

	UIUI_AT_U_V_C_PZ_FILE = DIR + "UiUi-at-[u-v-c]-stat(z)-.slx";
	UIUI_AT_UW_VW_W_PZ_FILE = DIR + "UiUi-at-[uw-vw-w]-stat(z)-.slx";

	W2_AT_U_UW_PZ_FILE = DIR + "W2-at-[u-uw]-stat(z)-.slx";
	W2_AT_V_VW_PZ_FILE = DIR + "W2-at-[v-vw]-stat(z)-.slx";

	UIUJ_PZ_FILE = DIR + "UiUj-stat(z)-.slx";
	UIUJ_AT_UVW_PZ_FILE = DIR + "UiUj-at-[uvw]-stat(z)-.slx";

	PUI_PZ_FILE = DIR + "PUi-stat(z)-.slx";

	UIW_PARTITION_PZ_FILE = DIR + "UiW-partition-stat(z)-.slx";
	UIW_PARTITION_AT_UV_PZ_FILE = DIR + "UiW-partition-at-[uv]-stat(z)-.slx";
	UIW_PARTITION_AT_UIW_PZ_FILE = DIR + "UiW-partition-at-[uiw]-stat(z)-.slx";

	UIW_ADV_PZ_FILE = DIR + "UiW-adv-stat(z)-.slx";

	UIUIW_PZ_FILE = DIR + "UiUiW-stat(z)-.slx";
	UIUJW_PZ_FILE = DIR + "UiUjW-stat(z)-.slx";

	UIUI_DISSIPATION_PZ_FILE = DIR + "UiUi-diss-stat(z)-.slx";
	UIUJ_DISSIPATION_PZ_FILE = DIR + "UiUj-diss-stat(z)-.slx";

	UIUI_ISO_DISSIPATION_PZ_FILE = DIR + "UiUi-iso-diss-stat(z)-.slx";
	UIUJ_ISO_DISSIPATION_PZ_FILE = DIR + "UiUj-iso-diss-stat(z)-.slx";

	P_STRAIN_II_PZ_FILE = DIR + "PSii-stat(z)-.slx";
	P_2STRAIN_IJ_PZ_FILE = DIR + "P2Sij-stat(z)-.slx";

#ifdef COMPUTE_XT_AVERAGES
	VELOCITY_PYZ_FILE = DIR + "Ui-stat(y,z)-.slx";
	UIUI_PYZ_FILE = DIR + "UiUi-stat(y,z)-.slx";
	UIUJ_PYZ_FILE = DIR + "UiUj-stat(y,z)-.slx";
#endif


#ifdef STRATIFICATION
	TEMPERATURE_PZ_FILE = DIR + "T-stat(z)-.slx";
	TEMPERATURE_SHIFT_PZ_FILE = DIR + "T-shift-stat(z)-.slx";

	T2_AT_C_W_PZ_FILE = DIR + "T2-at-[c-w]-stat(z)-.slx";

	TUI_PZ_FILE = DIR + "TUi-stat(z)-.slx";
	TU_TW_AT_UW_PZ_FILE = DIR + "TU-TW-at-[uw]-stat(z)-.slx";
	TV_TW_AT_VW_PZ_FILE = DIR + "TV-TW-at-[vw]-stat(z)-.slx";

	TP_PZ_FILE = DIR + "TP-stat(z)-.slx";

	TW_PARTITION_PZ_FILE = DIR + "TW-partition-stat(z)-.slx";
	TW_PARTITION_AT_U_PZ_FILE = DIR + "TW-partition-at-[u]-stat(z)-.slx";
	TW_PARTITION_AT_V_PZ_FILE = DIR + "TW-partition-at-[v]-stat(z)-.slx";
	TW_PARTITION_AT_W_PZ_FILE = DIR + "TW-partition-at-[w]-stat(z)-.slx";

	TW_ADV_PZ_FILE = DIR + "TW-adv-stat(z)-.slx";

	T2W_PZ_FILE = DIR + "T2W-stat(z)-.slx";
	TUIW_PZ_FILE = DIR + "TUiW-stat(z)-.slx";

	T_DISSIPATION_PZ_FILE = DIR + "T-diss-stat(z)-.slx";
	TUI_DISSIPATION_PZ_FILE = DIR + "TUi-diss-stat(z)-.slx";

	T_ISO_DISSIPATION_PZ_FILE = DIR + "T-iso-diss-stat(z)-.slx";

	T_GRADP_PZ_FILE = DIR + "T-gradP-stat(z)-.slx";

#ifdef COMPUTE_XT_AVERAGES
	TEMPERATURE_PYZ_FILE = DIR + "T-stat(y,z)-.slx";
	T2_PYZ_FILE = DIR + "T2-stat(y,z)-.slx";
	TW_PYZ_FILE = DIR + "TW-stat(y,z)-.slx";
#endif
#endif
	// ------------------------------------------------------------------------------------------------ //

	return true;
}
+// -------------------------------------------------------------------------------------------- //
diff --git a/stats-output.h b/stats-output.h
new file mode 100644
index 0000000000000000000000000000000000000000..791f368b23aeca95a96044723b5cb12162b34dc8
--- /dev/null
+++ b/stats-output.h
@@ -0,0 +1,314 @@
+#pragma once
+
+// [stats-output.h]: output data structure for statistics
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "model-defines.h"
+#include "str-com.h"
+
+#include <string>
+
+
// File-name registry for plain-text statistics output (".plt" files):
// mean profiles, budgets and derived diagnostics, all rooted at DIR.
// Names are composed by set_filenames(); optional groups are compiled in
// via COMPUTE_XT_AVERAGES, STRATIFICATION and FOURIER_SPECTRUM.
// Bracketed tags in member comments ([C], [W], [V], ...) denote the grid
// location (staggering) of each quantity.
struct statsOutput
{
	std::string DIR;	// output directory, set by set_filenames()


	// ------------------------------------------------------------------------------------------------ //
	std::string U_AVERAGE_PZ_FILE,				// U(z): [C]
		V_AVERAGE_PZ_FILE, W_AVERAGE_PZ_FILE;	// V(z): [C], W(z): [W]

	std::string U_AVERAGE_VISC_PZ_FILE;			// U+(z+): [C]

	std::string U_DEVIATION_PZ_FILE,				// (u'u')^1/2: [C]
		V_DEVIATION_PZ_FILE, W_DEVIATION_PZ_FILE;	// (v'v')^1/2: [C], (w'w')^1/2: [W]

	std::string UV_FLUX_PZ_FILE,				// u'v'(z): [C]
		UW_FLUX_PZ_FILE, VW_FLUX_PZ_FILE;		// u'w'(z): [W], v'w'(z): [W]

	std::string U_GRAD_PZ_FILE;					// dU(z)/dz: [W]


#ifdef COMPUTE_XT_AVERAGES
	std::string U_AVERAGE_PYZ_FILE;				// U(y,z): [C]
	std::string V_W_AVERAGE_PYZ_FILE;			// V(y,z) & W(y,z): [V,W]

	std::string U_DEVIATION_PYZ_FILE;			// (u'u')^1/2: [C]
	std::string V_W_DEVIATION_PYZ_FILE;			// (v'v')^1/2 & (w'w')^1/2: [V,W]

	std::string UV_FLUX_PYZ_FILE,				// u'v'(y,z): [V]
		UW_FLUX_PYZ_FILE, VW_FLUX_PYZ_FILE;		// u'w'(y,z): [W], v'w'(y,z): [VW]

	std::string U_GRAD_PYZ_FILE;				// dU(y,z)/dz: [W]
#endif


	std::string MOMENTUM_BALANCE_PZ_FILE;		// -u'w' + (1/Re) * dU/dz = (u*)^2: [W]
												// momentum flux balance: turbulent momentum flux + viscous stress = (u*)^2

	std::string U_TKE_BALANCE_PZ_FILE,			// TKE components balances: [C, C, C]
		V_TKE_BALANCE_PZ_FILE, W_TKE_BALANCE_PZ_FILE;

	std::string TKE_BALANCE_PZ_FILE;			// TKE stationary balance: [C]
												// production + diffusion + dissipation + [heat flux] = 0
												// includes: 
												//   iso-dissipation 
												//   diffusion terms (transport and pressure work)


	std::string ENERGY_STRUCTURE_PZ_FILE;		// TKE, 1/2*u'[i]*u'[i], TPE & shares: [C]	

	std::string TKE_ANISOTROPY_PZ_FILE;			// TKE anisotropy tensor: [C]
												// anisotropy tensor of TKE: [(u(i)u(j)) / (u(k)u(k))] - 1/3 * delta(i,j)

	std::string TKE_EXCHANGE_PZ_FILE;			// TKE exchange-balance: [C]
												// p'*du'[i]/dx[i] and 2 * pressure-strain = 2*p'*S'ij


	std::string UV_FLUX_BUDGET_PZ_FILE;			// u'v' flux-balance: [C]
	std::string UW_FLUX_BUDGET_PZ_FILE;			// u'w' flux-balance: [W]
	std::string VW_FLUX_BUDGET_PZ_FILE;			// v'w' flux-balance: [W]
												// production + diffusion + pressure-strain + [buoyancy] + dissipation = 0
												// includes: 
												//   iso-dissipation 
												//   diffusion terms (transport and pressure work)
												//   production components


	std::string ROTTA_MODEL_PZ_FILE;			// Rotta type models constants: [C]


	std::string TIME_SCALE_TURBULENT_PZ_FILE;		// T = TKE / e(TKE): [C]

	std::string LENGTH_SCALE_MIXING_PZ_FILE;		// L = (u'(i)*u'(i))^1/2 / (dU/dz): [W]
	std::string LENGTH_SCALE_KOLMOGOROV_PZ_FILE;	// L = [ nu^3 / e(TKE) ]^1/4: [C] 


#ifdef STRATIFICATION
	std::string T_AVERAGE_PZ_FILE;					// T(z): [C]

	std::string T_DEVIATION_PZ_FILE;				// (T'T')^1/2: [C]

	std::string TU_FLUX_PZ_FILE, TV_FLUX_PZ_FILE,	// T'u'(z), T'v'(z): [C, C] 
		TW_FLUX_PZ_FILE;							// T'w'(z): [W]

	std::string T_GRAD_PZ_FILE;						// dT(z)/dz: [W]

	std::string T_PRESSURE_GRADIENT_U_PZ_FILE;	// T'*dP'/dx: [C]
	std::string T_PRESSURE_GRADIENT_V_PZ_FILE;	// T'*dP'/dy: [C]
	std::string T_PRESSURE_GRADIENT_W_PZ_FILE;	// T'*dP'/dz: [W]


#ifdef COMPUTE_XT_AVERAGES
	std::string T_AVERAGE_PYZ_FILE;			// T(y,z): [C]

	std::string T_DEVIATION_PYZ_FILE;		// (T'T')^1/2: [C]

	std::string TW_FLUX_PYZ_FILE;			// T'w'(y,z): [W]

	std::string T_GRAD_PYZ_FILE;			// dT(y,z)/dz: [W]
#endif


	std::string HEAT_BALANCE_PZ_FILE;			// -T'w' + (1/Re) * (1/Pr) * dT/dz = const: [W]
												// heat flux balance: turbulent heat flux + heat stress = const

	std::string TVA_BALANCE_PZ_FILE;			// TVA (scalar variance equation) stationary balance: [C]
												// production + diffusion + dissipation = 0
												// includes: iso-dissipation

	std::string TPE_BALANCE_PZ_FILE;			// TPE stationary balance: [C]
												// turbulent potential energy: TVA * (Ri(b)/[dT/dz])
												// [energy-heat flux] + diffusion + dissipation = 0
												// includes: iso-dissipation


	std::string TU_FLUX_BUDGET_PZ_FILE;			// T'u' budget equation: [C]
	std::string TV_FLUX_BUDGET_PZ_FILE;			// T'v' budget equation: [C]
	std::string TW_FLUX_BUDGET_PZ_FILE;			// T'w' budget equation: [W]
												// production + diffusion + pressure-grad(T) + [buoyancy] + dissipation = 0
												// includes: 
												//   diffusion terms (transport and pressure work)
												//   production components


	std::string PRANDTL_TURBULENT_PZ_FILE;		// (u'w'*dT/dz) / (T'w'*dU/dz): [W]
	std::string RICHARDSON_GRADIENT_PZ_FILE;	// Ri(b) * [(dT/dz)/(dU/dz)^2]: [W]
#ifdef COMPUTE_XT_AVERAGES
	std::string RICHARDSON_GRADIENT_PYZ_FILE;	// Ri(b) * [(dT/dz)/(dU/dz)^2]: [W]
#endif
	std::string RICHARDSON_FLUX_PZ_FILE;		// Ri(b) * [T'w'/(u'w'*dU/dz)]: [W]
	std::string REYNOLDS_BUOYANCY_PZ_FILE;		// e(TKE) / (nu * N^2): [C]
	std::string FROUDE_HORIZONTAL_PZ_FILE;		// (1/N)*[0.5*e(TKE)/(E(u)+E(v))]: [C]


	std::string TIME_SCALE_TVARIANCE_PZ_FILE;	// T = TVA / e(TVA): [C]

	std::string LENGTH_SCALE_ELLISON_PZ_FILE;	// (T'*T')^1/2 / (dT/dz): [W]
	std::string LENGTH_SCALE_OZMIDOV_PZ_FILE;	// ( e(TKE) / N^3 )^1/2, N = (Ri(b)*dT/dz)^1/2: [C]
	std::string LENGTH_SCALE_OBUKHOV_PZ_FILE;	// (|u'w'|)^3/2 / (Ri * |T'w'|): [W]


	std::string MIXING_EFFICIENCY_PZ_FILE;		// e(TPE) / e(TKE): [C]

	std::string TURB_PRODUCTION_RATIO_PZ_FILE;	// P(TKE) / P(TVA): [C]
#endif


#ifdef FOURIER_SPECTRUM
	std::string FOURIER_FILE;
#endif
	// ------------------------------------------------------------------------------------------------ //

	// create directory [_DIR] and compose all output file names in it;
	// returns false if the directory could not be created
	bool set_filenames(const std::string& _DIR);

	statsOutput();
	~statsOutput();
};
+// -------------------------------------------------------------------------------------------- //
+
+// Implementation
+// -------------------------------------------------------------------------------------------- //
// trivial construction/destruction: every member is a std::string;
// the actual file names are assigned later by set_filenames()
inline statsOutput::statsOutput() {}
inline statsOutput::~statsOutput() {}
+// -------------------------------------------------------------------------------------------- //
+
// Create the text-output directory and compose every output file name as
// DIR + fixed tag + ".plt"; returns false -- leaving the name members
// unset -- if the directory could not be created.
inline bool statsOutput::set_filenames(const std::string& _DIR)
{
	if (!nse::create_dir(_DIR)) return false;
	DIR = _DIR;

	// ------------------------------------------------------------------------------------------------ //
	U_AVERAGE_PZ_FILE = DIR + "-U(z)-average-.plt";			// U(z): [C]
	V_AVERAGE_PZ_FILE = DIR + "-V(z)-average-.plt";			// V(z): [C]
	W_AVERAGE_PZ_FILE = DIR + "-W(z)-average-.plt";			// W(z): [W]

	U_AVERAGE_VISC_PZ_FILE = DIR + "-U(z)-average-uvisc-.plt";		// U+(z+): [C]

	U_DEVIATION_PZ_FILE = DIR + "-U(z)-deviation-.plt";				// (u'u')^1/2: [C]
	V_DEVIATION_PZ_FILE = DIR + "-V(z)-deviation-.plt";				// (v'v')^1/2: [C]
	W_DEVIATION_PZ_FILE = DIR + "-W(z)-deviation-.plt";				// (w'w')^1/2: [W]

	UV_FLUX_PZ_FILE = DIR + "-UV(z)-flux-.plt";				// u'v'(z): [C]
	UW_FLUX_PZ_FILE = DIR + "-UW(z)-flux-.plt";				// u'w'(z): [W]
	VW_FLUX_PZ_FILE = DIR + "-VW(z)-flux-.plt";				// v'w'(z): [W]

	U_GRAD_PZ_FILE = DIR + "-U(z)-grad-.plt";				// dU(z)/dz: [W]


#ifdef COMPUTE_XT_AVERAGES
	U_AVERAGE_PYZ_FILE = DIR + "-U(y,z)-average-.plt";					// U(y,z): [C]
	V_W_AVERAGE_PYZ_FILE = DIR + "-V(y,z)-W(y,z)-average-.plt";			// V(y,z) & W(y,z): [V,W]

	U_DEVIATION_PYZ_FILE = DIR + "-U(y,z)-deviation-.plt";				// (u'u')^1/2: [C]
	V_W_DEVIATION_PYZ_FILE = DIR + "-V(y,z)-W(y,z)-deviation-.plt";		// (v'v')^1/2 & (w'w')^1/2: [V,W]

	UV_FLUX_PYZ_FILE = DIR + "-UV(y,z)-flux-.plt";			// u'v'(y,z): [V]
	UW_FLUX_PYZ_FILE = DIR + "-UW(y,z)-flux-.plt";			// u'w'(y,z): [W]
	VW_FLUX_PYZ_FILE = DIR + "-VW(y,z)-flux-.plt";			// v'w'(y,z): [VW]

	U_GRAD_PYZ_FILE = DIR + "-U(y,z)-grad-.plt";			// dU(y,z)/dz: [W]
#endif


	MOMENTUM_BALANCE_PZ_FILE = DIR + "-momentum(z)-balance-.plt";		// -u'w' + (1/Re) * dU/dz = (u*)^2: [W]

	U_TKE_BALANCE_PZ_FILE = DIR + "-u-TKE(z)-balance-.plt";		// -u TKE balance: [C]
	V_TKE_BALANCE_PZ_FILE = DIR + "-v-TKE(z)-balance-.plt";		// -v TKE balance: [C]
	W_TKE_BALANCE_PZ_FILE = DIR + "-w-TKE(z)-balance-.plt";		// -w TKE balance: [C]

	TKE_BALANCE_PZ_FILE = DIR + "-TKE(z)-balance-.plt";		// TKE stationary balance: [C]


	ENERGY_STRUCTURE_PZ_FILE = DIR + "-energy(z)-structure-.plt";	// TKE, 1/2*u'[i]*u'[i], TPE & shares: [C]

	TKE_ANISOTROPY_PZ_FILE = DIR + "-TKE(z)-anisotropy-.plt";		// TKE anisotropy tensor: [C]

	TKE_EXCHANGE_PZ_FILE = DIR + "-TKE(z)-exchange-.plt";			// TKE exchange-balance: [C]


	UV_FLUX_BUDGET_PZ_FILE = DIR + "-UV(z)-flux-balance-.plt";		// u'v' flux-balance: [C]
	UW_FLUX_BUDGET_PZ_FILE = DIR + "-UW(z)-flux-balance-.plt";		// u'w' flux-balance: [W]
	VW_FLUX_BUDGET_PZ_FILE = DIR + "-VW(z)-flux-balance-.plt";		// v'w' flux-balance: [W]


	ROTTA_MODEL_PZ_FILE = DIR + "-Rotta(z)-model-.plt";			// Rotta type models constants: [C]


	TIME_SCALE_TURBULENT_PZ_FILE = DIR + "-time-scale(z)-turbulent-.plt";		// T = TKE / e(TKE): [C]

	LENGTH_SCALE_MIXING_PZ_FILE = DIR + "-length-scale(z)-mixing-.plt";			// L = (u'(i)*u'(i))^1/2 / (dU/dz): [W]
	LENGTH_SCALE_KOLMOGOROV_PZ_FILE = DIR + "-length-scale(z)-kolmogorov-.plt";	// L = [ nu^3 / e(TKE) ]^1/4: [C] 


#ifdef STRATIFICATION
	T_AVERAGE_PZ_FILE = DIR + "-T(z)-average-.plt";			// T(z): [C]

	T_DEVIATION_PZ_FILE = DIR + "-T(z)-deviation-.plt";		// (T'T')^1/2: [C]

	TU_FLUX_PZ_FILE = DIR + "-TU(z)-flux-.plt";				// T'u'(z): [C]
	TV_FLUX_PZ_FILE = DIR + "-TV(z)-flux-.plt";				// T'v'(z): [C]
	TW_FLUX_PZ_FILE = DIR + "-TW(z)-flux-.plt";				// T'w'(z): [W]

	T_GRAD_PZ_FILE = DIR + "-T(z)-grad-.plt";				// dT(z)/dz: [W]

	T_PRESSURE_GRADIENT_U_PZ_FILE = DIR + "-T-pressure-gradient(z)-u-.plt";	// T'*dP'/dx: [C]
	T_PRESSURE_GRADIENT_V_PZ_FILE = DIR + "-T-pressure-gradient(z)-v-.plt";	// T'*dP'/dy: [C]
	T_PRESSURE_GRADIENT_W_PZ_FILE = DIR + "-T-pressure-gradient(z)-w-.plt";	// T'*dP'/dz: [W]


#ifdef COMPUTE_XT_AVERAGES
	T_AVERAGE_PYZ_FILE = DIR + "-T(y,z)-average-.plt";			// T(y,z): [C]

	T_DEVIATION_PYZ_FILE = DIR + "-T(y,z)-deviation-.plt";		// (T'T')^1/2: [C]

	TW_FLUX_PYZ_FILE = DIR + "-TW(y,z)-flux-.plt";				// T'w'(y,z): [W]

	T_GRAD_PYZ_FILE = DIR + "-T(y,z)-grad-.plt";				// dT(y,z)/dz: [W]
#endif


	HEAT_BALANCE_PZ_FILE = DIR + "-heat(z)-balance-.plt";		// -T'w' + (1/Re) * (1/Pr) * dT/dz = const: [W]

	TVA_BALANCE_PZ_FILE = DIR + "-TVA(z)-balance-.plt";			// TVA (scalar variance equation) stationary balance: [C]

	TPE_BALANCE_PZ_FILE = DIR + "-TPE(z)-balance-.plt";			// TPE stationary balance: [C]


	TU_FLUX_BUDGET_PZ_FILE = DIR + "-TU(z)-flux-balance-.plt";	// T'u' budget: [C]
	TV_FLUX_BUDGET_PZ_FILE = DIR + "-TV(z)-flux-balance-.plt";	// T'v' budget: [C]
	TW_FLUX_BUDGET_PZ_FILE = DIR + "-TW(z)-flux-balance-.plt";	// T'w' budget: [W]


	PRANDTL_TURBULENT_PZ_FILE = DIR + "-Pr(z)-turbulent-.plt";		// (u'w'*dT/dz) / (T'w'*dU/dz): [W]
	RICHARDSON_GRADIENT_PZ_FILE = DIR + "-Ri(z)-gradient-.plt";		// Ri(b) * [(dT/dz)/(dU/dz)^2]: [W]
#ifdef COMPUTE_XT_AVERAGES
	RICHARDSON_GRADIENT_PYZ_FILE = DIR + "-Ri(y,z)-gradient-.plt";	// Ri(b) * [(dT/dz)/(dU/dz)^2]: [W]
#endif
	RICHARDSON_FLUX_PZ_FILE = DIR + "-Ri(z)-flux-.plt";				// Ri(b) * [T'w'/(u'w'*dU/dz)]: [W]
	REYNOLDS_BUOYANCY_PZ_FILE = DIR + "-Re(z)-buoyancy-.plt";		// e(TKE) / (nu * N^2): [C]
	FROUDE_HORIZONTAL_PZ_FILE = DIR + "-Fr(z)-horizontal-.plt";		// (1/N)*[0.5*e(TKE)/(E(u)+E(v))]: [C]


	TIME_SCALE_TVARIANCE_PZ_FILE = DIR + "-time-scale(z)-T-variance-.plt";		// T = TVA / e(TVA): [C]

	LENGTH_SCALE_ELLISON_PZ_FILE = DIR + "-length-scale(z)-ellison-.plt";		// (T'*T')^1/2 / (dT/dz): [W]
	LENGTH_SCALE_OZMIDOV_PZ_FILE = DIR + "-length-scale(z)-ozmidov-.plt";		// ( e(TKE) / N^3 )^1/2, N = (Ri(b)*dT/dz)^1/2: [C]
	LENGTH_SCALE_OBUKHOV_PZ_FILE = DIR + "-length-scale(z)-obukhov-.plt";		// (|u'w'|)^3/2 / (Ri * |T'w'|): [W]	


	MIXING_EFFICIENCY_PZ_FILE = DIR + "-mixing-efficiency(z)-.plt";			// e(TPE) / e(TKE): [C]

	TURB_PRODUCTION_RATIO_PZ_FILE = DIR + "-turb-production-ratio(z)-.plt";	// P(TKE) / P(TVA): [C]
#endif


#ifdef FOURIER_SPECTRUM
	FOURIER_FILE = DIR + "-DFT-UU(k)-.plt";
#endif
	// ------------------------------------------------------------------------------------------------ //

	return true;
}
diff --git a/str-com.cpp b/str-com.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6b5d1a9dd8192186f89ac13e8d90b31b90706fe4
--- /dev/null
+++ b/str-com.cpp
@@ -0,0 +1,549 @@
+#include "str-com.h"
+
+#define _CRT_SECURE_NO_DEPRECATE
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <math.h>
+#include <limits.h>
+
+// include for directory manip.
+#ifdef _WIN32
+#include <direct.h>
+#else
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif
+#include <errno.h>
+
+#include <sstream>
+
+// Implementation //
namespace nse
{
	// append an integer index before the file-name extension:
	// "file.ext", 7 -> "file7.ext";
	// returns a newly allocated string (caller releases with delete[])
	// or NULL when [name] contains no '.' extension separator
	char* append_index(const char* name, const int index)
	{
		// locate the last '.' marking the extension
		const char* dot = strrchr(name, '.');
		if (dot == NULL) return NULL;

		const size_t stem_len = (size_t)(dot - name);
		const char* ext = dot + 1;

		// render the index (sign included) into a small local buffer
		char digits[32];
		const int ndigits = sprintf(digits, "%i", index);

		// stem + digits + '.' + extension + null terminator
		char* app_name = new char[stem_len + (size_t)ndigits + strlen(ext) + 2];
		memcpy(app_name, name, stem_len);
		sprintf(app_name + stem_len, "%s.%s", digits, ext);

		return app_name;
	}
}
+
namespace nse
{
	// append [index] before the file-name extension: "out.plt", 7 -> "out7.plt"
	// fix: if [name] has no '.' the index is appended at the end;
	// the original stored find_last_of() npos in an int and passed it to
	// std::string::insert, which throws std::out_of_range
	const std::string append_index(	// append index to string
		const std::string& name, const int index)
	{
		std::ostringstream sidx;
		sidx << index;

		const std::string::size_type ext_pos = name.find_last_of('.');
		if (ext_pos == std::string::npos)
			return name + sidx.str();

		std::string app_name = name;
		app_name.insert(ext_pos, sidx.str());

		return app_name;
	}
}
+
namespace nse
{
	// append [sub][index] before the file-name extension:
	// "out.plt", "-k", 2 -> "out-k2.plt"
	// fix: if [name] has no '.' the suffix is appended at the end;
	// the original passed npos (via int) to std::string::insert, which throws
	const std::string append_index(	// append index to string
		const std::string& name,
		const std::string& sub, const int index)
	{
		std::ostringstream sidx;
		sidx << sub << index;

		const std::string::size_type ext_pos = name.find_last_of('.');
		if (ext_pos == std::string::npos)
			return name + sidx.str();

		std::string app_name = name;
		app_name.insert(ext_pos, sidx.str());

		return app_name;
	}
}
+
namespace nse
{
	// insert [sub] before the file-name extension: "out.plt", "-x" -> "out-x.plt"
	// fix: if [name] has no '.' the substring is appended at the end;
	// the original passed npos (via int) to std::string::insert, which throws
	const std::string append_string(	// append string to string
		const std::string& name,
		const std::string& sub)
	{
		const std::string::size_type ext_pos = name.find_last_of('.');
		if (ext_pos == std::string::npos)
			return name + sub;

		std::string app_name = name;
		app_name.insert(ext_pos, sub);

		return app_name;
	}
}
+
namespace nse
{
	// binary file copy in fixed-size chunks;
	// returns false if either file cannot be opened
	bool copy_file( // copy file
		const std::string& filename_src, const std::string& filename_dest)
	{
		FILE* file_src = fopen(filename_src.c_str(), "rb");
		if (file_src == NULL) return false;

		FILE* file_dest = fopen(filename_dest.c_str(), "wb");
		if (file_dest == NULL) {
			fclose(file_src);
			return false;
		}

		const int chunk_size = 32 * 1024;	// 32 KB transfer buffer
		char chunk[chunk_size];

		// read-driven loop: fread returns 0 at end of file
		size_t nsize;
		while ((nsize = fread(chunk, sizeof(char), chunk_size, file_src)) > 0)
			fwrite(chunk, sizeof(char), nsize, file_dest);

		fclose(file_src);
		fclose(file_dest);

		return true;
	}
}
+
// copy file, appending [dest_index] to the destination file name
// (index is inserted before the extension, see append_index)
bool nse::copy_file( // copy file
	const std::string& filename_src,
	const std::string& filename_dest, const int dest_index)
{
	return copy_file(filename_src, append_index(filename_dest, dest_index));
}
+
// copy file, appending [src_index] to the source file name
bool nse::copy_file( // copy file
	const std::string& filename_src, const int src_index,
	const std::string& filename_dest)
{
	return copy_file(append_index(filename_src, src_index), filename_dest);
}
+
// copy file, appending indexes to both source and destination file names
bool nse::copy_file( // copy file
	const std::string& filename_src, const int src_index,
	const std::string& filename_dest, const int dest_index)
{
	return copy_file(append_index(filename_src, src_index),
		append_index(filename_dest, dest_index));
}
+
namespace nse
{
	// create the last directory level of [path];
	// returns true when the directory was created OR already exists
	bool create_dir(const std::string& path) {
#ifdef _WIN32
		const int status = _mkdir(path.c_str());
#else
		const int status = mkdir(path.c_str(), S_IRWXU);	// add group?: S_IRWXG
#endif

		if (status == 0) return true;
		return (status == -1) && (errno == EEXIST);
	}
}
+
namespace nse
{
	// remove directory at [path]; succeeds only for an existing empty directory
	bool remove_empty_dir(const std::string& path)
	{
#ifdef _WIN32
		const int status = _rmdir(path.c_str());
#else
		const int status = rmdir(path.c_str());
#endif

		return (status == 0);
	}
}
+
namespace nse
{
	// parse [token] as a base-10 integer into *value;
	// returns false on empty input, trailing garbage or out-of-range values
	bool is_integer(const char* token, int* value)
	{
		const int mode = 10;	// decimal only
		char* pend;

		if (strlen(token) == 0) return false;

		errno = 0;
		long lvalue = strtol(token, &pend, mode);
		if ((errno == ERANGE && (lvalue == LONG_MAX || lvalue == LONG_MIN)) ||
			(errno != 0 && lvalue == 0)) {
			return false;
		}
		// checking integer bounds //
		// fix: strict comparisons so INT_MAX / INT_MIN themselves are accepted
		// (the original used >=/<= and rejected the int boundary values)
		if ((lvalue > INT_MAX) || (lvalue < INT_MIN)) return false;
		if (*pend != '\0') return false;	// whole token must be consumed

		*value = (int)lvalue;
		return true;
	}
}
+
// check-only variant: parse [token] as int, discarding the value
bool nse::is_integer(const char* token)
{
	int value;
	return is_integer(token, &value);
}
+
namespace nse
{
	// parse [token] as a double into *value;
	// returns false on empty input, trailing garbage, overflow or underflow
	// (note: *value is written even when parsing fails)
	bool is_double(const char* token, double* value)
	{
		if (*token == '\0') return false;	// empty token

		char* pend;
		errno = 0;
		*value = strtod(token, &pend);

		// overflow / underflow reported by strtod via errno
		const bool range_fail =
			(errno == ERANGE) && (*value == HUGE_VAL || *value == -HUGE_VAL);
		const bool other_fail = (errno != 0) && (*value == (double)0);
		if (range_fail || other_fail) return false;

		return (*pend == '\0');	// whole token must be consumed
	}
}
+
// check-only variant: parse [token] as double, discarding the value
bool nse::is_double(const char* token)
{
	double value;
	return is_double(token, &value);
}
+
namespace nse
{
	// check that [token] is a valid C identifier: [A-Za-z_][A-Za-z0-9_]*
	bool is_valid_c_name(const char* token)
	{
		const int token_length = (int)strlen(token);

		if (token_length == 0) return false;
		// fix: cast to unsigned char - passing a (possibly negative) plain char
		// to isalpha/isalnum is undefined behavior
		if ((!isalpha((unsigned char)token[0])) && (token[0] != '_')) return false;

		for (int i = 1; i < token_length; i++) {
			if ((!isalnum((unsigned char)token[i])) && (token[i] != '_')) return false;
		}

		return true;
	}
}
+
namespace nse
{
	// check that [token] is a double-quoted string:
	// at least two characters, starting and ending with '"'
	bool is_valid_string(const char* token)
	{
		const size_t len = strlen(token);
		if (len < 2) return false;

		return (token[0] == '"') && (token[len - 1] == '"');
	}
}
+
namespace nse
{
	// parse [token] as a boolean literal ("true" / "false", exact match)
	bool is_boolean(const char* token, bool* value)
	{
		if (strcmp(token, "true") == 0) { *value = true; return true; }
		if (strcmp(token, "false") == 0) { *value = false; return true; }

		return false;
	}
}
+
// check-only variant: parse [token] as boolean, discarding the value
bool nse::is_boolean(const char* token)
{
	bool value;
	return is_boolean(token, &value);
}
+
// fgets() replacement for arbitrary line length:
// *buf (of capacity *size) is (re)allocated as needed and reused across calls
// (caller passes *size = 0 before the first call, frees *buf when done);
// returns *buf holding one full line (including '\n' if present),
// or NULL at end of file / read error
char* nse::getline(char** buf, int* size, FILE* ptr)
{
	const int c_min_size = 512;	// initial capacity and growth step

	// ensure a minimum-size buffer
	if (*size < c_min_size) {
		if (*size > 0) delete[](*buf);

		*size = c_min_size;
		*buf = new char[*size];
	}

	if (fgets(&(*buf)[0], *size, ptr) == NULL) return NULL;
	int bidx = strlen(*buf);

	// keep reading while the line did not fit (no trailing '\n' yet)
	while (!feof(ptr) && ((*buf)[bidx - 1] != '\n')) {	// reading more

														// allocating additional memory //
		*size += c_min_size;
		char *hbuf = new char[*size];
		strcpy(hbuf, *buf);

		// pointing buf to new memory //
		delete[](*buf);
		*buf = hbuf;

		// reading & removing termination character
		if (fgets(&(*buf)[bidx], *size - bidx, ptr) == NULL) {
			if (!feof(ptr)) return NULL;
		}
		bidx = strlen(*buf);
	}

	return *buf;
}
+
namespace nse
{
	// prepend a directory prefix: *name = dir + *name
	// (the old *name buffer is released and replaced with a new[] allocation)
	void append_dirname(char** name, const char* dir)
	{
		const size_t nsize = strlen(*name) + strlen(dir) + 1;

		char* buf = new char[nsize];
		strcpy(buf, dir);
		strcat(buf, *name);

		delete[] * name;
		*name = buf;
	}
}
+
namespace nse
{
	// append a component: *name += [delimiter] + subname
	// (delimiter is inserted only when *name is non-empty);
	// the buffer is grown and *mem_size updated when the result does not fit
	void append_name(char** name, int* mem_size,
		const char* subname, const char delimiter)
	{
		const int len = (int)strlen(*name);
		const int required = len + (int)strlen(subname) + 2;	// +delimiter +'\0'

		if (required > *mem_size)
		{
			// grow buffer, preserving current contents
			char *buf = new char[required];
			strcpy(buf, *name);

			delete[] * name;
			*name = buf;
			*mem_size = required;
		}

		if (len > 0) {	// delimiter goes between non-empty parts only
			(*name)[len] = delimiter;
			(*name)[len + 1] = '\0';
		}
		strcat(*name, subname);
	}
}
+
namespace nse
{
	// remove the last [delimiter]-separated component in place;
	// if no delimiter is present the whole string is cleared
	void truncate_name(char* name, const char delimiter)
	{
		char* pos = strrchr(name, delimiter);
		if (pos == NULL)
			name[0] = '\0';
		else
			*pos = '\0';
	}
}
+
namespace nse
{
	// copy b into a, skipping every occurrence of [sym];
	// a must be large enough to hold the result (at most strlen(b)+1 chars)
	void strcpyrm(char* a, const char* b, const char sym)
	{
		// fix: compute the source length once - the original called strlen(b)
		// in the loop condition, making the copy accidentally O(n^2)
		const int blen = (int)strlen(b);
		int i = 0;
		for (int k = 0; k < blen; k++) {
			if (b[k] == sym) continue;
			a[i] = b[k];
			i++;
		}
		a[i] = '\0';
	}
}
+
+// * convert to string * //
+// -------------------------------------------------------------------------------------------- //
namespace nse
{
	// int -> decimal string
	// (stream-based on purpose: std::to_string is broken on some gcc versions)
	std::string convert_to_string(const int value)
	{
		std::ostringstream oss;
		oss << value;
		return oss.str();
	}
}
+
namespace nse
{
	// double -> string using default ostream formatting (6 significant digits)
	// (stream-based on purpose: std::to_string is broken on some gcc versions)
	std::string convert_to_string(const double value)
	{
		std::ostringstream oss;
		oss << value;
		return oss.str();
	}
}
+// -------------------------------------------------------------------------------------------- //
+
+//
+// Implementation - "FileParser" //
+//
+
// allocate initial (empty) token storage of c_alloc_init entries
nse::FileParser::FileParser()
{
	ntokens = 0;
	nalloc_tokens = c_alloc_init;

	token = new char*[nalloc_tokens];
	lineId = new int[nalloc_tokens];
}
+nse::FileParser::~FileParser()
+{
+	ntokens = 0;
+	nalloc_tokens = 0;
+
+	for (int i = 0; i < ntokens; i++) {
+		delete[] token[i];
+	}
+	delete[] token;
+	delete[] lineId;
+}
+
// store a copy of [new_token] with its source line number;
// empty tokens are ignored; storage grows by c_alloc_init entries when full
void nse::FileParser::add(const char* new_token, const int nline)
{
	if (strlen(new_token) == 0) return;

	if (ntokens >= nalloc_tokens) {
		// grow both parallel arrays, preserving existing entries
		char** hlist = new char*[nalloc_tokens + c_alloc_init];
		int* hid = new int[nalloc_tokens + c_alloc_init];

		memcpy(hlist, token, sizeof(char*)* nalloc_tokens);
		memcpy(hid, lineId, sizeof(int)* nalloc_tokens);

		delete[] token;
		delete[] lineId;

		token = hlist;
		lineId = hid;

		nalloc_tokens += c_alloc_init;
	}

	token[ntokens] = new char[strlen(new_token) + 1];
	strcpy(token[ntokens], new_token);

	lineId[ntokens] = nline;

	ntokens++;
}
+
// tokenize [filename]:
//  - text after [comment_sym] on a line is discarded,
//  - tokens are separated by whitespace,
//  - each character of [special_syms] forms its own single-char token,
//  - with [check_string_quotes], "..." is kept as one token (quotes included);
// returns false (and clears the token list) on open/validation/parse failure
bool nse::FileParser::run(const char* filename,
	const char comment_sym, const char* special_syms,
	const bool check_string_quotes)
{
	ntokens = 0;

	// - checking input parameters
	if ((isspace(comment_sym)) || (comment_sym == '\0') ||
		((comment_sym == '"') && check_string_quotes) ||
		(strchr(special_syms, comment_sym) != NULL))
	{
		printf(" PARSE:> incorrect comments symbol: '%c'\n",
			comment_sym);
		return false;
	}
	for (int i = 0; i < (int)strlen(special_syms); i++) {
		if ((isspace(special_syms[i])) ||
			((special_syms[i] == '"') && check_string_quotes))
		{
			// NOTE(review): unlike the comment-symbol check above, this branch
			// only prints a warning and does NOT return false - confirm whether
			// a `return false;` is missing here
			printf(" PARSE:> incorrect special symbol: '%c'\n",
				special_syms[i]);
		}
	}

	FILE *ptr = fopen(filename, "rt");
	if (ptr == NULL) {
		printf(" PARSE:> failed to open file: '%s'\n", filename);
		return false;
	}

	// line buffer managed by nse::getline (allocated on first call)
	char *buf;
	int buf_size = 0;

	int nline = 1;
	bool is_error = false;

	while ((!is_error) && (getline(&buf, &buf_size, ptr) != NULL))
	{
		// - removing comments from line
		char *is_comment = strchr(buf, comment_sym);
		if (is_comment != NULL) *is_comment = '\0';

		// - parsing line char-by-char
		//		strtok is messy for handling special symbols and quotes
		char *pbuf_begin = buf, *pbuf_end;
		while (*pbuf_begin != '\0') {

			// skip leading whitespace before the next token
			while (isspace(*pbuf_begin))
				pbuf_begin++;

			if (*pbuf_begin == '\0') break;	// end of string check

			pbuf_end = pbuf_begin + 1;
			if (check_string_quotes && (*pbuf_begin == '"'))
			{
				// handling strings in quotes
				while ((*pbuf_end != '\0') && (*pbuf_end != '"'))
				{
					pbuf_end++;
				}

				if (*pbuf_end == '\0') {	// missing closing quote
					is_error = true;
					printf(" PARSE:> missing string closing quote: '%s' (line, %i)\n",
						pbuf_begin, nline);
					break;	// exiting on failure
				}
				pbuf_end++;	// adding closing quote to token
			}
			else
			{
				// special symbols are delimiters
				int is_begin_special =
					(strchr(special_syms, *pbuf_begin) != NULL);

				// a special symbol is a one-char token; a regular token runs
				// until whitespace, end of line or a special symbol
				while ((!isspace(*pbuf_end)) &&
					(*pbuf_end != '\0') &&
					(!is_begin_special) &&
					(strchr(special_syms, *pbuf_end) == NULL))
				{
					pbuf_end++;
				}
			}

			// adding string to list of tokens
			// (temporarily terminate the token in place, then restore)
			char end_sym = *pbuf_end;
			*pbuf_end = '\0';

			add(pbuf_begin, nline);
			*pbuf_end = end_sym;

			pbuf_begin = pbuf_end;
		}
		nline++;
	}

	if (is_error) ntokens = 0;
	if (buf_size > 0) delete[] buf;

	fclose(ptr);
	return !is_error;
}
+
// number of parsed tokens
int nse::FileParser::get_ntokens() const
{
	return ntokens;
}

// token text by index (NULL if index is out of range)
const char* nse::FileParser::get_token(const int idx) const
{
	if ((idx < 0) || (idx >= ntokens)) return NULL;
	return token[idx];
}

// source line number of token [idx] (0 if index is out of range)
int nse::FileParser::get_line_num(const int idx) const
{
	if ((idx < 0) || (idx >= ntokens)) return 0;
	return lineId[idx];
}

// debug print: one token per line, pausing for stdin input after each
void nse::FileParser::print() const
{
	for (int i = 0; i < ntokens; i++) {
		printf(" PARSE:> token[%i] (line, %i): '%s'\n", i, lineId[i], token[i]);
		getc(stdin);	// wait for user before printing the next token
	}
}
diff --git a/str-com.h b/str-com.h
new file mode 100644
index 0000000000000000000000000000000000000000..b540acff0ec005deb197726b5c615d0eea958430
--- /dev/null
+++ b/str-com.h
@@ -0,0 +1,106 @@
+#pragma once
+
+// [str-com.h]: general nse subroutines for file & string operations
+//
+// -------------------------------------------------------------------------------------------- //
+
+#define _CRT_SECURE_NO_DEPRECATE
+#include <stdio.h>
+#include <string>
+
namespace nse
{
	// append [index] before the file-name extension;
	// the C-string version returns a new[]-allocated string (NULL if no '.')
	char* append_index(const char* name, const int index);
	const std::string append_index(const std::string& name, const int index);
	// append [sub][index] before the file-name extension
	const std::string append_index(const std::string& name,
		const std::string& sub, const int index);
	// append [sub] before the file-name extension
	const std::string append_string(const std::string& name,
		const std::string& sub);

	// binary file copy; indexed variants append the index to the
	// corresponding file name; return false if a file cannot be opened
	bool copy_file(const std::string& filename_src, const std::string& filename_dest);
	bool copy_file(const std::string& filename_src,
		const std::string& filename_dest, const int dest_index);
	bool copy_file(const std::string& filename_src, const int src_index,
		const std::string& filename_dest);
	bool copy_file(const std::string& filename_src, const int src_index,
		const std::string& filename_dest, const int dest_index);

	// create directory on last level of path
	//		<true> - on create and if exists
	bool create_dir(const std::string& path);
	// remove directory (must be empty); <true> on success
	bool remove_empty_dir(const std::string& path);


	// token classification; two-argument forms also return the parsed value
	bool is_integer(const char* token, int* value);
	bool is_integer(const char* token);

	bool is_double(const char* token, double* value);
	bool is_double(const char* token);

	// C identifier: [A-Za-z_][A-Za-z0-9_]*
	bool is_valid_c_name(const char* token);

	// double-quoted string of length >= 2: "..."
	bool is_valid_string(const char* string);

	// "true" / "false" literals only
	bool is_boolean(const char* token, bool* value);
	bool is_boolean(const char* token);


	// - fgets() for arbitrary line length
	//			<buf> - pointer to current buffer
	//			<size> - current buffer size
	char* getline(char** buf, int* size, FILE* ptr);

	// name = dir + name
	void append_dirname(char** name, const char* dir);

	// name += [delimiter]subname
	void append_name(char** name, int* mem_size,
		const char* subname, const char delimiter);
	// name -= (last)[delimiter]subname part
	void truncate_name(char* name, const char delimiter);

	// copy a = b and remove [sym]
	void strcpyrm(char* a, const char* b, const char sym);

	// convert int to string
	// *: implemented due to problems with std::to_string on some gcc compilers
	// -------------------------------------------------------------------------------------------- //
	std::string convert_to_string(const int value);
	std::string convert_to_string(const double value);
	// -------------------------------------------------------------------------------------------- //

}
+
namespace nse
{
	// simple file tokenizer: reads a text file and splits it into tokens
	// (whitespace-separated words, single special symbols, optional "..." strings)
	class FileParser {
	public:		// interface
		FileParser();
		~FileParser();

		// tokenize [filename]; [comment_sym] starts a line comment,
		// each char of [special_syms] forms its own single-char token,
		// [check_string_quotes] keeps "..." as one token;
		// returns false on open/parse failure (token list is cleared)
		bool run(const char* filename,
			const char comment_sym, const char* special_syms,
			const bool check_string_quotes);

		int get_ntokens() const;

		// token text / source line number by index (NULL / 0 if out of range)
		const char* get_token(const int idx) const;
		int get_line_num(const int idx) const;

		// debug print: writes each token, pausing for stdin input after each
		void print() const;

	private:	// interface

		// append a copy of [token] read at line [nline], growing storage as needed
		void add(const char* token, const int nline);

	private:	// data

		static const int c_alloc_init = 256;	// allocation step (tokens)

		int nalloc_tokens;	// allocated capacity of the token arrays
		int ntokens;		// number of stored tokens

		char **token;	// token strings
		int *lineId;	// source line number per token
	};
}
diff --git a/time-series.cpp b/time-series.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3bcf3b3c53a9a9b893681142db86da6fc8c9279e
--- /dev/null
+++ b/time-series.cpp
@@ -0,0 +1,276 @@
+#define _CRT_SECURE_NO_DEPRECATE
+#include "time-series.h"
+
+#include "nse-alloc.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
// empty series: no variables, no storage (data/varname allocated later by set())
nse::timeSeries::timeSeries() : nvar(0), size(0), ptr(0) { }
+nse::timeSeries::timeSeries(const timeSeries& series)
+{
+	if ((series.nvar > 0) && (series.size > 0)) {
+		nvar = series.nvar;
+		size = series.size; ptr = series.ptr;
+
+		allocate(&data, (nvar + 1) * size);
+		varname = new char*[nvar];
+		for (int k = 0; k < nvar; k++) {
+			varname[k] = new char[c_max_name_length + 1];
+			strcpy(varname[k], series.varname[k]);
+		}
+		mcopy(data, series.data, (nvar + 1) * size);
+	}
+}
// release storage; allocated only if set() (or a non-empty copy) was performed
nse::timeSeries :: ~timeSeries()
{
	if (nvar > 0) {
		for (int k = 0; k < nvar; k++)
			delete[] varname[k];
		delete[] varname;
		deallocate(data);

		nvar = 0;
		size = 0; ptr = 0;
	}
}
+
// define the number of variables and allocate initial storage;
// fails if n <= 0 or the series was already set up (size > 0)
bool nse::timeSeries::set(const int n)
{
	if ((n <= 0) || (size > 0)) return false;

	nvar = n;
	size = c_seq_init_length; ptr = 0;

	// each record holds [time, var_0 ... var_{nvar-1}]
	allocate(&data, (nvar + 1) * size);
	varname = new char*[nvar];
	for (int k = 0; k < nvar; k++) {
		varname[k] = new char[c_max_name_length + 1];
		strncpy(varname[k], "\0", c_max_name_length + 1);	// zero-fill the name buffer
	}

	return true;
}
+
// start a new output file: write the header
// [nvar][max name length][nvar fixed-size variable names]
bool nse::timeSeries::init(const std::string& filename) const
{
	FILE* file_ptr;
	file_ptr = fopen(filename.c_str(), "wb");
	if (file_ptr == NULL) return false;

	int nstatus = 0;
	nstatus += fwrite(&nvar, sizeof(int), 1, file_ptr);
	int name_length = c_max_name_length;
	nstatus += fwrite(&name_length, sizeof(int), 1, file_ptr);

	for (int k = 0; k < nvar; k++)
		nstatus += fwrite(varname[k], sizeof(char), c_max_name_length + 1, file_ptr);

	fclose(file_ptr);
	// expected item count: 2 header ints + nvar * (name length + 1) chars
	return (nstatus == (nvar * (c_max_name_length + 1) + 2));
}
+
// check that an existing file's header (variable count, name length)
// matches this series, so records can safely be appended by write()
bool nse::timeSeries::init_append(const std::string& filename) const
{
	FILE* file_ptr;
	file_ptr = fopen(filename.c_str(), "rb");
	if (file_ptr == NULL) return false;

	int nstatus = 0;
	int check_nvar, check_name_length;
	nstatus += fread(&check_nvar, sizeof(int), 1, file_ptr);
	nstatus += fread(&check_name_length, sizeof(int), 1, file_ptr);

	fclose(file_ptr);

	if ((nstatus != 2) ||
		(check_nvar != nvar) ||
		(check_name_length != c_max_name_length)) return false;

	return true;
}
+
// rewind to the beginning (storage is kept; records get overwritten on push)
void nse::timeSeries::reset() { ptr = 0; }

// number of complete records currently stored
long int nse::timeSeries::length() const { return ptr; }
+
// set variable [idx] of the current record; false if idx is out of range
template< typename T >
bool nse::timeSeries::push(const int idx, const T value)
{
	if ((idx < 0) || (idx > nvar - 1)) return false;

	// record layout: [time, var_0 ... var_{nvar-1}] -> variable k at offset k+1
	data[ptr * (nvar + 1) + idx + 1] = (double)value;
	return true;
}

template bool nse::timeSeries::push(const int idx, const float value);
template bool nse::timeSeries::push(const int idx, const double value);
+
// set the variable matching [name] in the current record;
// false if no variable has that name
template< typename T >
bool nse::timeSeries::push(const char* name, const T value)
{
	for (int k = 0; k < nvar; k++)
		if (!strcmp(varname[k], name)) {
			push(k, value);	// k is a valid index here, inner push cannot fail
			return true;
		}
	return false;
}

template bool nse::timeSeries::push(const char* name, const float value);
template bool nse::timeSeries::push(const char* name, const double value);
+
// set the time stamp of the current record and advance to the next record,
// growing storage by c_seq_init_length records when nearly full
template< typename T >
bool nse::timeSeries::push_time(const T value)
{
	if ((nvar == 0) || (size == 0)) return false;

	data[ptr * (nvar + 1)] = (double)value;	// time at record offset 0
	if (ptr == size - 1) {
		// grow: copy data aside, reallocate larger, copy back
		double* save;
		allocate(&save, (nvar + 1) * size);
		mcopy(save, data, size * (nvar + 1));

		deallocate(data);

		allocate(&data, (size + c_seq_init_length) * (nvar + 1));
		mcopy(data, save, size * (nvar + 1));
		size += c_seq_init_length;

		deallocate(save);
	}

	ptr++;
	return true;
}

template bool nse::timeSeries::push_time(const float value);
template bool nse::timeSeries::push_time(const double value);
+
// assign a name to variable [idx];
// names longer than c_max_name_length are truncated to fit
bool nse::timeSeries::name_variable(const int idx, const char* name)
{
	if ((idx < 0) || (idx > nvar - 1)) return false;

	if (strlen(name) < c_max_name_length + 1)
		strcpy(varname[idx], name);
	else
	{
		strncpy(varname[idx], name, c_max_name_length);
		varname[idx][c_max_name_length] = '\0';
	}

	return true;
}
+
// append all current records to an existing file (created by init());
// first verifies that the file's variable count matches this series
bool nse::timeSeries::write(const std::string& filename) const
{
	if ((nvar == 0) || (size == 0)) return false;
	if (ptr == 0) return true;	// nothing to write is not an error

	FILE* file_ptr;

	// check header compatibility before appending
	file_ptr = fopen(filename.c_str(), "rb");
	if (file_ptr == NULL) return false;

	int nstatus = 0;
	int check_nvar;
	nstatus += fread(&check_nvar, sizeof(int), 1, file_ptr);
	fclose(file_ptr);

	if ((nstatus != 1) || (check_nvar != nvar)) return false;

	file_ptr = fopen(filename.c_str(), "ab");
	if (file_ptr == NULL) return false;

	// records are stored back-to-back as raw doubles: [time, vars...] per record
	nstatus = 0;
	nstatus += fwrite(data, sizeof(double), (nvar + 1) * ptr, file_ptr);

	fclose(file_ptr);
	return (nstatus == ((nvar + 1) * ptr));
}
+
// read the file header (variable count and names) and configure this
// series accordingly via set(); fails if the series is already set up
bool nse::timeSeries::read_head(const std::string& filename)
{
	if (size > 0) return false;

	FILE *file_ptr = fopen(filename.c_str(), "rb");
	if (file_ptr == NULL) return false;

	int nstatus;
	int n, name_length;

	nstatus = fread(&n, sizeof(int), 1, file_ptr);
	if (nstatus != 1) { 
		fclose(file_ptr); 
		return false; 
	}
	if (!set(n)) { 
		fclose(file_ptr); 
		return false; 
	}

	// name length stored in the file may differ from c_max_name_length
	nstatus = fread(&name_length, sizeof(int), 1, file_ptr);
	if ((nstatus != 1) || (name_length < 0)) { 
		fclose(file_ptr); 
		return false; 
	}

	char* name = new char[name_length + 1];
	for (int k = 0; k < nvar; k++) {
		nstatus = fread(name, sizeof(char), name_length + 1, file_ptr);
		if (nstatus != name_length + 1) {
			delete[] name;
			fclose(file_ptr);
			return false;
		}
		if (!name_variable(k, name)) {
			delete[] name;
			fclose(file_ptr);
			return false;
		}
	}

	delete[] name;
	fclose(file_ptr);
	return true;
}
+
+bool nse::timeSeries::read(const std::string& filename)
+{
+	if (!read_head(filename)) return false;
+
+	FILE *file_ptr = fopen(filename.c_str(), "rb");
+	if (file_ptr == NULL) return false;
+
+	int nstatus;
+	int n, name_length;
+
+	nstatus = fread(&n, sizeof(int), 1, file_ptr);
+	if (nstatus != 1) {
+		fclose(file_ptr);
+		return false;
+	}
+	nstatus = fread(&name_length, sizeof(int), 1, file_ptr);
+	if (nstatus != 1) {
+		fclose(file_ptr);
+		return false;
+	}
+	fseek(file_ptr, (name_length + 1) * n, SEEK_CUR);
+
+	double *elem = new double[(nvar + 1)];
+
+	nstatus = fread(elem, sizeof(double), nvar + 1, file_ptr);
+	while (nstatus == nvar + 1) {
+		for (int k = 0; k < nvar; k++) push(k, elem[k]);
+		push_time(elem[nvar]);
+	}
+	delete[] elem;
+	if ((nstatus != 0) || (!feof(file_ptr))) {
+		fclose(file_ptr);
+		return false;
+	}
+
+	fclose(file_ptr);
+	return true;
+}
diff --git a/time-series.h b/time-series.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f904467e70f56fb753cc8d7787c50bf30019248
--- /dev/null
+++ b/time-series.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <string>
+
+// * time series container * //
namespace nse
{
	// fixed set of named variables sampled over time; storage grows
	// automatically; each record is laid out as [time, var_0 ... var_{nvar-1}]
	class timeSeries {
	public:
		bool set(const int nvar);   // define number of variables, allocate storage
		void reset();	            // reset sequence to beginning

		long int length() const;   // get sequence length

								   // push variable referenced by index or name //
		template< typename T >
		bool push(const int idx, const T value);
		template< typename T >
		bool push(const char* name, const T value);
		// push sequence time [ step to next element ] //
		template< typename T >
		bool push_time(const T value);

		// name variable referenced by index
		bool name_variable(const int idx, const char* name);

		bool init(const std::string& filename) const;			// init: new output time series file
		bool init_append(const std::string& filename) const;	// init: append to output time series file
		bool write(const std::string& filename) const;			// write data to file

		// read header only / header plus all records
		bool read_head(const std::string& filename);
		bool read(const std::string& filename);

		// processing: min/max and statistics of variable [idx],
		// over the whole series or over a [beg_time, end_time] window
		template< typename T >
		bool minmax(T* vmin, T* vmax, const int idx) const;
		template< typename T >
		bool minmax(T* vmin, T* vmax, const int idx,
			const T beg_time, const T end_time) const;

		template< typename T >
		bool stats(T* mean, T* rms, T* deviation, const int idx) const;
		template< typename T >
		bool stats(T* mean, T* rms, T* deviation, const int idx,
			const T beg_time, const T end_time) const;

		timeSeries();
		timeSeries(const timeSeries& seq);
		~timeSeries();

	private:
		static const int c_max_name_length = 32;		// max variable name length
		static const int c_seq_init_length = 16 * 1024;	// initial capacity / growth step (records)

		int nvar;				// number of variables in sequence
		long int size, ptr;		// sequence size and current length

		double* data;	        // series data
		char** varname;		    // variable names
	};
}
diff --git a/time-slice.cpp b/time-slice.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ea91b4869d95234ccaca2e2afcd6d0d6d958f735
--- /dev/null
+++ b/time-slice.cpp
@@ -0,0 +1,496 @@
+#define _CRT_SECURE_NO_DEPRECATE
+#include "time-slice.h"
+
+#include "mem-stx.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+// * Time-Slice [Running time average] * //
// empty accumulator: no storage until set(); counters and time stamps cleared
nse::timeSlice::timeSlice() : psum_size(psum_size_default),
psum_num(0), slice_ptr(0), slice_num(0),
slice_size(0), psum_num_max(0)
{
	slice_list_time[0] = (double)0;
	slice_list_time[1] = (double)0;
}

nse::timeSlice :: ~timeSlice() { clear(); }
+
// set with default partial-sum size and default initial number of partial sums
bool nse::timeSlice::set(const int _slice_size)
{
	return set(_slice_size,
		psum_size_default, psum_num_init);
}

// set with default initial number of partial sums
bool nse::timeSlice::set(const int _slice_size,
	const int _psum_size)
{
	return set(_slice_size, _psum_size,
		psum_num_init);
}
+
// allocate storage: [_slice_size] values per slice, partial sums over
// [_psum_size] slices each, initial capacity for [_psum_num_max] partial sums;
// any previous state is released via clear()
bool nse::timeSlice::set(const int _slice_size,
	const int _psum_size,
	const int _psum_num_max)
{
	if ((_slice_size <= 0) ||
		(_psum_size <= 0) || (_psum_num_max <= 0)) return false;

	clear();
	slice_size = _slice_size;
	psum_size = _psum_size;
	psum_num_max = _psum_num_max;

#ifndef EXCLUDE_SLICE_LIST
	// raw storage for the individual slices of the current working list
	allocate(&slice_list, psum_size);
	for (int k = 0; k < psum_size; k++)
		allocate(&slice_list[k], slice_size);
#endif

	// running sum of the current (incomplete) partial sum
	allocate(&slice_sum, slice_size);
	for (int i = 0; i < slice_size; i++)
		slice_sum[i] = (double)0;

	// completed partial sums
	allocate(&psum_list, psum_num_max);
	for (int k = 0; k < psum_num_max; k++)
		allocate(&psum_list[k], slice_size);

	// time stamps //
	slice_list_time[0] = (double)0;
	slice_list_time[1] = (double)0;
	allocate(&psum_time[0], &psum_time[1], psum_num_max);
	for (int k = 0; k < psum_num_max; k++) {
		psum_time[0][k] = (double)0;
		psum_time[1][k] = (double)0;
	}

	return true;
}
+
// release all storage and reset counters; safe to call on an empty object
// (allocation is indicated by slice_size > 0)
void nse::timeSlice::clear()
{
	if (slice_size > 0) {

#ifndef EXCLUDE_SLICE_LIST
		for (int k = 0; k < psum_size; k++)
			deallocate(slice_list[k]);
		deallocate(slice_list);
#endif
		deallocate(slice_sum);

		for (int k = 0; k < psum_num_max; k++)
			deallocate(psum_list[k]);
		deallocate(psum_list);

		// time stamps //
		deallocate(psum_time[0], psum_time[1]);
	}

	psum_num = 0;
	psum_num_max = 0;
	slice_ptr = 0;
	slice_num = 0;
	slice_size = 0;

	// time stamps //
	slice_list_time[0] = (double)0;
	slice_list_time[1] = (double)0;
}
+
// add slice X (slice_size values, in memory space <mem>) with time stamp [time];
// every psum_size slices the working list is collapsed into one partial sum
// (stored as the MEAN over its slices), growing psum storage when needed
template< nse::memType mem, typename T1, typename T2 >
bool nse::timeSlice::push(const T1* const X, const T2 time)
{
	if (slice_size <= 0) return false;

#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU) {
		// GPU input: stage through a host buffer, then recurse with memCPU
		T1* buf;
		int buf_id = memStx::get_buf(&buf, slice_size);

		mcopy<memCPU, mem>(buf, X, slice_size);
		bool status = push<memCPU>(buf, time);

		memStx::free_buf(buf_id);
		return status;
	}
#endif

	int i;
	double x_value;
#pragma omp parallel for private(i, x_value)
	for (i = 0; i < slice_size; i++) {	// adding new element to list
		x_value = (double)X[i];

#ifndef EXCLUDE_SLICE_LIST
		slice_list[slice_ptr][i] = x_value;
#endif
		slice_sum[i] += x_value;
	}

	// slice time stamp
	if (slice_ptr == 0) slice_list_time[0] = (double)time;
	slice_list_time[1] = (double)time;

	slice_ptr++;
	slice_num++;

	// checking if list is full
	if (slice_ptr >= psum_size) {

		// add partial sum of list at position [psum_num]
		mcopy(psum_list[psum_num], slice_sum, slice_size);

		// normalize: partial sums store the mean over psum_size slices
		const double div_value = (double) 1.0 / psum_size;
#pragma omp parallel for private(i)
		for (i = 0; i < slice_size; i++)
			psum_list[psum_num][i] *= div_value;

		// psum time stamp //
		psum_time[0][psum_num] = slice_list_time[0];
		psum_time[1][psum_num] = slice_list_time[1];

		psum_num++;
		// add additional memory to partial sums vector
		if (psum_num >= psum_num_max) {

			// psum additional memory 
			double **ptr_psum;
			allocate(&ptr_psum, psum_num_max + psum_num_add);
			for (int k = 0; k < psum_num_max; k++) {
				allocate(&ptr_psum[k], slice_size);
				mcopy(ptr_psum[k], psum_list[k], slice_size);
				deallocate(psum_list[k]);
			}
			deallocate(psum_list);
			for (int k = psum_num_max; k < psum_num_max + psum_num_add; k++)
				allocate(&ptr_psum[k], slice_size);

			psum_list = ptr_psum;
			// ----------------------------------

			// psum time stamp additional memory
			double *ptr_psum_time[2];
			allocate(&ptr_psum_time[0], &ptr_psum_time[1], psum_num_max + psum_num_add);
			mcopy(ptr_psum_time[0], psum_time[0], psum_num_max);
			mcopy(ptr_psum_time[1], psum_time[1], psum_num_max);

			deallocate(psum_time[0], psum_time[1]);
			for (int k = psum_num_max; k < psum_num_max + psum_num_add; k++) {
				ptr_psum_time[0][k] = (double)0;
				ptr_psum_time[1][k] = (double)0;
			}

			psum_time[0] = ptr_psum_time[0];
			psum_time[1] = ptr_psum_time[1];
			// ----------------------------------

			psum_num_max += psum_num_add;
		}

		// start a new (empty) working list
		slice_ptr = 0;
#pragma omp parallel for private(i)
		for (i = 0; i < slice_size; i++)
			slice_sum[i] = (double)0;

		// null slice list time stamp 
		slice_list_time[0] = (double)0;
		slice_list_time[1] = (double)0;
	}

	return true;
}
template bool nse::timeSlice::push<nse::memCPU>(const float* const X, const float time);
template bool nse::timeSlice::push<nse::memCPU>(const float* const X, const double time);
template bool nse::timeSlice::push<nse::memCPU>(const double* const X, const double time);
template bool nse::timeSlice::push<nse::memCPU>(const double* const X, const float time);

#ifndef EXCLUDE_GPU_BRANCH
template bool nse::timeSlice::push<nse::memGPU>(const float* const X, const float time);
template bool nse::timeSlice::push<nse::memGPU>(const float* const X, const double time);

template bool nse::timeSlice::push<nse::memGPU>(const double* const X, const double time);
template bool nse::timeSlice::push<nse::memGPU>(const double* const X, const float time);
#endif
+
// time average over ALL accumulated slices into X (slice_size values):
// completed partial sums (each the mean over psum_size slices) are weighted by
// psum_size/slice_num; the incomplete working list contributes slice_ptr/slice_num
// NOTE(review): divides by slice_num - assumes at least one slice was pushed;
// confirm callers guarantee this
template< typename T >
void nse::timeSlice::average(T* X) const
{
	int i, k;
	double sum_i;
	const double psum_div =
		(double)psum_size / (double)slice_num;

#pragma omp parallel for private( i, k, sum_i ) shared( X )
	for (i = 0; i < slice_size; i++) {

		sum_i = (double)0;
		for (k = 0; k < psum_num; k++)
			sum_i += psum_list[k][i];
		sum_i *= psum_div;

		// contribution of the incomplete working list, weighted by its share
		if (slice_ptr > 0)
			sum_i += (slice_sum[i] / ((double)slice_ptr)) *
			((double)slice_ptr / (double)slice_num);

		X[i] = (T)sum_i;
	}
}
template void nse::timeSlice::average(float* X) const;
template void nse::timeSlice::average(double* X) const;
+
// time average over slices whose time stamps fall within [time_begin, time_end]:
// uses the complete partial sums fully inside the window, plus the current
// working list if its whole time span is inside the window
// NOTE(review): if the window matches nothing, nslice == 0 and the division
// below produces inf/nan - confirm callers guarantee a non-empty window
template< typename T >
void nse::timeSlice::average(T* X, const T time_begin, const T time_end) const
{
	// first partial sum starting at/after time_begin
	int psum_begin = psum_num;
	for (int k = 0; k < psum_num; k++)
		if (psum_time[0][k] >= time_begin) {
			psum_begin = k;
			break;
		}

	// last partial sum ending at/before time_end
	int psum_end = -1;
	for (int k = psum_num - 1; k >= 0; k--)
		if (psum_time[1][k] <= time_end) {
			psum_end = k;
			break;
		}

	// include the incomplete working list only if fully inside the window
	int use_current_list = (slice_ptr > 0) &&
		(slice_list_time[0] >= time_begin) &&
		(slice_list_time[1] <= time_end);

	// total number of slices contributing to the window average
	int nslice = (psum_end - psum_begin + 1) * psum_size +
		use_current_list * slice_ptr;

	int i, k;
	double sum_i;
	const double psum_div =
		(double)psum_size / (double)nslice;

#pragma omp parallel for private( i, k, sum_i ) shared( X ) \
	shared(psum_begin, psum_end, use_current_list, nslice)
	for (i = 0; i < slice_size; i++) {

		sum_i = (double)0;
		for (k = psum_begin; k <= psum_end; k++)
			sum_i += psum_list[k][i];
		sum_i *= psum_div;

		if ((slice_ptr > 0) && (use_current_list))
			sum_i += (slice_sum[i] / ((double)slice_ptr)) *
			((double)slice_ptr / (double)nslice);

		X[i] = (T)sum_i;
	}
}
template void nse::timeSlice::average(float* X,
	const float time_begin, const float time_end) const;
template void nse::timeSlice::average(double* X,
	const double time_begin, const double time_end) const;
+
+bool nse::timeSlice::write(const std::string& filename, const char* mode,
+	const char* name, const double time) const
+{
+	// Serialize this timeSlice as one binary record in [filename].
+	// mode: "wb" starts a new file (writes the format id first),
+	//       "ab" appends a record to an existing file.
+	// Returns true only if the full expected element count was written.
+	if (strcmp(mode, "wb") && strcmp(mode, "ab")) return false;
+
+	int ncount;
+	FILE *file_ptr = fopen(filename.c_str(), mode);
+	if (file_ptr == NULL) return false;
+
+	// file format id: sum of the character codes of "nseslice"
+	// (weak signature -- verified by read())
+	int id = 'n' + 's' + 'e' +
+		's' + 'l' + 'i' + 'c' + 'e';
+	if (!strcmp(mode, "wb")) {
+		ncount = fwrite(&id, sizeof(int), 1, file_ptr);
+		if (ncount != 1) {
+			fclose(file_ptr);
+			return false;
+		}
+	}
+
+	int header[6];
+	header[0] = psum_size;		// size of partial sum
+	header[1] = psum_num_max;	// max number of available psums
+	header[2] = psum_num;		// number of current psums
+	header[3] = slice_ptr;		// current number of elements in working list
+	header[4] = slice_num;		// full number of elements(slices)
+	header[5] = slice_size;		// size of element(slice)
+
+	// write header //
+	ncount = fwrite(header, sizeof(int), 6, file_ptr);
+
+	// time stamp //
+	double time_stamp = time;
+	ncount += fwrite(&time_stamp, sizeof(double), 1, file_ptr);
+
+	// slice name: length-prefixed, no null terminator //
+	int name_length = strlen(name);
+	ncount += fwrite(&name_length, sizeof(int), 1, file_ptr);
+	ncount += fwrite(name, sizeof(char), name_length, file_ptr);
+
+	// write partial sums //
+	for (int k = 0; k < psum_num; k++)
+		ncount += fwrite(psum_list[k], sizeof(double), slice_size, file_ptr);
+
+#ifndef EXCLUDE_SLICE_LIST
+	// write current elements in list //
+	for (int k = 0; k < slice_ptr; k++)
+		ncount += fwrite(slice_list[k], sizeof(double), slice_size, file_ptr);
+#endif
+
+	// write current partial sum //
+	ncount += fwrite(slice_sum, sizeof(double), slice_size, file_ptr);
+
+	// write time stamps //
+	ncount += fwrite(slice_list_time, sizeof(double), 2, file_ptr);
+	if (psum_num > 0) {
+		ncount += fwrite(psum_time[0], sizeof(double), psum_num, file_ptr);
+		ncount += fwrite(psum_time[1], sizeof(double), psum_num, file_ptr);
+	}
+
+	fclose(file_ptr);
+	// expected count: 6 header ints + 1 time stamp + 1 name length
+	// + 2 list time stamps = 10, plus name chars, psum stamps and data
+	return (ncount ==
+		(10 + name_length + 2 * psum_num +
+#ifndef EXCLUDE_SLICE_LIST
+		(psum_num + slice_ptr + 1) * slice_size));
+#else
+			(psum_num + 1) * slice_size));
+#endif
+}
+
+bool nse::timeSlice::read(const std::string& filename,
+	const int obj_offset)
+{
+	// Read the [obj_offset]-th timeSlice record from [filename].
+	// Records are skipped by parsing (their size varies with psum_num,
+	// slice_ptr and the name length, so no direct fseek is possible).
+	// (Re)allocates this object via set() and fills all data arrays.
+	// Returns true only if the expected element count was read.
+	FILE *file_ptr = fopen(filename.c_str(), "rb");
+	if (file_ptr == NULL) return false;
+
+	// expected file id: must match the signature written by write()
+	const int id = 'n' + 's' + 'e' +
+		's' + 'l' + 'i' + 'c' + 'e';
+	int ncount;
+	int file_id, header[6];
+	double time_stamp;
+	int slice_name_length;
+	char* slice_name;
+
+	// read id //
+	ncount = fread(&file_id, sizeof(int), 1, file_ptr);
+	if ((ncount != 1) || (file_id != id)) {
+		fclose(file_ptr);
+		return false;
+	}
+
+	// seek file offset: parse and discard [obj_offset] records //
+	for (int m = 0; m < obj_offset; m++) {
+		ncount = fread(header, sizeof(int), 6, file_ptr);			// header
+
+		ncount += fread(&time_stamp, sizeof(double), 1, file_ptr);	// time stamp
+
+		// slice name //
+		ncount += fread(&slice_name_length, sizeof(int), 1, file_ptr);
+
+		slice_name = new char[slice_name_length + 1];
+		ncount += fread(slice_name, sizeof(char), slice_name_length, file_ptr);
+		slice_name[slice_name_length] = '\0';
+
+		int _slice_size = header[5];
+		int _psum_num = header[2];
+		int _slice_ptr = header[3];
+
+		// scratch buffer used only to consume the record's data arrays
+		double* buffer;
+		allocate(&buffer, _slice_size);
+
+		// read partial sums //
+		for (int k = 0; k < _psum_num; k++)
+			ncount += fread(buffer, sizeof(double), _slice_size, file_ptr);
+
+#ifndef EXCLUDE_SLICE_LIST
+		// read current elements in list //
+		for (int k = 0; k < _slice_ptr; k++)
+			ncount += fread(buffer, sizeof(double), _slice_size, file_ptr);
+#endif
+
+		// read current partial sum //
+		ncount += fread(buffer, sizeof(double), _slice_size, file_ptr);
+
+		// read time stamps //
+		// NOTE(review): this array shadows the outer scalar [time_stamp]
+		double time_stamp[2];
+		ncount += fread(time_stamp, sizeof(double), 2, file_ptr);
+		if (_psum_num > 0) {
+			double *time_buffer[2];
+			allocate(&time_buffer[0], &time_buffer[1], _psum_num);
+
+			ncount += fread(time_buffer[0], sizeof(double), _psum_num, file_ptr);
+			ncount += fread(time_buffer[1], sizeof(double), _psum_num, file_ptr);
+
+			deallocate(time_buffer[0], time_buffer[1]);
+		}
+
+		deallocate(buffer);
+		delete[] slice_name;
+
+		// verify the skipped record was complete (same count as write())
+		if (ncount !=
+			(10 + slice_name_length + 2 * _psum_num +
+#ifndef EXCLUDE_SLICE_LIST
+			(_psum_num + _slice_ptr + 1) * _slice_size))
+#else
+				(_psum_num + 1) * _slice_size))
+#endif
+		{
+			fclose(file_ptr);
+			return false;
+		}
+	}
+
+	// read header //
+	ncount = fread(header, sizeof(int), 6, file_ptr);
+
+	set(header[5],	// size of element(slice)
+		header[0],	// size of partial sum
+		header[1]);	// max number of available psums
+
+	psum_num = header[2];	// number of current psums
+	slice_ptr = header[3];	// current number of elements in working list
+	slice_num = header[4];	// full number of elements(slices)
+
+	// time stamp //
+	ncount += fread(&time_stamp, sizeof(double), 1, file_ptr);
+
+	// slice name (read, counted and then discarded) //
+	ncount += fread(&slice_name_length, sizeof(int), 1, file_ptr);
+
+	slice_name = new char[slice_name_length + 1];
+	ncount += fread(slice_name, sizeof(char), slice_name_length, file_ptr);
+	slice_name[slice_name_length] = '\0';
+
+	// read partial sums //
+	for (int k = 0; k < psum_num; k++)
+		ncount += fread(psum_list[k], sizeof(double), slice_size, file_ptr);
+
+#ifndef EXCLUDE_SLICE_LIST
+	// read current elements in list //
+	for (int k = 0; k < slice_ptr; k++)
+		ncount += fread(slice_list[k], sizeof(double), slice_size, file_ptr);
+#endif
+
+	// read current partial sum //
+	ncount += fread(slice_sum, sizeof(double), slice_size, file_ptr);
+
+	// read time stamps //
+	ncount += fread(slice_list_time, sizeof(double), 2, file_ptr);
+	if (psum_num > 0) {
+		ncount += fread(psum_time[0], sizeof(double), psum_num, file_ptr);
+		ncount += fread(psum_time[1], sizeof(double), psum_num, file_ptr);
+	}
+
+	delete[] slice_name;
+	fclose(file_ptr);
+	return (ncount ==
+		(10 + slice_name_length + 2 * psum_num +
+#ifndef EXCLUDE_SLICE_LIST
+		(psum_num + slice_ptr + 1) * slice_size));
+#else
+			(psum_num + 1) * slice_size));
+#endif
+}
diff --git a/time-slice.h b/time-slice.h
new file mode 100644
index 0000000000000000000000000000000000000000..1204c90a9330d5fe28ae0c93e8bc7bcec0a123f8
--- /dev/null
+++ b/time-slice.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "nse-sys.h"
+#include "nse-alloc.h"
+
+#include <string>
+
+#define EXCLUDE_SLICE_LIST		// exclude list of elements in timeSlice
+// keeping only current partial sum
+
+// * Time-Slice [Running time average] * //
+namespace nse
+{
+	// Running time average of a fixed-size data "slice".
+	// Pushed slices accumulate into a working sum (slice_sum); every
+	// [psum_size] pushes the working sum is folded into a new partial
+	// sum (psum_list), keeping memory bounded as slices keep arriving.
+	class timeSlice {
+	public:
+
+		bool set(const int slice_size);
+		bool set(const int slice_size, const int psum_size);
+		bool set(const int slice_size, const int psum_size,
+			const int psum_num_max);
+
+		int get_size() const { return slice_size; }
+		int get_num() const { return slice_num; }
+
+		template< memType mem = memCPU, typename T1, typename T2 >	// add element to list
+		bool push(const T1* const X, const T2 time);
+
+		template< typename T >							// current average
+		void average(T* X) const;
+		template< typename T >							// average over [time_begin, time_end]
+		void average(T* X, const T time_begin, const T time_end) const;
+
+		// file-I/O //
+		bool write(const std::string& filename, const char* mode,	// mode: "wb" || "ab"
+			const char* name, const double time) const;
+		bool read(const std::string& filename,
+			const int obj_offset);	// offset: number of [timeSlice] objects //
+
+		void clear();
+
+	public:
+		timeSlice();
+		~timeSlice();
+
+	private:
+
+		// control parameters //
+		static const int psum_size_default = 512;	// each psum contains [psum_size] elements
+		static const int psum_num_init = 64;		// number of psums at init
+		static const int psum_num_add = 64;			// number of psums to add on demand
+		// ---------------------------------------------------------------------------- //
+
+	public:
+		// NOTE(review): data members are public so the mpi gather/scatter
+		// helpers in time-slice3d.cpp can fill them directly.
+
+		// partial sum size //
+		int psum_size;					// using default if not set
+
+		// partial sums data //
+		int psum_num_max;				// maximum number of available psums
+		int psum_num;					// number of current psums
+		double **psum_list;				// list of psums
+
+		// current working list of slices //
+		int slice_ptr;					// current number of elements in working list
+#ifndef EXCLUDE_SLICE_LIST
+		double **slice_list;			// current list of elements
+#endif
+		double *slice_sum;				// sum of current list
+
+		// general slice parameters //
+		int slice_num;					// full number of elements(slices) = psum_num * psum_size + slice_ptr
+		int slice_size;					// size of element(slice)
+
+		// time stamps //
+		double slice_list_time[2];		// time stamp[begin-end] for current list
+		double *psum_time[2];			// time stamp[begin-end] for partial sums
+	};
+}
diff --git a/time-slice3d.cpp b/time-slice3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a4b53358c84c318ae06078ffc492de9146263af
--- /dev/null
+++ b/time-slice3d.cpp
@@ -0,0 +1,584 @@
+#include "time-slice3d.h"
+#include "str-com.h"
+
+
+// * gather-scatter grid-slice data * //
+// Gather a distributed timeSlice onto rank [host] along [axis].
+// out is (re)allocated at host with the global (mpi) slice size;
+// other ranks only supply the send side of each grid.mpi_gather.
+// NOTE(review): collective -- every rank must call this with matching
+// in.psum_num / in.slice_ptr so the gather calls pair up (not checked).
+template< typename T >
+void nse::mpi_gather(timeSlice& out, const timeSlice& in, const int host,
+	const nse_const3d::axisType axis, const Grid3d< T >& grid)
+{
+	int mpi_slice_size = grid.mpi_dim_size(axis);
+
+	if (grid.mpi_com.rank == host) {
+		out.set(mpi_slice_size,
+			in.psum_size, in.psum_num_max);
+
+		out.psum_num = in.psum_num;
+		out.slice_ptr = in.slice_ptr;
+		out.slice_num = in.slice_num;
+
+		for (int k = 0; k < in.psum_num; k++)
+			grid.mpi_gather(out.psum_list[k], in.psum_list[k], host, axis);
+
+#ifndef EXCLUDE_SLICE_LIST
+		for (int k = 0; k < in.slice_ptr; k++)
+			grid.mpi_gather(out.slice_list[k], in.slice_list[k], host, axis);
+#endif
+
+		grid.mpi_gather(out.slice_sum, in.slice_sum, host, axis);
+
+		// time stamp - using local values at host
+		mcopy(out.slice_list_time, in.slice_list_time, 2);
+		if (out.psum_num > 0) {
+			mcopy(out.psum_time[0], in.psum_time[0], out.psum_num);
+			mcopy(out.psum_time[1], in.psum_time[1], out.psum_num);
+		}
+	}
+	else
+	{
+		// non-host ranks: receive buffer is unused (NULL)
+		for (int k = 0; k < in.psum_num; k++)
+			grid.mpi_gather((double*)NULL, in.psum_list[k], host, axis);
+
+#ifndef EXCLUDE_SLICE_LIST
+		for (int k = 0; k < in.slice_ptr; k++)
+			grid.mpi_gather((double*)NULL, in.slice_list[k], host, axis);
+#endif
+
+		grid.mpi_gather((double*)NULL, in.slice_sum, host, axis);
+	}
+}
+
+// Scatter a host-resident timeSlice to all ranks along [axis].
+// The host's psum/slice counters are broadcast first so every rank can
+// size [out] identically before the collective scatters take place.
+template< typename T >
+void nse::mpi_scatter(timeSlice& out, const timeSlice& in, const int host,
+	const nse_const3d::axisType axis, const Grid3d< T >& grid)
+{
+	int slice_size = grid.dim_size(axis);
+
+	// define in:timeSlice parameters on all processors
+	int in_setup[5];
+	if (grid.mpi_com.rank == host) {
+		in_setup[0] = in.psum_size;
+		in_setup[1] = in.psum_num_max;
+		in_setup[2] = in.psum_num;
+		in_setup[3] = in.slice_ptr;
+		in_setup[4] = in.slice_num;
+	}
+	mpi_broadcast(in_setup, 5, host, grid.mpi_com.comm);
+
+	int in_psum_size, in_psum_num_max,
+		in_psum_num, in_slice_ptr, in_slice_num;
+
+	in_psum_size = in_setup[0];
+	in_psum_num_max = in_setup[1];
+	in_psum_num = in_setup[2];
+	in_slice_ptr = in_setup[3];
+	in_slice_num = in_setup[4];
+
+	// size out with the local (per-rank) slice size, host's psum layout
+	out.set(slice_size,
+		in_psum_size, in_psum_num_max);
+
+	out.psum_num = in_psum_num;
+	out.slice_ptr = in_slice_ptr;
+	out.slice_num = in_slice_num;
+
+	if (grid.mpi_com.rank == host) {
+		for (int k = 0; k < out.psum_num; k++)
+			grid.mpi_scatter(out.psum_list[k], in.psum_list[k], host, axis);
+
+#ifndef EXCLUDE_SLICE_LIST
+		for (int k = 0; k < out.slice_ptr; k++)
+			grid.mpi_scatter(out.slice_list[k], in.slice_list[k], host, axis);
+#endif
+
+		grid.mpi_scatter(out.slice_sum, in.slice_sum, host, axis);
+	}
+	else
+	{
+		// non-host ranks: send buffer is unused (NULL)
+		for (int k = 0; k < out.psum_num; k++)
+			grid.mpi_scatter(out.psum_list[k], (double*)NULL, host, axis);
+
+#ifndef EXCLUDE_SLICE_LIST
+		for (int k = 0; k < out.slice_ptr; k++)
+			grid.mpi_scatter(out.slice_list[k], (double*)NULL, host, axis);
+#endif
+
+		grid.mpi_scatter(out.slice_sum, (double*)NULL, host, axis);
+	}
+
+	// copy time stamp at host & broadcast
+	if (grid.mpi_com.rank == host) {
+		mcopy(out.slice_list_time, in.slice_list_time, 2);
+		if (out.psum_num > 0) {
+			mcopy(out.psum_time[0], in.psum_time[0], out.psum_num);
+			mcopy(out.psum_time[1], in.psum_time[1], out.psum_num);
+		}
+	}
+	mpi_broadcast(out.slice_list_time, 2, host, grid.mpi_com.comm);
+	if (out.psum_num > 0) {
+		mpi_broadcast(out.psum_time[0], out.psum_num, host, grid.mpi_com.comm);
+		mpi_broadcast(out.psum_time[1], out.psum_num, host, grid.mpi_com.comm);
+	}
+}
+// ------------------------------------------------------------------------ //
+
+// * write binary slice * //
+// Gather the slice on rank 0 and write it as a new file ("wb").
+// Collective: the success flag is agreed on by all ranks.
+template< typename T >
+bool nse::write_binary(const std::string& filename,
+	const timeSlice& in, const char* name,
+
+	const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time)
+{
+	// the slice is writable only if its size matches the grid
+	// dimension for [axis] on every rank
+	int ok = (in.slice_size == grid.dim_size(axis)) ? 1 : 0;
+	mpi_allreduce(&ok, MPI_MIN, grid.mpi_com.comm);
+	if (!ok) return false;
+
+	// assemble the full slice on rank 0 ...
+	timeSlice full;
+	mpi_gather(full, in, 0, axis, grid);
+
+	// ... which alone writes the file; the result is shared afterwards
+	ok = 1;
+	if (grid.mpi_com.rank == 0 &&
+		!full.write(filename, "wb", name, (double)time))
+	{
+		ok = 0;
+	}
+	MPI_Bcast(&ok, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (ok == 1);
+}
+
+// Two-slice variant: writes [xin] as a new file ("wb") and appends
+// [yin] ("ab") as the next record; true only if both writes succeed.
+template< typename T >
+bool nse::write_binary(const std::string& filename,
+	const timeSlice& xin, const timeSlice& yin,
+	const char* xname, const char* yname,
+
+	const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time)
+{
+	// checking if slice size eq. grid dimension
+	int status = (
+		(grid.dim_size(axis) == xin.slice_size) &&
+		(grid.dim_size(axis) == yin.slice_size));
+	mpi_allreduce(&status, MPI_MIN, grid.mpi_com.comm);
+	if (status == 0) return false;
+
+	timeSlice xout, yout;
+	mpi_gather(xout, xin, 0, axis, grid);
+	mpi_gather(yout, yin, 0, axis, grid);
+
+	// rank 0 writes both records; status is then shared with all ranks
+	status = 1;
+	if (grid.mpi_com.rank == 0) {
+		if (!xout.write(filename, "wb", xname, (double)time)) status = 0;
+		if (!yout.write(filename, "ab", yname, (double)time)) status = 0;
+	}
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+// Three-slice variant: [uin] starts the file ("wb"), [vin] and [win]
+// are appended ("ab") as records 1 and 2; true only if all succeed.
+template< typename T >
+bool nse::write_binary(const std::string& filename,
+	const timeSlice& uin, const timeSlice& vin, const timeSlice& win,
+	const char* uname, const char* vname, const char* wname,
+
+	const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time)
+{
+	// checking if slice size eq. grid dimension
+	int status = (
+		(grid.dim_size(axis) == uin.slice_size) &&
+		(grid.dim_size(axis) == vin.slice_size) &&
+		(grid.dim_size(axis) == win.slice_size));
+	mpi_allreduce(&status, MPI_MIN, grid.mpi_com.comm);
+	if (status == 0) return false;
+
+	timeSlice uout, vout, wout;
+	mpi_gather(uout, uin, 0, axis, grid);
+	mpi_gather(vout, vin, 0, axis, grid);
+	mpi_gather(wout, win, 0, axis, grid);
+
+	// rank 0 writes all records; status is then shared with all ranks
+	status = 1;
+	if (grid.mpi_com.rank == 0) {
+		if (!uout.write(filename, "wb", uname, (double)time)) status = 0;
+		if (!vout.write(filename, "ab", vname, (double)time)) status = 0;
+		if (!wout.write(filename, "ab", wname, (double)time)) status = 0;
+	}
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+// Four-slice variant: [xin] starts the file ("wb"); [yin], [uin] and
+// [vin] are appended ("ab") as records 1..3; true only if all succeed.
+template< typename T >
+bool nse::write_binary(const std::string& filename,
+	const timeSlice& xin, const timeSlice& yin, 
+	const timeSlice& uin, const timeSlice& vin,
+	const char* xname, const char* yname, 
+	const char* uname, const char* vname,
+
+	const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time)
+{
+	// checking if slice size eq. grid dimension
+	int status = (
+		(grid.dim_size(axis) == xin.slice_size) &&
+		(grid.dim_size(axis) == yin.slice_size) &&
+		(grid.dim_size(axis) == uin.slice_size) &&
+		(grid.dim_size(axis) == vin.slice_size));
+	mpi_allreduce(&status, MPI_MIN, grid.mpi_com.comm);
+	if (status == 0) return false;
+
+	timeSlice xout, yout, uout, vout;
+	mpi_gather(xout, xin, 0, axis, grid);
+	mpi_gather(yout, yin, 0, axis, grid);
+	mpi_gather(uout, uin, 0, axis, grid);
+	mpi_gather(vout, vin, 0, axis, grid);
+
+	// rank 0 writes all records; status is then shared with all ranks
+	status = 1;
+	if (grid.mpi_com.rank == 0) {
+		if (!xout.write(filename, "wb", xname, (double)time)) status = 0;
+		if (!yout.write(filename, "ab", yname, (double)time)) status = 0;
+		if (!uout.write(filename, "ab", uname, (double)time)) status = 0;
+		if (!vout.write(filename, "ab", vname, (double)time)) status = 0;
+	}
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	return (status == 1);
+}
+
+// Indexed variant: delegates after stamping [idx] into the file name
+// (via append_index; presumably appends the index -- see str-com).
+template< typename T >
+bool nse::write_binary(const std::string& filename, const int idx,
+	const timeSlice& in, const char* name,
+
+	const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time)
+{
+	const std::string indexed_name = append_index(filename, idx);
+	return write_binary(indexed_name, in, name, axis, grid, time);
+}
+
+// Indexed two-slice variant: delegates with [idx] stamped into the name.
+template< typename T >
+bool nse::write_binary(const std::string& filename, const int idx,
+	const timeSlice& xin, const timeSlice& yin,
+	const char* xname, const char* yname,
+
+	const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time)
+{
+	const std::string indexed_name = append_index(filename, idx);
+	return write_binary(indexed_name,
+		xin, yin, xname, yname, axis, grid, time);
+}
+
+// Indexed three-slice variant: delegates with [idx] stamped into the name.
+template< typename T >
+bool nse::write_binary(const std::string& filename, const int idx,
+	const timeSlice& uin, const timeSlice& vin, const timeSlice& win,
+	const char* uname, const char* vname, const char* wname,
+
+	const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time)
+{
+	const std::string indexed_name = append_index(filename, idx);
+	return write_binary(indexed_name,
+		uin, vin, win, uname, vname, wname, axis, grid, time);
+}
+
+// Indexed four-slice variant: delegates with [idx] stamped into the name.
+template< typename T >
+bool nse::write_binary(const std::string& filename, const int idx,
+	const timeSlice& xin, const timeSlice& yin, 
+	const timeSlice& uin, const timeSlice& vin,
+	const char* xname, const char* yname, 
+	const char* uname, const char* vname,
+
+	const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time)
+{
+	const std::string indexed_name = append_index(filename, idx);
+	return write_binary(indexed_name,
+		xin, yin, uin, vin, xname, yname, uname, vname, axis, grid, time);
+}
+// ------------------------------------------------------------------------ //
+
+// * read binary slice * //
+// Rank 0 reads the full-size slice record (offset 0) from file and
+// scatters it along [axis]; [out] is filled on every rank.
+template< typename T >
+bool nse::read_binary(const std::string& filename,
+	timeSlice& out,
+	const nse_const3d::axisType axis, const Grid3d< T >& grid)
+{
+	timeSlice in;
+
+	int status = 1;
+	if (grid.mpi_com.rank == 0) {
+		if (!in.read(filename, 0)) status = 0;
+
+		// file must hold the global (mpi) slice size for this axis
+		if (grid.mpi_dim_size(axis) != in.slice_size) status = 0;
+	}
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	if (status == 1) {
+		mpi_scatter(out, in, 0, axis, grid);
+	}
+
+	return (status == 1);
+}
+
+// Two-slice variant: rank 0 reads records 0 and 1 (x, y -- matching the
+// order written by the two-slice write_binary), then scatters both.
+template< typename T >
+bool nse::read_binary(const std::string& filename,
+	timeSlice& xout, timeSlice& yout,
+	const nse_const3d::axisType axis, const Grid3d< T >& grid)
+{
+	timeSlice xin, yin;
+
+	int status = 1;
+	if (grid.mpi_com.rank == 0) {
+		if (!xin.read(filename, 0)) status = 0;
+		if (!yin.read(filename, 1)) status = 0;
+
+		// files must hold the global (mpi) slice size for this axis
+		if (grid.mpi_dim_size(axis) != xin.slice_size) status = 0;
+		if (grid.mpi_dim_size(axis) != yin.slice_size) status = 0;
+	}
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	if (status == 1) {
+		mpi_scatter(xout, xin, 0, axis, grid);
+		mpi_scatter(yout, yin, 0, axis, grid);
+	}
+
+	return (status == 1);
+}
+
+// Three-slice variant: rank 0 reads records 0..2 (u, v, w -- matching
+// the order written by the three-slice write_binary), then scatters.
+template< typename T >
+bool nse::read_binary(const std::string& filename,
+	timeSlice& uout, timeSlice& vout, timeSlice& wout,
+	const nse_const3d::axisType axis, const Grid3d< T >& grid)
+{
+	timeSlice uin, vin, win;
+
+	int status = 1;
+	if (grid.mpi_com.rank == 0) {
+		if (!uin.read(filename, 0)) status = 0;
+		if (!vin.read(filename, 1)) status = 0;
+		if (!win.read(filename, 2)) status = 0;
+
+		// files must hold the global (mpi) slice size for this axis
+		if (grid.mpi_dim_size(axis) != uin.slice_size) status = 0;
+		if (grid.mpi_dim_size(axis) != vin.slice_size) status = 0;
+		if (grid.mpi_dim_size(axis) != win.slice_size) status = 0;
+	}
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	if (status == 1) {
+		mpi_scatter(uout, uin, 0, axis, grid);
+		mpi_scatter(vout, vin, 0, axis, grid);
+		mpi_scatter(wout, win, 0, axis, grid);
+	}
+
+	return (status == 1);
+}
+
+// Four-slice variant: rank 0 reads records 0..3 (x, y, u, v -- matching
+// the order written by the four-slice write_binary), then scatters.
+template< typename T >
+bool nse::read_binary(const std::string& filename,
+	timeSlice& xout, timeSlice& yout, 
+	timeSlice& uout, timeSlice& vout,
+	const nse_const3d::axisType axis, const Grid3d< T >& grid)
+{
+	timeSlice xin, yin, uin, vin;
+
+	int status = 1;
+	if (grid.mpi_com.rank == 0) {
+		if (!xin.read(filename, 0)) status = 0;
+		if (!yin.read(filename, 1)) status = 0;
+		if (!uin.read(filename, 2)) status = 0;
+		if (!vin.read(filename, 3)) status = 0;
+
+		// files must hold the global (mpi) slice size for this axis
+		if (grid.mpi_dim_size(axis) != xin.slice_size) status = 0;
+		if (grid.mpi_dim_size(axis) != yin.slice_size) status = 0;
+		if (grid.mpi_dim_size(axis) != uin.slice_size) status = 0;
+		if (grid.mpi_dim_size(axis) != vin.slice_size) status = 0;
+	}
+
+	MPI_Bcast(&status, 1, MPI_INT, 0, grid.mpi_com.comm);
+	if (status == 1) {
+		mpi_scatter(xout, xin, 0, axis, grid);
+		mpi_scatter(yout, yin, 0, axis, grid);
+		mpi_scatter(uout, uin, 0, axis, grid);
+		mpi_scatter(vout, vin, 0, axis, grid);
+	}
+
+	return (status == 1);
+}
+
+// Indexed variant: delegates after stamping [idx] into the file name.
+template< typename T >
+bool nse::read_binary(const std::string& filename, const int idx,
+	timeSlice& out,
+	const nse_const3d::axisType axis, const Grid3d< T >& grid)
+{
+	const std::string indexed_name = append_index(filename, idx);
+	return read_binary(indexed_name, out, axis, grid);
+}
+
+// Indexed two-slice variant: delegates with [idx] stamped into the name.
+template< typename T >
+bool nse::read_binary(const std::string& filename, const int idx,
+	timeSlice& xout, timeSlice& yout,
+	const nse_const3d::axisType axis, const Grid3d< T >& grid)
+{
+	const std::string indexed_name = append_index(filename, idx);
+	return read_binary(indexed_name, xout, yout, axis, grid);
+}
+
+// Indexed three-slice variant: delegates with [idx] stamped into the name.
+template< typename T >
+bool nse::read_binary(const std::string& filename, const int idx,
+	timeSlice& uout, timeSlice& vout, timeSlice& wout,
+	const nse_const3d::axisType axis, const Grid3d< T >& grid)
+{
+	const std::string indexed_name = append_index(filename, idx);
+	return read_binary(indexed_name, uout, vout, wout, axis, grid);
+}
+
+// Indexed four-slice variant: delegates with [idx] stamped into the name.
+template< typename T >
+bool nse::read_binary(const std::string& filename, const int idx,
+	timeSlice& xout, timeSlice& yout, 
+	timeSlice& uout, timeSlice& vout,
+	const nse_const3d::axisType axis, const Grid3d< T >& grid)
+{
+	const std::string indexed_name = append_index(filename, idx);
+	return read_binary(indexed_name,
+		xout, yout, uout, vout, axis, grid);
+}
+// ------------------------------------------------------------------------ //
+
+
+// ------------------------------------------------------------------------ //
+// ------------------------------------------------------------------------ //
+
+
+// * initialize: gather-scatter grid-slice * //
+template void nse::mpi_gather(timeSlice& out, const timeSlice& in, const int host,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template void nse::mpi_gather(timeSlice& out, const timeSlice& in, const int host,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+template void nse::mpi_scatter(timeSlice& out, const timeSlice& in, const int host,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template void nse::mpi_scatter(timeSlice& out, const timeSlice& in, const int host,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+// * initialize: write slice binary output * //
+template bool nse::write_binary(const std::string& filename,
+	const timeSlice& xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::write_binary(const std::string& filename,
+	const timeSlice& xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary(const std::string& filename,
+	const timeSlice& xin, const timeSlice& yin,
+	const char* xname, const char* yname,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::write_binary(const std::string& filename,
+	const timeSlice& xin, const timeSlice& yin,
+	const char* xname, const char* yname,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary(const std::string& filename,
+	const timeSlice& uin, const timeSlice& vin, const timeSlice& win,
+	const char* uname, const char* vname, const char* wname,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::write_binary(const std::string& filename,
+	const timeSlice& uin, const timeSlice& vin, const timeSlice& win,
+	const char* uname, const char* vname, const char* wname,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary(const std::string& filename,
+	const timeSlice& xin, const timeSlice& yin,
+	const timeSlice& uin, const timeSlice& vin,
+	const char* xname, const char* yname,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::write_binary(const std::string& filename,
+	const timeSlice& xin, const timeSlice& yin,
+	const timeSlice& uin, const timeSlice& vin,
+	const char* xname, const char* yname,
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary(const std::string& filename, const int idx,
+	const timeSlice& xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::write_binary(const std::string& filename, const int idx,
+	const timeSlice& xin, const char* name,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary(const std::string& filename, const int idx,
+	const timeSlice& xin, const timeSlice& yin,
+	const char* xname, const char* yname,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::write_binary(const std::string& filename, const int idx,
+	const timeSlice& xin, const timeSlice& yin,
+	const char* xname, const char* yname,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary(const std::string& filename, const int idx,
+	const timeSlice& uin, const timeSlice& vin, const timeSlice& win,
+	const char* uname, const char* vname, const char* wname,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::write_binary(const std::string& filename, const int idx,
+	const timeSlice& uin, const timeSlice& vin, const timeSlice& win,
+	const char* uname, const char* vname, const char* wname,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+
+template bool nse::write_binary(const std::string& filename, const int idx,
+	const timeSlice& xin, const timeSlice& yin, 
+	const timeSlice& uin, const timeSlice& vin,
+	const char* xname, const char* yname, 
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid, const float time);
+template bool nse::write_binary(const std::string& filename, const int idx,
+	const timeSlice& xin, const timeSlice& yin, 
+	const timeSlice& uin, const timeSlice& vin,
+	const char* xname, const char* yname, 
+	const char* uname, const char* vname,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid, const double time);
+// ------------------------------------------------------------------------ //
+
+// * initialize: read slice binary output * //
+template bool nse::read_binary(const std::string& filename,
+	timeSlice& xout,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary(const std::string& filename,
+	timeSlice& xout,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+template bool nse::read_binary(const std::string& filename,
+	timeSlice& xout, timeSlice& yout,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary(const std::string& filename,
+	timeSlice& xout, timeSlice& yout,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+template bool nse::read_binary(const std::string& filename,
+	timeSlice& uout, timeSlice& vout, timeSlice& wout,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary(const std::string& filename,
+	timeSlice& uout, timeSlice& vout, timeSlice& wout,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+template bool nse::read_binary(const std::string& filename,
+	timeSlice& xout, timeSlice& yout, timeSlice& uout, timeSlice& vout,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary(const std::string& filename,
+	timeSlice& xout, timeSlice& yout, timeSlice& uout, timeSlice& vout,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+template bool nse::read_binary(const std::string& filename, const int idx,
+	timeSlice& xout,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary(const std::string& filename, const int idx,
+	timeSlice& xout,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+template bool nse::read_binary(const std::string& filename, const int idx,
+	timeSlice& xout, timeSlice& yout,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary(const std::string& filename, const int idx,
+	timeSlice& xout, timeSlice& yout,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+template bool nse::read_binary(const std::string& filename, const int idx,
+	timeSlice& uout, timeSlice& vout, timeSlice& wout,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary(const std::string& filename, const int idx,
+	timeSlice& uout, timeSlice& vout, timeSlice& wout,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid);
+
+template bool nse::read_binary(const std::string& filename, const int idx,
+	timeSlice& xout, timeSlice& yout, timeSlice& uout, timeSlice& vout,
+	const nse_const3d::axisType axis, const Grid3d< float >& grid);
+template bool nse::read_binary(const std::string& filename, const int idx,
+	timeSlice& xout, timeSlice& yout, timeSlice& uout, timeSlice& vout,
+	const nse_const3d::axisType axis, const Grid3d< double >& grid);
+// ------------------------------------------------------------------------ //
diff --git a/time-slice3d.h b/time-slice3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ba44779be5f58e7bf374c87b4fe61b39cf99402
--- /dev/null
+++ b/time-slice3d.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include "grid3d.h"
+#include "time-slice.h"
+
+// * Time-Slice-3D [Running time average] * //
+// * Req.: slice corresponds to grid strip * //
namespace nse
{
	// * gather-scatter grid-slice data * //
	// collect a distributed slice onto rank 'host' / distribute it back;
	// collective calls — all ranks of the grid communicator participate
	template< typename T >
	void mpi_gather(timeSlice& out, const timeSlice& in, const int host,
		const nse_const3d::axisType axis, const Grid3d< T >& grid);
	template< typename T >
	void mpi_scatter(timeSlice& out, const timeSlice& in, const int host,
		const nse_const3d::axisType axis, const Grid3d< T >& grid);


	// * write slice to binary output * //
	// overloads take 1 (x), 2 (x,y), 3 (u,v,w) or 4 (x,y,u,v) slices,
	// each paired with its variable name; 'time' is stamped into the file
	template< typename T >
	bool write_binary(const std::string& filename,
		const timeSlice& xin, const char* name,
		const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time);
	template< typename T >
	bool write_binary(const std::string& filename,
		const timeSlice& xin, const timeSlice& yin, 
		const char* xname, const char* yname,
		const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time);
	template< typename T >
	bool write_binary(const std::string& filename,
		const timeSlice& uin, const timeSlice& vin, const timeSlice& win,
		const char* uname, const char* vname, const char* wname,
		const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time);
	template< typename T >
	bool write_binary(const std::string& filename,
		const timeSlice& xin, const timeSlice& yin,
		const timeSlice& uin, const timeSlice& vin,
		const char* xname, const char* yname, 
		const char* uname, const char* vname,
		const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time);

	// same overload set with 'idx' appended to the output file name
	template< typename T >
	bool write_binary(const std::string& filename, const int idx,
		const timeSlice& xin, const char* name,
		const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time);
	template< typename T >
	bool write_binary(const std::string& filename, const int idx,
		const timeSlice& xin, const timeSlice& yin,
		const char* xname, const char* yname,
		const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time);
	template< typename T >
	bool write_binary(const std::string& filename, const int idx,
		const timeSlice& uin, const timeSlice& vin, const timeSlice& win,
		const char* uname, const char* vname, const char* wname,
		const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time);
	template< typename T >
	bool write_binary(const std::string& filename, const int idx,
		const timeSlice& xin, const timeSlice& yin, 
		const timeSlice& uin, const timeSlice& vin,
		const char* xname, const char* yname, 
		const char* uname, const char* vname,
		const nse_const3d::axisType axis, const Grid3d< T >& grid, const T time);
	// -------------------------------------------------------------------- //

	// * read slice from binary output * //
	// mirrors the write_binary overload set (names/time are not read back)
	template< typename T >
	bool read_binary(const std::string& filename,
		timeSlice& xout, 
		const nse_const3d::axisType axis, const Grid3d< T >& grid);
	template< typename T >
	bool read_binary(const std::string& filename,
		timeSlice& xout, timeSlice& yout,
		const nse_const3d::axisType axis, const Grid3d< T >& grid);
	template< typename T >
	bool read_binary(const std::string& filename,
		timeSlice& uout, timeSlice& vout, timeSlice& wout,
		const nse_const3d::axisType axis, const Grid3d< T >& grid);
	template< typename T >
	bool read_binary(const std::string& filename,
		timeSlice& xout, timeSlice& yout, 
		timeSlice& uout, timeSlice& vout,
		const nse_const3d::axisType axis, const Grid3d< T >& grid);

	template< typename T >
	bool read_binary(const std::string& filename, const int idx,
		timeSlice& xout, 
		const nse_const3d::axisType axis, const Grid3d< T >& grid);
	template< typename T >
	bool read_binary(const std::string& filename, const int idx,
		timeSlice& xout, timeSlice& yout,
		const nse_const3d::axisType axis, const Grid3d< T >& grid);
	template< typename T >
	bool read_binary(const std::string& filename, const int idx,
		timeSlice& uout, timeSlice& vout, timeSlice& wout,
		const nse_const3d::axisType axis, const Grid3d< T >& grid);
	template< typename T >
	bool read_binary(const std::string& filename, const int idx,
		timeSlice& xout, timeSlice& yout, 
		timeSlice& uout, timeSlice& vout,
		const nse_const3d::axisType axis, const Grid3d< T >& grid);
	// -------------------------------------------------------------------- //
}
diff --git a/traj-accum3d.cpp b/traj-accum3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e72e864d2ade782ccb32c99b59cff7f4696059c
--- /dev/null
+++ b/traj-accum3d.cpp
@@ -0,0 +1,620 @@
+#define _CRT_SECURE_NO_WARNINGS
+#include "traj-accum3d.h"
+#include "traj-file-handle.h"
+
+#include "nse-alloc.h"
+#include "vecmath.h"
+#include "str-com.h"
+
+
+// Implementation [trajAccum3d]
+// ----------------------------------------------------------------------------
+template< typename T >
+nse::trajAccum3d< T >::trajAccum3d() :
+	size(0), mem_size(0)
+{
+	max_memory = max_memory_default;
+	group_max_size = group_max_size_default;
+
+	work_dir = work_dir_default;
+	work_filename = work_dir + work_base_filename;
+}
+
+template< typename T >
+nse::trajAccum3d< T >::~trajAccum3d()
+{
+	if (mem_size > 0) {
+		delete[] list;
+		deallocate(is_active, is_updated);
+
+		mem_size = 0;
+	}
+
+	size = 0;
+}
+// ----------------------------------------------------------------------------
+
+// set calls
+// ----------------------------------------------------------------------------
+template< typename T >
+bool nse::trajAccum3d< T >::set_work_directory(const std::string& dir)
+{
+	if (groups.n > 0) return false;
+
+	work_dir = dir;
+	work_filename = work_dir + work_base_filename;
+	return true;
+}
+
+template< typename T >
+bool nse::trajAccum3d< T >::set_group_max_size(const int _group_max_size)
+{
+	if (groups.n > 0) return false;
+	if (_group_max_size <= 0) return false;
+
+	group_max_size = _group_max_size;
+	return true;
+}
+
+template< typename T >
+bool nse::trajAccum3d< T >::set_max_memory_usage(const int _max_memory)
+{
+	if (_max_memory <= 0) return false;
+
+	max_memory = _max_memory;
+	return true;
+}
+// ----------------------------------------------------------------------------
+
+// update trajectory with particles vector
+// ----------------------------------------------------------------------------
+template< typename T >
+bool nse::trajAccum3d< T >::update(const ptclTrackVec3d< T >& pvec,
+	const T time, const Grid3d< T >& grid)
+{
+	int m, k;
+	bool add_element;
+	GroupList add_groups;
+
+	assign(is_updated, false, size);
+
+	for (m = 0; m < pvec.n; m++)
+	{
+		add_element = true;
+		for (k = 0; k < size; k++)
+		{
+			if ((is_active[k]) && (pvec.tag[m] == list[k].tag)) {
+				list[k].push_back(pvec.x[m], pvec.y[m], pvec.z[m], time);
+				add_element = false;
+				is_updated[k] = true;
+				break;
+			}
+		}
+
+		if (add_element)	// adding new element to list
+		{
+			resize(size + 1);
+
+			list[size].tag = pvec.tag[m];
+			list[size].group = (list[size].tag / group_max_size) + 1;
+			list[size].push_back(pvec.x[m], pvec.y[m], pvec.z[m], time);
+
+			is_active[size] = true;
+			is_updated[size] = true;
+
+			if (!groups.is_element(list[size].group)) {
+				add_groups.add_if_unique(list[size].group);
+			}
+
+			size++;
+		}
+	}
+
+	// set active flag for updated and new trajectories
+	for (k = 0; k < size; k++)
+		is_active[k] = is_active[k] && is_updated[k];
+
+
+	// group list synchronization
+	add_groups.mpi_gather_unique(0, grid);
+	add_groups.mpi_broadcast(0, grid);
+
+	// creating working directory if necessary
+	if ((groups.n == 0) && (add_groups.n > 0)) {
+		bool status = true;
+		if (grid.mpi_com.rank == 0) status = create_dir(work_dir);
+
+		mpi_broadcast(&status, 1, 0, grid.mpi_com.comm);
+		if (!status) return false;
+	}
+
+	// creating files for new groups
+	std::string filename;
+	int io_status = 0;
+	for (int k = 0; k < add_groups.n; k++) {
+
+		// mapping groups to MPI processes
+		if (add_groups.id[k] % grid.mpi_com.size == grid.mpi_com.rank)
+		{
+			filename = append_index(work_filename, "-gp-", add_groups.id[k]);
+			if (!traj3d< T >::create_dump_file(filename, add_groups.id[k])) {
+				io_status++;
+				break;
+			}
+		}
+	}
+	if (add_groups.n > 0) {
+		mpi_allreduce(&io_status, MPI_SUM, grid.mpi_com.comm);
+		if (io_status > 0) return false;
+	}
+
+	// adding new groups
+	groups.add(add_groups);
+
+	// moving data to filesystem
+	if (!dump_to_filesystem(max_memory, grid)) return false;
+
+	return true;
+}
+// ----------------------------------------------------------------------------
+
+// tecplot output
+// ----------------------------------------------------------------------------
+template< typename T >
+bool nse::trajAccum3d< T >::write_tecplot(
+	const std::string& filename,
+	const Grid3d< T >& grid)
+{
+	// moving all data to filesystem
+	if (!dump_to_filesystem(0, grid)) return false;
+
+	trajFileHandle< T > fhandle;
+	std::string data_filename, tecplot_filename;
+
+	int io_status = 0;
+	for (int k = 0; k < groups.n; k++) {
+
+		// mapping groups to MPI processes
+		if (groups.id[k] % grid.mpi_com.size == grid.mpi_com.rank)
+		{
+			data_filename = append_index(work_filename, "-gp-", groups.id[k]);
+			tecplot_filename = append_index(filename, "-gp-", groups.id[k]);
+
+			if (!fhandle.convert_to_tecplot(tecplot_filename, data_filename)) {
+				io_status++;
+				break;
+			}
+		}
+	}
+	if (groups.n > 0) {
+		mpi_allreduce(&io_status, MPI_SUM, grid.mpi_com.comm);
+		if (io_status > 0) return false;
+	}
+
+	return true;
+}
+
+template< typename T >
+bool nse::trajAccum3d< T >::write_tecplot(
+	const std::string& filename, const int idx,
+	const Grid3d< T >& grid)
+{
+	return write_tecplot(append_index(filename, idx), grid);
+}
+// ----------------------------------------------------------------------------
+
+// binary output
+// ----------------------------------------------------------------------------
+template< typename T >
+bool nse::trajAccum3d< T >::write_binary(
+	const std::string& filename, const Grid3d< T >& grid)
+{
+	// moving all data to filesystem
+	if (!dump_to_filesystem(0, grid)) return false;
+
+	trajFileHandle< T > fhandle;
+	std::string data_filename, data_formatted_filename;
+
+	int io_status = 0;
+	for (int k = 0; k < groups.n; k++) {
+
+		// mapping groups to MPI processes
+		if (groups.id[k] % grid.mpi_com.size == grid.mpi_com.rank)
+		{
+			data_filename = append_index(work_filename, "-gp-", groups.id[k]);
+			data_formatted_filename = append_index(filename, "-gp-", groups.id[k]);
+
+			if (!fhandle.convert_to_formatted_binary(
+				data_formatted_filename, data_filename))
+			{
+				io_status++;
+				break;
+			}
+		}
+	}
+	if (groups.n > 0) {
+		mpi_allreduce(&io_status, MPI_SUM, grid.mpi_com.comm);
+		if (io_status > 0) return false;
+	}
+
+	// writing data descriptor ...
+	bool status = true;
+	if (grid.mpi_com.rank == 0) {
+		std::string desc_filename = append_string(filename, "-desc-");
+
+		FILE *desc_ptr = fopen(desc_filename.c_str(), "wb");
+		if (desc_ptr == NULL) status = false;
+		else
+		{
+			int desc_header[5] = { 't' + 'r' + 'j',		// id
+				3,					// number of dimensions
+				sizeof(T),			// type size in bytes
+				group_max_size,		// max group size
+				groups.n			// number of active groups
+			};
+
+			fwrite(desc_header, sizeof(int), 5, desc_ptr);
+			fwrite(groups.id, sizeof(int), groups.n, desc_ptr);
+
+			fclose(desc_ptr);
+		}
+	}
+
+	mpi_broadcast(&status, 1, 0, grid.mpi_com.comm);
+	return status;
+}
+
+template< typename T >
+bool nse::trajAccum3d< T >::write_binary(
+	const std::string& filename, const int idx,
+	const Grid3d< T >& grid)
+{
+	return write_binary(append_index(filename, idx), grid);
+}
+// ----------------------------------------------------------------------------
+
+// binary input
+// ----------------------------------------------------------------------------
+template< typename T >
+bool nse::trajAccum3d< T >::read_binary(
+	const std::string& filename, const Grid3d< T >& grid)
+{
+	if (!cleanup(grid)) return false;
+
+	// reading descriptor data ...
+	bool status = true;
+	if (grid.mpi_com.rank == 0) {
+		std::string desc_filename = append_string(filename, "-desc-");
+
+		FILE *desc_ptr = fopen(desc_filename.c_str(), "rb");
+		if (desc_ptr == NULL) status = false;
+		else
+		{
+			int desc_header[5];
+			int nstatus = fread(desc_header, sizeof(int), 5, desc_ptr);
+
+			if ((desc_header[0] == 't' + 'r' + 'j') ||
+				(desc_header[1] == 3) ||
+				(desc_header[2] == sizeof(T)) ||
+				(desc_header[3] > 0) ||
+				(desc_header[4] >= 0))
+			{
+				group_max_size = desc_header[3];
+				groups.resize(desc_header[4]);
+
+				nstatus = fread(groups.id, sizeof(int), desc_header[4], desc_ptr);
+				groups.n = desc_header[4];
+			}
+			else
+				status = false;
+
+			fclose(desc_ptr);
+		}
+	}
+	mpi_broadcast(&status, 1, 0, grid.mpi_com.comm);
+	if (!status) return false;
+
+	mpi_broadcast(&group_max_size, 1, 0, grid.mpi_com.comm);
+	groups.mpi_broadcast(0, grid);
+
+	// creating working directory if necessary
+	if (groups.n > 0) {
+		status = true;
+		if (grid.mpi_com.rank == 0) status = create_dir(work_dir);
+
+		mpi_broadcast(&status, 1, 0, grid.mpi_com.comm);
+		if (!status) return false;
+	}
+
+
+	std::string data_filename, data_formatted_filename;
+
+	int io_status = 0;
+	for (int k = 0; k < groups.n; k++) {
+		// mapping groups to MPI processes
+		if (groups.id[k] % grid.mpi_com.size == grid.mpi_com.rank)
+		{
+			data_formatted_filename = append_index(filename, "-gp-", groups.id[k]);
+			data_filename = append_index(work_filename, "-gp-", groups.id[k]);
+
+			if (!copy_file(data_formatted_filename, data_filename)) {
+				io_status++;
+				break;
+			}
+		}
+	}
+	if (groups.n > 0) {
+		mpi_allreduce(&io_status, MPI_SUM, grid.mpi_com.comm);
+		if (io_status > 0) return false;
+	}
+
+	return true;
+}
+
+template< typename T >
+bool nse::trajAccum3d< T >::read_binary(
+	const std::string& filename, const int idx,
+	const Grid3d< T >& grid)
+{
+	return read_binary(append_index(filename, idx), grid);
+}
+// ----------------------------------------------------------------------------
+
+// cleanup
+// ----------------------------------------------------------------------------
+
+
+template< typename T >
+bool nse::trajAccum3d< T >::cleanup(const Grid3d< T >& grid)
+{
+	std::string filename;
+
+	int io_status = 0;
+	for (int k = 0; k < groups.n; k++) {
+
+		// mapping groups to MPI processes
+		if (groups.id[k] % grid.mpi_com.size == grid.mpi_com.rank)
+		{
+			filename = append_index(work_filename, "-gp-", groups.id[k]);
+			if (remove(filename.c_str())) {
+				io_status++;
+				break;
+			}
+		}
+	}
+	if (groups.n > 0) {
+		mpi_allreduce(&io_status, MPI_SUM, grid.mpi_com.comm);
+		if (io_status > 0) return false;
+	}
+
+	// trying to remove work directory ...
+	if ((groups.n > 0) && (grid.mpi_com.rank == 0))
+		remove_empty_dir(work_dir);
+
+	// clearing data
+	size = 0;
+	groups.n = 0;
+
+	return true;
+}
+// ----------------------------------------------------------------------------
+
+// resize memory for requested number of elements
+// ----------------------------------------------------------------------------
+template< typename T >
+void nse::trajAccum3d< T >::resize(const int m)
+{
+	if (m > mem_size)
+	{
+		const int nalloc = max(m, mem_size + mem_alloc);
+		reallocate(&is_active, size, mem_size, nalloc);
+		reallocate(&is_updated, size, mem_size, nalloc);
+
+		traj3d< T > *list_mem;
+		list_mem = new traj3d< T >[nalloc];
+
+		if (size > 0)
+		{
+			for (int k = 0; k < size; k++)
+				list_mem[k] = list[k];
+		}
+
+		if (mem_size > 0) delete[] list;
+
+		list = list_mem;
+		mem_size = nalloc;
+	}
+}
+// ----------------------------------------------------------------------------
+
+// filesystem connection
+// ----------------------------------------------------------------------------
+template< typename T >
+bool nse::trajAccum3d< T >::dump_to_filesystem(
+	const int _max_memory, const Grid3d< T >& grid)
+{
+	int lsize = 0;	// list full size
+	for (int k = 0; k < size; k++)
+		lsize += list[k].n;
+
+	int *is_max_memory;
+	allocate_vnull(&is_max_memory, grid.mpi_com.size);
+
+	int status = ((int)(3 * lsize * sizeof(T)) > _max_memory);
+	MPI_Allgather(&status, 1, mpi_type< int >(),
+		is_max_memory, 1, mpi_type< int >(), grid.mpi_com.comm);
+
+	for (int i = 0; i < grid.mpi_com.size; i++) {
+		if (is_max_memory[i])
+		{
+			int io_status = 0;
+			if (i == grid.mpi_com.rank) {
+
+				std::string filename;
+				for (int j = 0; j < size; j++) {
+					filename = append_index(work_filename, "-gp-", list[j].group);
+
+					// redundant check if group is already active
+					if (!groups.is_element(list[j].group)) {
+						io_status++;
+						break;
+					}
+
+					if (!list[j].dump_to_file(filename, false)) {
+						io_status++;
+						break;
+					}
+				}
+			}
+			mpi_broadcast(&io_status, 1, i, grid.mpi_com.comm);
+			if (io_status > 0) {
+				deallocate(is_max_memory);
+				return false;
+			}
+		}
+	}
+
+	deallocate(is_max_memory);
+	return true;
+}
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+
+// Implementation [trajAccum3d::GroupList]
+// ----------------------------------------------------------------------------
+template< typename T >
+nse::trajAccum3d< T >::GroupList::GroupList() :
+	n(0), mem_size(0)
+{
+}
+
+template< typename T >
+nse::trajAccum3d< T >::GroupList::~GroupList()
+{
+	if (mem_size > 0) {
+		deallocate(id);
+
+		mem_size = 0;
+	}
+
+	n = 0;
+}
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+template< typename T >
+void nse::trajAccum3d< T >::GroupList::add(const int _id)
+{
+	resize(n + 1);
+
+	id[n] = _id;
+	n++;
+}
+
+template< typename T >
+void nse::trajAccum3d< T >::GroupList::add(
+	const GroupList& groups)
+{
+	resize(n + groups.n);
+
+	for (int k = 0; k < groups.n; k++) {
+		id[n] = groups.id[k];
+		n++;
+	}
+}
+
+template< typename T >
+void nse::trajAccum3d< T >::GroupList::add_if_unique(const int _id)
+{
+	if (is_element(_id)) return;
+	add(_id);
+}
+
+template< typename T >
+bool nse::trajAccum3d< T >::GroupList::is_element(const int _id) const
+{
+	for (int k = 0; k < n; k++) {
+		if (id[k] == _id) { return true; }
+	}
+
+	return false;
+}
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+template< typename T >
+void nse::trajAccum3d< T >::GroupList::mpi_gather_unique(
+	const int host, const Grid3d< T >& grid)
+{
+	int *buf;
+	int buf_id;
+
+	buf_id = memStx::get_buf(&buf, grid.mpi_com.size);
+	buf[grid.mpi_com.rank] = n;
+
+	nse::mpi_gather(&n, 1, buf, host, grid.mpi_com.comm);
+
+	if (grid.mpi_com.rank == host)
+	{
+		int *rbuf;
+		int rbuf_id;
+		for (int k = 0; k < grid.mpi_com.size; k++) {
+			if (k == host) continue;
+			if (buf[k] > 0) {
+				rbuf_id = memStx::get_buf(&rbuf, buf[k]);
+				MPI_Recv(rbuf, buf[k], mpi_type< int >(), k, 0,
+					grid.mpi_com.comm, MPI_STATUS_IGNORE);
+
+				for (int i = 0; i < buf[k]; i++) {
+					add_if_unique(rbuf[i]);
+				}
+
+				memStx::free_buf(rbuf_id);
+			}
+		}
+	}
+	else
+	{
+		if (n > 0)
+			MPI_Send(id, n, mpi_type<int>(), host, 0, grid.mpi_com.comm);
+	}
+
+	memStx::free_buf(buf_id);
+}
+
+template< typename T >
+void nse::trajAccum3d< T >::GroupList::mpi_broadcast(
+	const int host, const Grid3d< T >& grid)
+{
+	int m = n;
+	nse::mpi_broadcast(&m, 1, host, grid.mpi_com.comm);
+	if (m > 0) {
+		if (grid.mpi_com.rank != host) resize(m);
+
+		nse::mpi_broadcast(id, m, host, grid.mpi_com.comm);
+	}
+	n = m;
+}
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+template< typename T >
+void nse::trajAccum3d< T >::GroupList::resize(const int m)
+{
+	if (m > mem_size)
+	{
+		const int nalloc = max(m, mem_size + mem_alloc);
+		reallocate(&id, n, mem_size, nalloc);
+
+		mem_size = nalloc;
+	}
+}
+// ----------------------------------------------------------------------------
+
+
// Initialization [trajAccum3d]
// explicit instantiations for single and double precision
// ----------------------------------------------------------------------------
template class nse::trajAccum3d<float>;
template class nse::trajAccum3d<double>;
diff --git a/traj-accum3d.h b/traj-accum3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..18613146ae4dda4443765350f9cfbd378465b228
--- /dev/null
+++ b/traj-accum3d.h
@@ -0,0 +1,128 @@
+#pragma once
+
+#include "grid3d.h"
+#include "ptcl-track-vec3d.h"
+#include "traj3d.h"
+
+
namespace nse
{

	// 3d trajectory accumulator
	// Collects per-particle trajectories in memory, spills them to per-group
	// work files on disk, and converts them to tecplot/binary output.
	// All public file operations taking a Grid3d are MPI-collective.
	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	class trajAccum3d {
	public:

		// set calls
		// ----------------------------------------------------------------------------
		bool set_work_directory(const std::string& dir);
		bool set_group_max_size(const int group_max_size);
		bool set_max_memory_usage(const int max_memory);	// in bytes

		// update trajectory with particles vector
		// ----------------------------------------------------------------------------
		bool update(const ptclTrackVec3d< T >& pvec,
			const T time, const Grid3d< T >& grid);

		// tecplot output
		// ----------------------------------------------------------------------------
		bool write_tecplot(const std::string& filename,
			const Grid3d< T >& grid);
		bool write_tecplot(const std::string& filename, const int idx,
			const Grid3d< T >& grid);

		// binary output
		// ----------------------------------------------------------------------------
		bool write_binary(const std::string& filename,
			const Grid3d< T >& grid);
		bool write_binary(const std::string& filename, const int idx,
			const Grid3d< T >& grid);

		// binary input
		// ----------------------------------------------------------------------------
		bool read_binary(const std::string& filename,
			const Grid3d< T >& grid);
		bool read_binary(const std::string& filename, const int idx,
			const Grid3d< T >& grid);

		// cleanup
		// ----------------------------------------------------------------------------
		bool cleanup(const Grid3d< T >& grid);	// remove work files and directory 
												// & clear all data

		// ----------------------------------------------------------------------------
		// NOTE(review): the class owns raw arrays (list, is_active, is_updated)
		// but has no copy ctor/assignment — copying an instance would double
		// free; consider deleting copy operations if no caller relies on them
		trajAccum3d();
		~trajAccum3d();

	private:

		// resize memory for requested number of elements
		// ----------------------------------------------------------------------------
		void resize(const int m);

		// filesystem connection
		// ----------------------------------------------------------------------------
		bool dump_to_filesystem(const int max_memory, const Grid3d< T >& grid);

	private:
		// data
		// ----------------------------------------------------------------------------
		int max_memory;		// in-memory cap in bytes before spilling to disk
		static const int max_memory_default = 10 * 1024 * 1024;

		int size;						// list size
		traj3d< T > *list;				// list of particle trajectories
		bool *is_active, *is_updated;	// additional trajectories statuses

	private:
		// list of groups additional data structure
		// ----------------------------------------------------------------------------
		int group_max_size;		// tags per group: group = tag / group_max_size + 1
		static const int group_max_size_default = 1024;

		// dynamically grown set of group ids with MPI merge/broadcast helpers
		struct GroupList {
			int *id;		// list of group id's
			int n;			// number of active groups

			void add(const int id);
			void add(const GroupList& groups);
			void add_if_unique(const int id);

			bool is_element(const int id) const;

			void mpi_gather_unique(const int host, const Grid3d< T >& grid);
			void mpi_broadcast(const int host, const Grid3d< T >& grid);

			GroupList();
			~GroupList();

		public:
			void resize(const int m);

		private:
			int mem_size;
			static const int mem_alloc = 16;	// minimum memory allocation block
		} groups;

	private:
		// filesystem data
		// ----------------------------------------------------------------------------
		std::string work_dir;
		std::string work_filename;	// = dir + base
		static const std::string work_dir_default;
		static const std::string work_base_filename;

	private:
		// memory management data
		// ----------------------------------------------------------------------------
		int mem_size;
		static const int mem_alloc = 128;	// minimum memory allocation block
	};
	// -------------------------------------------------------------------------------------------- //
}

// template static member definitions (legal in a header for templates)
template< typename T >
const std::string nse::trajAccum3d< T >::work_dir_default = "traj-work/";
template< typename T >
const std::string nse::trajAccum3d< T >::work_base_filename = "traj-data.psx";
diff --git a/traj-file-handle.cpp b/traj-file-handle.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..57755cf6dfa112977215e7a95695df0fb0971d13
--- /dev/null
+++ b/traj-file-handle.cpp
@@ -0,0 +1,404 @@
+#define _CRT_SECURE_NO_WARNINGS
+#include "traj-file-handle.h"
+
+#include "nse-alloc.h"
+#include "vecmath.h"
+#include "mem-stx.h"
+#include "str-com.h"
+
+
+// Implementation [trajFileHandle]
+// ----------------------------------------------------------------------------
+template< typename T >
+nse::trajFileHandle< T >::trajFileHandle() :
+	nrec(0), mem_size(0)
+{
+}
+
+template< typename T >
+nse::trajFileHandle< T >::trajFileHandle(
+	const trajFileHandle< T >& handle) :
+	nrec(handle.nrec), mem_size(handle.mem_size),
+	ndim(handle.ndim), group(handle.group)
+{
+	if (mem_size > 0) {
+		allocate(&tag, &length, &offset, mem_size);
+		allocate(&start_time, &end_time, mem_size);
+	}
+	
+	if (nrec > 0) {
+		mcopy(tag, handle.tag, nrec);
+		mcopy(length, handle.length, nrec);
+		mcopy(offset, handle.offset, nrec);
+
+		mcopy(start_time, handle.start_time, nrec);
+		mcopy(end_time, handle.end_time, nrec);
+	}
+}
+
+template< typename T >
+nse::trajFileHandle< T >::~trajFileHandle()
+{
+	if (mem_size > 0) {
+		deallocate(tag, length, offset);
+		deallocate(start_time, end_time);
+
+		mem_size = 0;
+	}
+
+	nrec = 0;
+}
+// ----------------------------------------------------------------------------
+
+// file conversion operations
+// ----------------------------------------------------------------------------
+template< typename T >
+bool nse::trajFileHandle< T >::convert_to_tecplot(
+	const std::string& tecplot_filename,	// out: tecplot file
+	const std::string& data_filename)		// in[out]: binary file unformatted -> formatted
+{
+	if (!format(data_filename)) return false;
+
+	//
+	// converting formatted binary to tecplot file
+	//	assuming: 2D || 3D formatted data & descriptor is correct & sorted
+	//
+	FILE *data_ptr = fopen(data_filename.c_str(), "rb");
+	if (data_ptr == NULL) return false;
+
+	FILE *tecplot_ptr = fopen(tecplot_filename.c_str(), "w");
+	if (tecplot_ptr == NULL) { fclose(data_ptr); return false; }
+
+	if (ndim == 2) {
+		fprintf(tecplot_ptr, " TITLE = \"Trajectory 2D\"\n");
+		fprintf(tecplot_ptr, " VARIABLES = \"X\", \"Y\", \"Time\"\n");
+	}
+	if (ndim == 3) {
+		fprintf(tecplot_ptr, " TITLE = \"Trajectory 3D\"\n");
+		fprintf(tecplot_ptr, " VARIABLES = \"X\", \"Y\", \"Z\", \"Time\"\n");
+	}
+
+	// skipping binary file header
+	fseek(data_ptr, 4 * sizeof(int), SEEK_CUR);
+
+	T *iobuf;
+	int iobuf_id;
+	for (int i = 0; i < nrec; i++)
+	{
+		fprintf(tecplot_ptr, " ZONE T = \"%i\", I = %i, DATAPACKING = POINT\n",
+			tag[i], length[i]);
+
+		// skipping binary record header
+		fseek(data_ptr, 2 * sizeof(int) + 2 * sizeof(T), SEEK_CUR);
+
+		iobuf_id = memStx::get_buf(&iobuf, (ndim + 1) * length[i]);
+		if (fread(iobuf, sizeof(T), (ndim + 1) * length[i], data_ptr) !=
+			(ndim + 1) * length[i])
+		{
+			memStx::free_buf(iobuf_id);
+			fclose(data_ptr);
+			fclose(tecplot_ptr);
+			return false;
+		}
+
+		if (ndim == 2) {
+			for (int j = 0; j < length[i]; j++) {
+				fprintf(tecplot_ptr, "%f %f %f\n",
+					iobuf[3 * j], iobuf[3 * j + 1], iobuf[3 * j + 2]);
+			}
+		}
+		if (ndim == 3) {
+			for (int j = 0; j < length[i]; j++) {
+				fprintf(tecplot_ptr, "%f %f %f %f\n",
+					iobuf[4 * j], iobuf[4 * j + 1], iobuf[4 * j + 2], iobuf[4 * j + 3]);
+			}
+		}
+
+		memStx::free_buf(iobuf_id);
+	}
+
+	fclose(data_ptr);
+	fclose(tecplot_ptr);
+	return true;
+}
+
+template< typename T >
+bool nse::trajFileHandle< T >::convert_to_formatted_binary(
+	const std::string& data_formatted_filename,	// out: binary formatted file
+	const std::string& data_filename)			// in[out]: binary file unformatted -> formatted
+{
+	if (!format(data_filename)) return false;
+
+	return copy_file(data_filename, data_formatted_filename);
+}
+// ----------------------------------------------------------------------------
+
+// format binary file [records sorting and merging]
+// ----------------------------------------------------------------------------
+template< typename T >
+bool nse::trajFileHandle< T >::format(
+	const std::string& filename)
+{
+	if (!read_descriptor(filename)) return false;	// reading descriptor
+
+	int nswap = sort_descriptor();	// sorting records by tag & time interval
+	if (nswap == 0) return true;	// file is already formatted
+
+	FILE *ptr = fopen(filename.c_str(), "rb");
+	if (ptr == NULL) return false;
+
+	FILE *tmp = tmpfile();
+	if (tmp == NULL) { fclose(ptr); return false; }
+
+	// writing header
+	int header[4] = { 't' + 'r' + 'j', ndim, sizeof(T), group };
+	if (fwrite(header, sizeof(int), 4, tmp) != 4) {
+		fclose(ptr); fclose(tmp); return false;
+	}
+
+	int i = 0, j;
+	int fidx = 0;	// current number of formatted records
+	while (i < nrec)
+	{
+		// getting index of next different tag
+		j = i + 1;
+		while (j < nrec) {
+			if (tag[i] != tag[j]) break;
+			j++;
+		}
+
+		int flength = 0;	// formatted record length
+		for (int k = i; k < j; k++) flength += length[k];
+
+		// writing record header
+		int record_header[2] = { tag[i], flength };
+		T record_time[2] = { start_time[i], end_time[j - 1] };
+
+		if (fwrite(record_header, sizeof(int), 2, tmp) != 2) {
+			fclose(ptr); fclose(tmp); return false;
+		}
+		if (fwrite(record_time, sizeof(T), 2, tmp) != 2) {
+			fclose(ptr); fclose(tmp); return false;
+		}
+
+		// writing data
+		T *iobuf;
+		int iobuf_id;
+		int io_count;
+		for (int k = i; k < j; k++) {
+
+			fseek(ptr, offset[k], SEEK_SET);
+
+			iobuf_id = memStx::get_buf(&iobuf, (ndim + 1) * length[k]);
+
+			io_count = fread(iobuf, sizeof(T), (ndim + 1) * length[k], ptr);
+			io_count += fwrite(iobuf, sizeof(T), (ndim + 1) * length[k], tmp);
+			if (io_count != 2 * (ndim + 1) * length[k]) {
+				memStx::free_buf(iobuf_id);
+				fclose(ptr); 
+				fclose(tmp);
+				return false;
+			}
+
+			memStx::free_buf(iobuf_id);
+		}
+
+		// setting formatted file handle
+		tag[fidx] = tag[i];
+		length[fidx] = flength;
+
+		start_time[fidx] = start_time[i];
+		end_time[fidx] = end_time[j - 1];
+		fidx++;
+
+		i = j;
+	}
+	nrec = fidx;
+
+	// setting offset for formatted handle
+	if (nrec > 0) {
+		offset[0] = 4 * sizeof(int) +			// header size
+			2 * sizeof(int) + 2 * sizeof(T);	// record header & time
+	}
+	for (int k = 1; k < nrec; k++) {
+		offset[k] = offset[k - 1] +
+			2 * sizeof(int) + 2 * sizeof(T) +			// record header & time
+			(ndim + 1) * length[k - 1] * sizeof(T);		// data
+	}
+
+
+	// copy sorted data back from temporary file
+	fclose(ptr);
+	ptr = fopen(filename.c_str(), "wb");
+	if (ptr == NULL) {
+		fclose(tmp); return false;
+	}
+
+	rewind(tmp);
+
+	const int chunk_size = 32 * 1024;
+	char chunk[chunk_size];
+
+	int num_bytes;
+	while (!feof(tmp)) {
+		num_bytes = fread(chunk, sizeof(char), chunk_size, tmp);
+		if ((num_bytes > 0) && (num_bytes <= chunk_size)) {
+			if (fwrite(chunk, sizeof(char), num_bytes, ptr) != num_bytes) {
+				fclose(ptr);
+				fclose(tmp);
+				return false;
+			}
+		}
+	}
+
+	fclose(ptr);
+	fclose(tmp);
+	return true;
+}
+// ----------------------------------------------------------------------------
+
+// format substeps:
+// ----------------------------------------------------------------------------
+
+// read binary file descriptor
+// ----------------------------------------------------------------------------
+template< typename T >
+bool nse::trajFileHandle< T >::read_descriptor(const std::string& filename)
+{
+	clear_descriptor();
+
+	FILE *ptr = fopen(filename.c_str(), "rb");
+	if (ptr == NULL) return false;
+
+	// reading & checking header [any group id]
+	int header[4];
+	if (fread(header, sizeof(int), 4, ptr) != 4) {
+		fclose(ptr); return false; 
+	}
+	if ((header[0] != 't' + 'r' + 'j') ||
+		((header[1] != 2) && (header[1] != 3)) ||
+		(header[2] != sizeof(T)))
+	{
+		fclose(ptr);
+		return false;
+	}
+
+	// := number of dimensions and group id
+	ndim = header[1];
+	group = header[3];
+
+	// reading all record headers and time
+	int record_header[2];
+	T record_time[2];
+	int rcount;
+	while (!feof(ptr))
+	{
+		rcount = fread(record_header, sizeof(int), 2, ptr);
+		rcount += fread(record_time, sizeof(T), 2, ptr);
+
+		if (rcount == 4) {
+			resize(nrec + 1);
+
+			tag[nrec] = record_header[0];
+			length[nrec] = record_header[1];
+
+			start_time[nrec] = record_time[0];
+			end_time[nrec] = record_time[1];
+
+			fseek(ptr, (ndim + 1) * length[nrec] * sizeof(T), SEEK_CUR);
+			nrec++;
+		}
+	}
+	if (rcount != 0) { fclose(ptr); return false; }
+
+	// setting offset
+	if (nrec > 0) {
+		offset[0] = 4 * sizeof(int) +			// header size
+			2 * sizeof(int) + 2 * sizeof(T);	// record header & time
+	}
+	for (int k = 1; k < nrec; k++) {
+		offset[k] = offset[k - 1] +
+			2 * sizeof(int) + 2 * sizeof(T) +			// record header & time
+			(ndim + 1) * length[k - 1] * sizeof(T);		// data
+	}
+
+	fclose(ptr);
+	return true;
+}
+// ----------------------------------------------------------------------------
+
+// sort descriptor data
+//	insertion sort
+//	in ascending order based on tag and time interval
+// ----------------------------------------------------------------------------
+template< typename T >
+int nse::trajFileHandle< T >::sort_descriptor()
+{
+	int i, j, nswap = 0;
+	bool is_descending;
+	for (i = 1; i < nrec; i++) {
+		j = i;
+		while (j > 0)
+		{
+			is_descending = (tag[j - 1] > tag[j]) ||
+				((tag[j - 1] == tag[j]) &&
+				(start_time[j - 1] >= end_time[j]));
+			if (!is_descending) break;
+
+			nse::swap_vars(tag[j - 1], tag[j]);
+			nse::swap_vars(length[j - 1], length[j]);
+			nse::swap_vars(offset[j - 1], offset[j]);
+
+			nse::swap_vars(start_time[j - 1], start_time[j]);
+			nse::swap_vars(end_time[j - 1], end_time[j]);
+
+			nswap++;
+
+			j--;
+		}
+	}
+
+	return nswap;
+}
+// ----------------------------------------------------------------------------
+
+// clear descriptor data
+// ----------------------------------------------------------------------------
template< typename T >
void nse::trajFileHandle< T >::clear_descriptor()
{
	// drop all record entries; allocated descriptor arrays are kept for reuse
	nrec = 0;
}
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+
+
+// resize memory for requested number of elements
+// ----------------------------------------------------------------------------
+template< typename T >
+void nse::trajFileHandle< T >::resize(const int m)
+{
+	if (m > mem_size)
+	{
+		const int nalloc = max(m, mem_size + mem_alloc);
+
+		reallocate(&tag, nrec, mem_size, nalloc);
+		reallocate(&length, nrec, mem_size, nalloc);
+		reallocate(&offset, nrec, mem_size, nalloc);
+
+		reallocate(&start_time, nrec, mem_size, nalloc);
+		reallocate(&end_time, nrec, mem_size, nalloc);
+
+		mem_size = nalloc;
+	}
+}
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+
+// Initialization [trajFileHandle]
+// ----------------------------------------------------------------------------
+template class nse::trajFileHandle<float>;
+template class nse::trajFileHandle<double>;
diff --git a/traj-file-handle.h b/traj-file-handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..217fc47d155611add005c06376bb8f1b0b05e74b
--- /dev/null
+++ b/traj-file-handle.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <string>
+#include "nse-sys.h"
+
+
+namespace nse
+{
+
+	// File handle for trajectory data
+	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	class trajFileHandle {
	public:

		// file conversion operations
		//   in: binary data, including unformatted (not sorted) data
		//	 out: tecplot file, binary formatted (sorted) data
		// ----------------------------------------------------------------------------
		bool convert_to_tecplot(
			const std::string& tecplot_filename, const std::string& data_filename);

		//   in: binary data, including unformatted (not sorted) data
		//	 out: binary formatted (sorted) file
		// ----------------------------------------------------------------------------
		bool convert_to_formatted_binary(
			const std::string& data_formatted_filename, const std::string& data_filename);


		// ----------------------------------------------------------------------------
		trajFileHandle();
		trajFileHandle(const trajFileHandle< T >& handle);
		~trajFileHandle();

	private:

		// format binary file with trajectory data
		//	- 1) read file descriptor
		//	- 2) sort descriptor data by tag and time
		//	- 3) rewrite data by sorted descriptor
		// ----------------------------------------------------------------------------
		bool format(const std::string& filename);

		// format substeps
		// ----------------------------------------------------------------------------
		bool read_descriptor(const std::string& filename);	// fills descriptor arrays below
		int sort_descriptor();		// return number of swaps
		void clear_descriptor();	// drops records, keeps allocated arrays

		// resize memory for requested number of elements
		// ----------------------------------------------------------------------------
		void resize(const int m);

	private:
		// descriptor data: one entry per record in the binary file
		// ----------------------------------------------------------------------------
		int ndim;		// number of dimensions [2 or 3, taken from file header]
		int group;		// group id
		int nrec;		// number of records

		int *tag, *length, *offset;		// arrays[nrec]
										//	- length: number of trajectory points in record
										//	- offset: record offset in bytes from file start
										// NOTE(review): offset is int (bytes) -- files
										// over 2GB would overflow; confirm expected sizes
		T *start_time, *end_time;		// arrays[nrec], record time interval

	private:
		// memory management data
		// ----------------------------------------------------------------------------
		int mem_size;						// allocated memory size
		static const int mem_alloc = 8;		// minimum memory allocation block
	};
+	// -------------------------------------------------------------------------------------------- //
+}
diff --git a/traj3d.cpp b/traj3d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..79fada90201b6ca598912d8d550ee26d7456cd88
--- /dev/null
+++ b/traj3d.cpp
@@ -0,0 +1,255 @@
+#define _CRT_SECURE_NO_WARNINGS
+#include "traj3d.h"
+
+#include "nse-alloc.h"
+#include "vecmath.h"
+#include "mem-stx.h"
+
+#include <stdio.h>
+
+
+// Implementation [traj3d]
+// ----------------------------------------------------------------------------
+template< typename T >
+nse::traj3d< T >::traj3d() :
+	n(0), mem_size(0), tag(-1), group(-1)
+{
+}
+
+template< typename T >
+nse::traj3d< T >::traj3d(
+	const traj3d< T >& traj) :
+	n(traj.n), mem_size(traj.mem_size),
+	tag(traj.tag), group(traj.group)
+{
+	if (mem_size > 0) {
+		allocate(&x, &y, &z, mem_size);
+		allocate(&time, mem_size);
+	}
+
+	if (n > 0) {
+		mcopy(x, traj.x, n);
+		mcopy(y, traj.y, n);
+		mcopy(z, traj.z, n);
+
+		mcopy(time, traj.time, n);
+	}
+}
+
template< typename T >
nse::traj3d< T >::~traj3d()
{
	free();		// releases x/y/z/time storage and resets tag/group/n
}
+// ----------------------------------------------------------------------------
+
+// swap and assignment operator
+// ----------------------------------------------------------------------------
+template< typename T >
+const nse::traj3d< T >& nse::traj3d< T >::operator=(
+	traj3d< T > traj)
+{
+	swap(traj);
+	return (*this);
+}
+
+template< typename T >
+void nse::traj3d< T >::swap(traj3d< T >& traj)
+{
+	nse::swap_vars(n, traj.n);
+	nse::swap_vars(mem_size, traj.mem_size);
+	nse::swap_vars(tag, traj.tag);
+	nse::swap_vars(group, traj.group);
+
+	nse::swap_vars(x, traj.x);
+	nse::swap_vars(y, traj.y);
+	nse::swap_vars(z, traj.z);
+	nse::swap_vars(time, traj.time);
+}
+// ----------------------------------------------------------------------------
+
+// get calls
+// ----------------------------------------------------------------------------
+template< typename T >
+T nse::traj3d< T >::get_start_time() const
+{
+	if (n > 0) return *time;
+	else
+		return T();
+}
+template< typename T >
+T nse::traj3d< T >::get_end_time() const
+{
+	if (n > 0) return time[n - 1];
+	else
+		return T();
+}
+// ----------------------------------------------------------------------------
+
+// add particle coordinates to trajectory
+// ----------------------------------------------------------------------------
+template< typename T >
+void nse::traj3d< T >::push_back(
+	const T _x, const T _y, const T _z, const T _time)
+{
+	resize(n + 1);
+
+	x[n] = _x;
+	y[n] = _y;
+	z[n] = _z;
+	time[n] = _time;
+
+	n++;
+}
+// ----------------------------------------------------------------------------
+
+// copy particle trajectory
+// ----------------------------------------------------------------------------
+template< typename T >
+void nse::traj3d< T >::copy_to_pointer(T* _RESTRICT buf) const
+{
+	int k;
+#pragma omp parallel for private(k) shared(buf)
+	for (k = 0; k < n; k++) {
+		buf[4 * k] = x[k];
+		buf[4 * k + 1] = y[k];
+		buf[4 * k + 2] = z[k];
+		buf[4 * k + 3] = time[k];
+	}
+}
+
+template< typename T >
+void nse::traj3d< T >::copy_to_pointer(
+	T* _RESTRICT buf, const int idx, const int length) const
+{
+	int k;
+#pragma omp parallel for private(k) shared(buf)
+	for (k = 0; k < length; k++) {
+		buf[4 * k] = x[k + idx];
+		buf[4 * k + 1] = y[k + idx];
+		buf[4 * k + 2] = z[k + idx];
+		buf[4 * k + 3] = time[k + idx];
+	}
+}
+// ----------------------------------------------------------------------------
+
+// clear-free subroutines
+// ----------------------------------------------------------------------------
template< typename T >
void nse::traj3d< T >::clear()
{
	// remove all trajectory points; allocated memory is kept for reuse
	n = 0;
}
+
+template< typename T >
+void nse::traj3d< T >::free()
+{
+	if (mem_size > 0) {
+		deallocate(x, y, z);
+		deallocate(time);
+
+		mem_size = 0;
+	}
+
+	n = 0;
+	tag = -1;
+	group = -1;
+}
+// ----------------------------------------------------------------------------
+
+// I/O operations
+// ----------------------------------------------------------------------------
+template< typename T >
+bool nse::traj3d< T >::dump_to_file(
+	const std::string& filename, const bool create_file)
+{
+	if (create_file) {	// creating data file for group
+		if (!create_dump_file(filename, group)) return false;
+	}
+	else
+	{
+		// checking file header
+		FILE *ptr = fopen(filename.c_str(), "rb");
+		if (ptr == NULL) return false;
+
+		int header[4];
+		int nstatus = fread(header, sizeof(int), 4, ptr);
+
+		fclose(ptr);
+		if ((header[0] != 't' + 'r' + 'j') ||
+			(header[1] != 3) ||
+			(header[2] != sizeof(T)) ||
+			(header[3] != group)) return false;
+	}
+
+	if (n == 0) return true;	// empty trajectory case
+
+								// appending trajectory data
+	FILE *ptr = fopen(filename.c_str(), "ab");
+	if (ptr == NULL) return false;
+
+	// record header
+	int record_header[2] = { tag, n };
+	T record_time[2] = { get_start_time(), get_end_time() };
+	fwrite(record_header, sizeof(int), 2, ptr);
+	fwrite(record_time, sizeof(T), 2, ptr);
+
+	// data
+	T *iobuf;
+	int iobuf_id = memStx::get_buf(&iobuf, 4 * n);
+	copy_to_pointer(iobuf);
+
+	fwrite(iobuf, sizeof(T), 4 * n, ptr);
+
+	memStx::free_buf(iobuf_id);
+
+	fclose(ptr);
+	clear();		// removing trajectory data from memory
+	return true;
+}
+
+template< typename T >
+bool nse::traj3d< T >::create_dump_file(
+	const std::string& filename, const int group)
+{
+	FILE *ptr = fopen(filename.c_str(), "wb");
+	if (ptr == NULL) return false;
+
+	const int header[4] = {
+		't' + 'r' + 'j',		// binary id
+		3,						// number of dimensions
+		sizeof(T),				// data type size
+		group,					// group id
+	};
+
+	fwrite(header, sizeof(int), 4, ptr);
+	fclose(ptr);
+	return true;
+}
+// ----------------------------------------------------------------------------
+
+// resize memory for requested number of elements
+// ----------------------------------------------------------------------------
+template< typename T >
+void nse::traj3d< T >::resize(const int m)
+{
+	if (m > mem_size)
+	{
+		const int nalloc = max(m, mem_size + mem_alloc);
+		reallocate(&x, n, mem_size, nalloc);
+		reallocate(&y, n, mem_size, nalloc);
+		reallocate(&z, n, mem_size, nalloc);
+		reallocate(&time, n, mem_size, nalloc);
+
+		mem_size = nalloc;
+	}
+}
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+
+// Initialization [traj3d]
+// ----------------------------------------------------------------------------
+template class nse::traj3d<float>;
+template class nse::traj3d<double>;
diff --git a/traj3d.h b/traj3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..cac4e3c2029e355058addf32ebf78e62142c869a
--- /dev/null
+++ b/traj3d.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <string>
+#include "nse-sys.h"
+
+
+namespace nse
+{
+	// forward declarations
+	// -------------------------------------------------------------------------------------------- //
+	template< typename T > class trajAccum3d;
+
+
	// 3d trajectory data
+	// -------------------------------------------------------------------------------------------- //
	template< typename T >
	class traj3d {

		friend class nse::trajAccum3d< T >;

	public:

		// add particle coordinates to trajectory
		//	appends a single (x, y, z) point with its time stamp
		// ----------------------------------------------------------------------------
		void push_back(const T x, const T y, const T z, const T time);

		// swap and assignment operator
		//	assignment uses copy-and-swap (argument taken by value)
		// ----------------------------------------------------------------------------
		void swap(traj3d< T >& traj);
		const traj3d& operator=(traj3d< T > traj);

		// get calls
		//	time of the first/last point; T() when the trajectory is empty
		// ----------------------------------------------------------------------------
		T get_start_time() const;
		T get_end_time() const;

		// copy particle trajectory
		//	packs points as consecutive (x, y, z, time) quadruplets into [buf]
		// ----------------------------------------------------------------------------
		void copy_to_pointer(T* _RESTRICT buf) const;
		void copy_to_pointer(T* _RESTRICT buf, const int idx, const int length) const;

		// I/O operations
		//	dump_to_file appends the trajectory as one record and clears it on success
		// ----------------------------------------------------------------------------
		bool dump_to_file(const std::string& filename, const bool create_file);
		static bool create_dump_file(const std::string& filename, const int group);

		// clear-free subroutines
		// ----------------------------------------------------------------------------
		void clear();	// remove trajectory but keep memory
		void free();	// remove trajectory and memory


		// ----------------------------------------------------------------------------
		traj3d();
		traj3d(const traj3d< T >& traj);
		~traj3d();

	private:

		// resize memory for requested number of elements
		// ----------------------------------------------------------------------------
		void resize(const int m);

		int tag, group;			// trajectory tag & group [-1 when unset]
		int n;					// number of elements 
		T *x, *y, *z, *time;	// coordinates & time stamps, parallel arrays[n]

		int mem_size;						// allocated memory size
		static const int mem_alloc = 128;	// minimum memory allocation block
	};
+	// -------------------------------------------------------------------------------------------- //
+}
diff --git a/vecmath.h b/vecmath.h
new file mode 100644
index 0000000000000000000000000000000000000000..6dbe9f08e9e4ab73e90cba5cc0b0532888d0e54f
--- /dev/null
+++ b/vecmath.h
@@ -0,0 +1,2774 @@
+#pragma once
+
+// [vecmath.h]: vector math simple template functions
+//
+// -------------------------------------------------------------------------------------------- //
+// TO DO:
+//	- use assume_aligned for INTEL compiler
+// 
+
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES
+#endif
+
+#include "nse-sys.h"
+#include "nse-alloc.h"
+
+#include <math.h>
+#ifdef USE_EXPLICIT_SSE
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#endif
+
+#ifndef EXCLUDE_GPU_BRANCH
+#include "vecmath.cuh"
+#endif
+
+namespace nse
+{
+	// * point functions * //
+	template< typename T > T min(const T a, const T b);
+	template< typename T > T max(const T a, const T b);
+
+	template< typename T > T min(const T a, const T b, const T c);
+	template< typename T > T max(const T a, const T b, const T c);
+
+	template< typename T > T min(const T a, const T b, const T c, const T d);
+	template< typename T > T max(const T a, const T b, const T c, const T d);
+
+	template< typename T > T min(const T a, const T b, const T c, const T d, const T e);
+	template< typename T > T max(const T a, const T b, const T c, const T d, const T e);
+
+	template< typename T > T sign(const T x);
+	template< typename T > T sign(const T x, const T eps);
+
+	template< typename T > T minmod_limit(const T x, const T y);
+	template< typename T > T superbee_limit(const T x, const T y);
+
+	template< typename T > T delta(const T x, const T eps);
+
+	template< typename T > T heavy_side(const T x);
+	template< typename T > T heavy_side(const T x, const T eps);
+
+	template< typename T > T linear_step(const T x, const T eps);
+	// ----------------------------------------------------------------------------------------- //
+
+	// * allocation null vectors * //
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, T** y, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, T** y, const int nx, const int ny);
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, T** y, T** z, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, T** y, T** z, const int nx, const int ny, const int nz);
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, T** y, T** z, T** p, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, T** y, T** z, T** p, T** q, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, T** y, T** z, T** p, T** q, T** s, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, T** y, T** z, 
+		T** p, T** q, T** s, T** u, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, T** y, T** z,
+		T** p, T** q, T** s, T** u, T** v, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, T** y, T** z,
+		T** p, T** q, T** s, T** u, T** v, T** w, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, T** y, T** z, T** p, T** q, T** s, 
+		T** u, T** v, T** w, T** a, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, T** y, T** z, T** p, T** q, T** s,
+		T** u, T** v, T** w, T** a, T** b, const int n);
+	template< memType mem = memCPU, typename T >
+	bool allocate_vnull(T** x, T** y, T** z, T** p, T** q, T** s,
+		T** u, T** v, T** w, T** a, T** b, T** c, const int n);
+
+	// * array reduction * //
+	template< memType mem = memCPU, typename T >
+	T min(const T* _RESTRICT const x, const int n);
+
+	template< memType mem = memCPU, typename T >
+	T max(const T* _RESTRICT const x, const int n);
+
+	template< memType mem = memCPU, typename T >
+	T sum(const T* _RESTRICT const x, const int n);
+
+	template< memType mem = memCPU, typename T > // = (x,y)
+	T dot_product(const T* _RESTRICT const x, const T* _RESTRICT const y, const int n);
+	template< typename T, typename CType >	// = (x,y)
+	T dot_product_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y,
+		const CType* _RESTRICT const mask, const CType check, const int n);
+
+	template< memType mem = memCPU, typename T > // = (x,x)
+	T sqr_sum(const T* _RESTRICT const x, const int n);
+	template< typename T, typename CType >
+	T sqr_sum_ifeq(const T* _RESTRICT const x,
+		const CType* _RESTRICT const mask, const CType check, const int n);
+
+	template< memType mem = memCPU, typename T > // = (x,x) & (x,y)
+	void sqr_sum_and_dp(const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+		T* _RESTRICT sum, T* _RESTRICT dp);
+	template< typename T, typename CType >
+	void sqr_sum_and_dp_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y, 
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		T* _RESTRICT sum, T* _RESTRICT dp);
+
+	template< memType mem = memCPU, typename T > // = sqrt(x,x)
+	T lnorm(const T* _RESTRICT const x, const int n);
+	template< typename T, typename CType >
+	T lnorm_ifeq(const T* _RESTRICT const x,
+		const CType* _RESTRICT const mask, const CType check, const int n);
+
+	template< memType mem = memCPU, typename T > // = sqrt(x,x) & (x,y)
+	void lnorm_and_dp(const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+		T* _RESTRICT norm, T* _RESTRICT dp);
+	template< typename T, typename CType >
+	void lnorm_and_dp_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y, 
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		T* _RESTRICT norm, T* _RESTRICT dp);
+
+	template< memType mem = memCPU, typename T > // = sqrt(x,x) & (x,x)
+	void lnorm_and_sqr_sum(const T* _RESTRICT const x, const int n,
+		T* _RESTRICT norm, T* _RESTRICT sum);
+	template< typename T, typename CType >
+	void lnorm_and_sqr_sum_ifeq(const T* _RESTRICT const x, 
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		T* _RESTRICT norm, T* _RESTRICT sum);
+
+	template< memType mem = memCPU, typename T > // = max|x|
+	T cnorm(const T* _RESTRICT const x, const int n);
+	template< typename T, typename CType >
+	T cnorm_ifeq(const T* _RESTRICT const x,
+		const CType* _RESTRICT const mask, const CType check, const int n);
+
+	template< memType mem = memCPU, typename T > // = max|x| & (x,y)
+	void cnorm_and_dp(const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+		T* _RESTRICT norm, T* _RESTRICT dp);
+	template< typename T, typename CType >
+	void cnorm_and_dp_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y, 
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		T* _RESTRICT norm, T* _RESTRICT dp);
+
+	template< memType mem = memCPU, typename T > // = max|x| & (x,x)
+	void cnorm_and_sqr_sum(const T* _RESTRICT const x, const int n,
+		T* _RESTRICT norm, T* _RESTRICT sum);
+	template< typename T, typename CType >
+	void cnorm_and_sqr_sum_ifeq(const T* _RESTRICT const x, 
+		const CType* _RESTRICT const mask, const CType check, const int n,
+		T* _RESTRICT norm, T* _RESTRICT sum);
+
+	template< memType mem = memCPU, typename T > // = sum|x|
+	T l1norm(const T* _RESTRICT const x, const int n);
+	// ----------------------------------------------------------------------------------------- //
+
+	// * vector math * //
+	template< memType mem = memCPU, typename T > // x = 0
+	void null(T* _RESTRICT x, const int n);
+	template< typename T, typename CType >
+	void null_ifeq(T* _RESTRICT x,
+		const CType* _RESTRICT const mask, const CType check, const int n);
+
+	template< memType mem = memCPU, typename T > // x += alpha
+	void update(T* _RESTRICT x,
+		const T alpha, const int n);
+	template< typename T, typename CType >
+	void update_ifeq(T* _RESTRICT x,
+		const T alpha,
+		const CType* _RESTRICT const mask, const CType check, const int n);
+
+	template< memType mem = memCPU, typename T > // x += alpha * y
+	void update(T* _RESTRICT x,
+		const T alpha, const T* _RESTRICT const y, const int n);
+	template< typename T, typename CType >
+	void update_ifeq(T* _RESTRICT x,
+		const T alpha, const T* _RESTRICT const y,
+		const CType* _RESTRICT const mask, const CType check, const int n);
+
+	template< memType mem = memCPU, typename T > // x += y
+	void update(T* _RESTRICT x,
+		const T* _RESTRICT const y, const int n);
+
+	template< memType mem = memCPU, typename T > // x += alpha * y + beta * z
+	void update(T* _RESTRICT x,
+		const T alpha, const T* _RESTRICT const y,
+		const T beta, const T* _RESTRICT const z, const int n);
+
+	template< memType mem = memCPU, typename T > // x += alpha * y + beta * z + gamma * w
+	void update(T* _RESTRICT x,
+		const T alpha, const T* _RESTRICT const y,
+		const T beta, const T* _RESTRICT const z,
+		const T gamma, const T* _RESTRICT const w, const int n);
+
+	template< memType mem = memCPU, typename T > // x += alpha * z, y += beta * w 
+	void update(T* _RESTRICT x, T* _RESTRICT y,
+		const T alpha, const T beta,
+		const T* _RESTRICT const z, const T* _RESTRICT const w, const int n);
+	template< typename T, typename CType >
+	void update_ifeq(T* _RESTRICT x, T* _RESTRICT y,
+		const T alpha, const T beta,
+		const T* _RESTRICT const z, const T* _RESTRICT const w,
+		const CType* _RESTRICT const mask, const CType check, const int n);
+
+#ifdef USE_EXPLICIT_SSE
+	void update_sse(float* _RESTRICT x, float* _RESTRICT y,
+		const float alpha, const float beta,
+		const float* _RESTRICT const z, const float* _RESTRICT const w, const int n);
+	void update_sse(double* _RESTRICT x, double* _RESTRICT y,
+		const double alpha, const double beta,
+		const double* _RESTRICT const z, const double* _RESTRICT const w, const int n);
+#endif
+
+	template< memType mem = memCPU, typename T > // x = alpha
+	void assign(T* _RESTRICT x, const T alpha, const int n);
+
+	template< memType mem = memCPU, typename T > // x = alpha * y
+	void assign(T* _RESTRICT x,
+		const T alpha, const T* _RESTRICT const y, const int n);
+
+	template< memType mem = memCPU, typename T > // x = alpha * y + beta * z
+	void assign(T* _RESTRICT x,
+		const T alpha, const T* _RESTRICT const y,
+		const T beta, const T* _RESTRICT const z, const int n);
+	template< typename T, typename CType >
+	void assign_ifeq(T* _RESTRICT x,
+		const T alpha, const T* _RESTRICT const y,
+		const T beta, const T* _RESTRICT const z, 
+		const CType* _RESTRICT const mask, const CType check, const int n);
+
+	template< memType mem = memCPU, typename T > // x = alpha * y + beta * z + gamma * w
+	void assign(T* _RESTRICT x,
+		const T alpha, const T* _RESTRICT const y,
+		const T beta, const T* _RESTRICT const z,
+		const T gamma, const T* _RESTRICT const w, const int n);
+	template< typename T, typename CType >
+	void assign_ifeq(T* _RESTRICT x,
+		const T alpha, const T* _RESTRICT const y,
+		const T beta, const T* _RESTRICT const z,
+		const T gamma, const T* _RESTRICT const w, 
+		const CType* _RESTRICT const mask, const CType check, const int n);
+
+	template< memType mem = memCPU, typename T > // x = alpha * y + beta * z + gamma * w + zeta * p
+	void assign(T* _RESTRICT x,
+		const T alpha, const T* _RESTRICT const y,
+		const T beta, const T* _RESTRICT const z,
+		const T gamma, const T* _RESTRICT const w, 
+		const T delta, const T* _RESTRICT const p, const int n);
+
+	template< memType mem = memCPU, typename T >	// sum = x + y
+	void vsum(T* _RESTRICT sum,
+		const T* _RESTRICT const x, const T* _RESTRICT const y, const int n);
+
+	template< memType mem = memCPU, typename T >	// sum = x + y + z
+	void vsum(T* _RESTRICT sum,
+		const T* _RESTRICT const x, const T* _RESTRICT const y, 
+		const T* _RESTRICT const z, const int n);
+
+	template< memType mem = memCPU, typename T >	// sum = x + y + z + p
+	void vsum(T* _RESTRICT sum,
+		const T* _RESTRICT const x, const T* _RESTRICT const y,
+		const T* _RESTRICT const z, const T* _RESTRICT const p, const int n);
+
+	template< memType mem = memCPU, typename T >	// sum = x + y + z + p + q
+	void vsum(T* _RESTRICT sum,
+		const T* _RESTRICT const x, const T* _RESTRICT const y,
+		const T* _RESTRICT const z, const T* _RESTRICT const p, 
+		const T* _RESTRICT const q, const int n);
+
+	template< memType mem = memCPU, typename T >	// sum = x + y + z + p + q + w
+	void vsum(T* _RESTRICT sum,
+		const T* _RESTRICT const x, const T* _RESTRICT const y,
+		const T* _RESTRICT const z, const T* _RESTRICT const p,
+		const T* _RESTRICT const q, const T* _RESTRICT const w, const int n);
+
+	template< memType mem = memCPU, typename T >	// sum = x + y + z + p + q + w + v
+	void vsum(T* _RESTRICT sum,
+		const T* _RESTRICT const x, const T* _RESTRICT const y,
+		const T* _RESTRICT const z, const T* _RESTRICT const p,
+		const T* _RESTRICT const q, const T* _RESTRICT const w,
+		const T* _RESTRICT const v, const int n);
+
+	template< memType mem = memCPU, typename T >	// sub = x - y
+	void vsub(T* _RESTRICT sub,
+		const T* _RESTRICT const x, const T* _RESTRICT const y, const int n);
+
+	template< memType mem = memCPU, typename T > // x *= value
+	void mul(T* _RESTRICT x, const T value, const int n);
+
	template< memType mem = memCPU, typename T > // y *= x
	void mul(T* _RESTRICT y, const T* x, const int n);

	template< memType mem = memCPU, typename T >
	void mul(T* _RESTRICT y, // y = x * z
		const T* _RESTRICT const x, const T* _RESTRICT const z, const int n);

	template< memType mem = memCPU, typename T >
	void vdiv(T* _RESTRICT y, // y = x / z
		const T* _RESTRICT const x, const T* _RESTRICT const z, const int n);
	// ----------------------------------------------------------------------------------------- //

	// * statistics * //
	// NOTE: inputs hold precomputed moments (e.g. x = E(x), sqrx = E(x^2)), length n
	template< typename T >	// ( E(x^2) - E(x)*E(x) )
	void variance(T* _RESTRICT var,
		const T* _RESTRICT const x, const T* _RESTRICT const sqrx, const int n);

	template< typename T >	// sqrt( E(x^2) - E(x)*E(x) )
	void deviation(T* _RESTRICT sigma,
		const T* _RESTRICT const x, const T* _RESTRICT const sqrx, const int n);

	template< typename T >	// sqrt( |E(x^2) - E(x)*E(x)| )
	void deviation_abs(T* _RESTRICT sigma,
		const T* _RESTRICT const x, const T* _RESTRICT const sqrx, const int n);

	template< typename T >	// sqrt( Var(x) = {E(x^2) - E(x)*E(x)} )
	void deviation(T* _RESTRICT sigma,
		const T* _RESTRICT const varx, const int n);

	template< typename T >
	T max_deviation(const T* _RESTRICT const x, const T* _RESTRICT const sqrx, const int n);

	template< typename T >	// E(x*y) - E(x)*E(y)
	void covariance(T* _RESTRICT cov,
		const T* _RESTRICT const x, const T* _RESTRICT const y,
		const T* _RESTRICT const xy, const int n);

	template< typename T >	// E(x^3) - 3 * E(x)*E(x^2) + 2 * [E(x)]^3
	void skewness(T* _RESTRICT skew,
		const T* _RESTRICT const x, const T* _RESTRICT const x2,
		const T* _RESTRICT const x3, const int n);

	template< typename T >	// E(x^2*y) - E(x^2)*E(y) - 2 * E(x)*E(x*y) + 2 * E(x)*E(x)*E(y)
	void coskewness(T* _RESTRICT coskew,
		const T* _RESTRICT const x, const T* _RESTRICT const y,
		const T* _RESTRICT const x2, const T* _RESTRICT const xy,
		const T* _RESTRICT const x2y, const int n);
	// ----------------------------------------------------------------------------------------- //

	// * dense matrix operations * //
	// NOTE: matrices are stored as dense row-major n*n arrays (see implementations)
	template< memType mem = memCPU, typename T >  // y[n] = matrix[n * n] * x[n]
	void matvec(T* _RESTRICT y,
		const T* _RESTRICT const matrix, const T* _RESTRICT const x, const int n);

	template< typename T >  // y[n] = x[n] * matrix[n * n]
	void vecmat(T* _RESTRICT y,
		const T* _RESTRICT const x, const T* _RESTRICT const matrix, const int n);

	template< memType mem = memCPU, typename T >  // y[n] = rhs[ n ] - matrix[n * n] * x[n]
	void resvec(T* _RESTRICT res,
		const T* _RESTRICT const rhs, const T* _RESTRICT const matrix,
		const T* _RESTRICT const x, const int n);

	// * matrix simple operations * //
	// determinants of small fixed-size (2x2, 3x3, 4x4) and general n*n matrices
	template< typename T >
	T det(const T a11, const T a12,
		const T a21, const T a22);

	template< typename T >
	T det(const T a11, const T a12, const T a13,
		const T a21, const T a22, const T a23,
		const T a31, const T a32, const T a33);

	template< typename T >
	T det(const T a11, const T a12, const T a13, const T a14,
		const T a21, const T a22, const T a23, const T a24,
		const T a31, const T a32, const T a33, const T a34,
		const T a41, const T a42, const T a43, const T a44);

	template< typename T >  // matrix[n * n]
	T det(const T* _RESTRICT const matrix, const int n);

	template< typename T >  // matrix[n * n] cofactor(i, j) determinant 
	T cofactor(const int i, const int j,
		const T* _RESTRICT const matrix, const int size);

	template< typename T >  // matrix[n * n] inverse
	bool inverse(T* _RESTRICT inv_matrix,
		const T* _RESTRICT const matrix, const int n);
	// ----------------------------------------------------------------------------------------- //

	// * interpolation * //
	template< typename T >
	T interp_bilinear(const T x, const T y,
		const T v00, const T v10,	// - x line
		const T v01, const T v11);	// - x line with y shift

	template< typename T >
	T interp_bilinear(const T x, const T y,
		const T sx, const T sy, const T dx, const T dy,
		const T v00, const T v10,	// - x line
		const T v01, const T v11);	// - x line with y shift

	template< typename T >
	T interp_trilinear(const T x, const T y, const T z,
		const T v000, const T v100,		// - x line
		const T v010, const T v110,		// - x line with y shift
		const T v001, const T v101,		// - x line with z shift
		const T v011, const T v111);	// - x line with y,z shift

	template< typename T >
	T interp_trilinear(const T x, const T y, const T z,
		const T sx, const T sy, const T sz, const T dx, const T dy, const T dz,
		const T v000, const T v100,		// - x line
		const T v010, const T v110,		// - x line with y shift
		const T v001, const T v101,		// - x line with z shift
		const T v011, const T v111);	// - x line with y,z shift

	template< typename T >		// interpolation at [z = pz] of X(z)
	T interp(const T pz,
		const T* _RESTRICT X, const T* _RESTRICT z, const int n);


	template< typename T >	// * ODE solver * //
	// 4th-order Runge-Kutta stepper; see the implementation for the exact ODE system
	void runge_kutta_o4(T* _RESTRICT ynext, T* _RESTRICT unext,
		const T y0, const T u0, const T f,
		const T alpha, const T beta,

		const T dt, const int niters);
	// ----------------------------------------------------------------------------------------- //

	// * simple damping functions * //
	template< typename T >
	T linear_damping(const T t, const T T0, const T T1);

	template< typename T >
	T exp_damping(const T t, const T T0, const T Tperiod);
	// ----------------------------------------------------------------------------------------- //

	// * check for finite values * //
	template< memType mem = memCPU, typename T >
	bool is_finite(const T* _RESTRICT const x, const int n);
	// ----------------------------------------------------------------------------------------- //
}
+
+// ----------------------------------------------------------------------------------------- //
+// * Implementation *
+// ----------------------------------------------------------------------------------------- //
+template< typename T >
+inline T nse::min(
+	const T a, const T b)
+{
+	return (a < b) ? a : b;
+}
+
+template< typename T >
+inline T nse::max(
+	const T a, const T b)
+{
+	return (a > b) ? a : b;
+}
+
+template< typename T >
+inline T nse::min(
+	const T a, const T b, const T c)
+{
+	return (a < b) ? ((a < c) ? a : c) : ((b < c) ? b : c);
+}
+
+template< typename T >
+inline T nse::max(
+	const T a, const T b, const T c)
+{
+	return (a > b) ? ((a > c) ? a : c) : ((b > c) ? b : c);
+}
+
+template< typename T >
+inline T nse::min(
+	const T a, const T b, const T c, const T d)
+{
+	return min(min(a, b, c), d);
+}
+
+template< typename T >
+inline T nse::max(
+	const T a, const T b, const T c, const T d)
+{
+	return max(max(a, b, c), d);
+}
+
+template< typename T >
+inline T nse::min(
+	const T a, const T b, const T c, const T d, const T e)
+{
+	return min(min(a, b), min(c, d, e));
+}
+
+template< typename T >
+inline T nse::max(
+	const T a, const T b, const T c, const T d, const T e)
+{
+	return max(max(a, b), max(c, d, e));
+}
+
+
+template< typename T >
+inline T nse::sign(
+	const T x)
+{
+	return (x > (T)0) ? (T)1 :
+		((x < (T)0) ? (T)-1 : (T)0);
+}
+
+template< typename T >
+inline T nse::sign(const T x, const T eps)
+{
+	if (fabs(x) < eps) return (T)0;
+	else
+		return (x < (T)0) ? -(T)1 : (T)1;
+}
+
+template< typename T >
+inline T nse::minmod_limit(const T x, const T y)
+{
+	if (x * y <= (T)0) return (T)0;
+
+	return (fabs(y) < fabs(x)) ? y : x;
+}
+
+template< typename T >
+inline T nse::superbee_limit(const T x, const T y)
+{
+	if (x * y <= (T)0) return (T)0;
+
+	const T xabs = fabs(x), yabs = fabs(y);
+	const T sig_x = (T)(((T)0 < x) - (x < (T)0));
+
+	if ((xabs > yabs + yabs) || (yabs > xabs + xabs))
+		return (T) 2.0 * sig_x * ((xabs < yabs) ? xabs : yabs);
+	else
+		return sig_x * ((xabs > yabs) ? xabs : yabs);
+}
+
+template< typename T >
+inline T nse::delta(const T x, const T eps)
+{
+	if ((x > eps) || (x < -eps)) return (T) 0.0;
+
+	T ieps = (T)1 / eps;
+	return (T) 0.5 * (
+		(T)1 + cos((T)M_PI * x * ieps)) * ieps;
+}
+
+template< typename T >
+inline T nse::heavy_side(const T x)
+{
+	return (x < (T)0) ? (T)0 : (T)1;
+}
+
+template< typename T >
+inline T nse::heavy_side(const T x, const T eps)
+{
+	if (x < -eps) return (T)0;
+	if (x > eps) return (T)1;
+
+	const T z = x / eps;
+
+	return (T) 0.5 * (
+		(T)1 + z + (sin((T)M_PI * z) * (T)M_1_PI));
+}
+
+template< typename T >
+inline T nse::linear_step(const T x, const T eps)
+{
+	if (x < -eps) return (T)0;
+	if (x > eps) return (T)1;
+
+	return (T) 0.5 * ((T)1 + (x / eps));
+
+}
+// ----------------------------------------------------------------------------------------- //
+
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(T** x, const int n)
+{
+	if (!allocate<mem>(x, n)) return false;
+
+	null<mem>((*x), n);
+	return true;
+}
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(T** x, T** y, const int n)
+{
+	if (!allocate<mem>(x, y, n)) return false;
+
+	null<mem>((*x), n); null<mem>((*y), n);
+	return true;
+}
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(T** x, T** y, const int nx, const int ny)
+{
+	if (!allocate<mem>(x, y, nx, ny)) return false;
+
+	null<mem>((*x), nx); null<mem>((*y), ny);
+	return true;
+}
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(T** x, T** y, T** z, const int n)
+{
+	if (!allocate<mem>(x, y, z, n)) return false;
+
+	null<mem>((*x), n); null<mem>((*y), n); null<mem>((*z), n);
+	return true;
+}
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(T** x, T** y, T** z, const int nx, const int ny, const int nz)
+{
+	if (!allocate<mem>(x, y, z, nx, ny, nz)) return false;
+
+	null<mem>((*x), nx); null<mem>((*y), ny); null<mem>((*z), nz);
+	return true;
+}
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(T** x, T** y, T** z, T** p, const int n)
+{
+	if (!allocate<mem>(x, y, z, p, n)) return false;
+
+	null<mem>((*x), n); null<mem>((*y), n); 
+	null<mem>((*z), n); null<mem>((*p), n);
+	return true;
+}
+
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(T** x, T** y, T** z, T** p, T** q, const int n)
+{
+	if (!allocate<mem>(x, y, z, p, q, n)) return false;
+
+	null<mem>((*x), n); null<mem>((*y), n);
+	null<mem>((*z), n); null<mem>((*p), n); null<mem>((*q), n);
+	return true;
+}
+
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(T** x, T** y, T** z, T** p, T** q, T** s, const int n)
+{
+	if (!allocate<mem>(x, y, z, p, q, s, n)) return false;
+
+	null<mem>((*x), n); null<mem>((*y), n); null<mem>((*z), n); 
+	null<mem>((*p), n); null<mem>((*q), n); null<mem>((*s), n);
+	return true;
+}
+
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(T** x, T** y, T** z, 
+	T** p, T** q, T** s, T** u, const int n)
+{
+	if (!allocate<mem>(x, y, z, p, q, s, u, n)) return false;
+
+	null<mem>((*x), n); null<mem>((*y), n); null<mem>((*z), n);
+	null<mem>((*p), n); null<mem>((*q), n); null<mem>((*s), n);
+	null<mem>((*u), n);
+	return true;
+}
+
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(T** x, T** y, T** z,
+	T** p, T** q, T** s, T** u, T** v, const int n)
+{
+	if (!allocate<mem>(x, y, z, p, q, s, u, v, n)) return false;
+
+	null<mem>((*x), n); null<mem>((*y), n); null<mem>((*z), n);
+	null<mem>((*p), n); null<mem>((*q), n); null<mem>((*s), n);
+	null<mem>((*u), n); null<mem>((*v), n);
+	return true;
+}
+
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(T** x, T** y, T** z,
+	T** p, T** q, T** s, T** u, T** v, T** w, const int n)
+{
+	if (!allocate<mem>(x, y, z, p, q, s, u, v, w, n)) return false;
+
+	null<mem>((*x), n); null<mem>((*y), n); null<mem>((*z), n);
+	null<mem>((*p), n); null<mem>((*q), n); null<mem>((*s), n);
+	null<mem>((*u), n); null<mem>((*v), n); null<mem>((*w), n);
+	return true;
+}
+
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(
+	T** x, T** y, T** z, T** p, T** q, T** s, 
+	T** u, T** v, T** w, T** a, const int n)
+{
+	if (!allocate<mem>(x, y, z, p, q, s, u, v, w, a, n)) return false;
+
+	null<mem>((*x), n); null<mem>((*y), n); null<mem>((*z), n);
+	null<mem>((*p), n); null<mem>((*q), n); null<mem>((*s), n);
+	null<mem>((*u), n); null<mem>((*v), n); null<mem>((*w), n);
+	null<mem>((*a), n);
+	return true;
+}
+
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(
+	T** x, T** y, T** z, T** p, T** q, T** s,
+	T** u, T** v, T** w, T** a, T** b, const int n)
+{
+	if (!allocate<mem>(x, y, z, p, q, s, u, v, w, a, b, n)) return false;
+
+	null<mem>((*x), n); null<mem>((*y), n); null<mem>((*z), n);
+	null<mem>((*p), n); null<mem>((*q), n); null<mem>((*s), n);
+	null<mem>((*u), n); null<mem>((*v), n); null<mem>((*w), n);
+	null<mem>((*a), n); null<mem>((*b), n);
+	return true;
+}
+
+template< nse::memType mem, typename T >
+inline bool nse::allocate_vnull(
+	T** x, T** y, T** z, T** p, T** q, T** s,
+	T** u, T** v, T** w, T** a, T** b, T** c, const int n)
+{
+	if (!allocate<mem>(x, y, z, p, q, s, u, v, w, a, b, c, n)) return false;
+
+	null<mem>((*x), n); null<mem>((*y), n); null<mem>((*z), n);
+	null<mem>((*p), n); null<mem>((*q), n); null<mem>((*s), n);
+	null<mem>((*u), n); null<mem>((*v), n); null<mem>((*w), n);
+	null<mem>((*a), n); null<mem>((*b), n); null<mem>((*c), n);
+	return true;
+}
+// ----------------------------------------------------------------------------------------- //
+
template< nse::memType mem, typename T >
inline T nse::min(
	const T* _RESTRICT const x, const int n)
{
	// minimum element of x[0..n); returns (T)0 for empty input
	if (n <= 0) return (T)0;

#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU) return nse_gpu::min(x, n);
	else
#endif
	{	// memCPU //
		int i;
		T _min = (T)(*x);

#if defined(USE_OPENMP20_IN_MINMAX) && !defined(USE_AS_OPENMP31)
		// OpenMP 2.0 path: per-thread local minimum merged under a critical section
		T _min_local = (T)(*x);

#pragma omp parallel firstprivate(_min_local) private(i) shared(_min)
		{
#pragma omp for nowait
			for (i = 1; i < n; i++) {
				if (x[i] < _min_local) _min_local = x[i];
			}

#pragma omp critical
			{
				if (_min_local < _min) _min = _min_local;
			}
		}

#else
#ifdef USE_AS_OPENMP31
		// OpenMP >= 3.1 supports the 'min' reduction operator directly
#pragma omp parallel for private(i) reduction(min:_min)
#endif
		for (i = 1; i < n; i++) {
			if (x[i] < _min) _min = x[i];
		}

#endif

		return _min;
	}
}
+
template< nse::memType mem, typename T >
inline T nse::max(
	const T* _RESTRICT const x, const int n)
{
	// maximum element of x[0..n); returns (T)0 for empty input
	if (n <= 0) return (T)0;

#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU) return nse_gpu::max(x, n);
	else
#endif
	{	// memCPU //
		int i;
		T _max = (T)(*x);

#if defined(USE_OPENMP20_IN_MINMAX) && !defined(USE_AS_OPENMP31)
		// OpenMP 2.0 path: per-thread local maximum merged under a critical section
		T _max_local = (T)(*x);

#pragma omp parallel firstprivate(_max_local) private(i) shared(_max)
		{
#pragma omp for nowait
			for (i = 1; i < n; i++) {
				if (x[i] > _max_local) _max_local = x[i];
			}

#pragma omp critical
			{
				if (_max_local > _max) _max = _max_local;
			}
		}
#else
#ifdef USE_AS_OPENMP31
		// OpenMP >= 3.1 supports the 'max' reduction operator directly
#pragma omp parallel for private(i) reduction(max:_max)
#endif
		for (i = 1; i < n; i++)
			if (x[i] > _max) _max = x[i];

#endif

		return _max;
	}
}
+
+template< nse::memType mem, typename T >
+inline T nse::sum(
+	const T* _RESTRICT const x, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) return nse_gpu::sum(x, n);
+	else
+#endif
+	{	// memCPU //
+		int i;
+		T _sum = (T)0;
+
+#pragma omp parallel for private( i ) reduction( + : _sum )
+		for (i = 0; i < n - (n % 4); i += 4) {
+			_sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];
+		}
+
+		for (i = n - (n % 4); i < n; i++)
+			_sum += x[i];
+
+		return _sum;
+	}
+}
+
+template< nse::memType mem, typename T >
+inline T nse::dot_product(
+	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) return nse_gpu::dot_product(x, y, n);
+	else
+#endif
+	{	// memCPU //
+		int i;
+		T _dp = (T)0;
+
+#pragma omp parallel for private( i ) reduction( + : _dp )
+		for (i = 0; i < n - (n % 4); i += 4) {
+			_dp += x[i] * y[i] + x[i + 1] * y[i + 1] +
+				x[i + 2] * y[i + 2] + x[i + 3] * y[i + 3];
+		}
+
+		for (i = n - (n % 4); i < n; i++)
+			_dp += x[i] * y[i];
+		return _dp;
+	}
+}
+
+template< typename T, typename CType >
+inline T nse::dot_product_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const CType* _RESTRICT const mask, const CType check, const int n)
+{
+	int i;
+	T _dp = (T)0;
+
+#pragma omp parallel for private( i ) reduction( + : _dp )
+	for (i = 0; i < n - (n % 4); i += 4) {
+		if (mask[i] == check) _dp += x[i] * y[i];
+		if (mask[i + 1] == check) _dp += x[i + 1] * y[i + 1];
+		if (mask[i + 2] == check) _dp += x[i + 2] * y[i + 2];
+		if (mask[i + 3] == check) _dp += x[i + 3] * y[i + 3];
+	}
+
+	for (i = n - (n % 4); i < n; i++)
+		if (mask[i] == check) _dp += x[i] * y[i];
+	return _dp;
+}
+
+template< nse::memType mem, typename T >
+inline T nse::sqr_sum(
+	const T* _RESTRICT const x, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) return nse_gpu::sqr_sum(x, n);
+	else
+#endif
+	{	// memCPU //
+		int i;
+		T _sum = (T)0;
+
+#pragma omp parallel for private( i ) reduction( + : _sum )
+		for (i = 0; i < n - (n % 4); i += 4) {
+			_sum += x[i] * x[i] + x[i + 1] * x[i + 1] +
+				x[i + 2] * x[i + 2] + x[i + 3] * x[i + 3];
+		}
+
+		for (i = n - (n % 4); i < n; i++)
+			_sum += x[i] * x[i];
+
+		return _sum;
+	}
+}
+
+template< typename T, typename CType >
+inline T nse::sqr_sum_ifeq(const T* _RESTRICT const x,
+	const CType* _RESTRICT const mask, const CType check, const int n)
+{
+	int i;
+	T _sum = (T)0;
+
+#pragma omp parallel for private( i ) reduction( + : _sum )
+	for (i = 0; i < n - (n % 4); i += 4) {
+		if (mask[i] == check) _sum += x[i] * x[i];
+		if (mask[i + 1] == check) _sum += x[i + 1] * x[i + 1];
+		if (mask[i + 2] == check) _sum += x[i + 2] * x[i + 2];
+		if (mask[i + 3] == check) _sum += x[i + 3] * x[i + 3];
+	}
+
+	for (i = n - (n % 4); i < n; i++)
+		if (mask[i] == check) _sum += x[i] * x[i];
+
+	return _sum;
+}
+
+template< nse::memType mem, typename T >
+inline void nse::sqr_sum_and_dp(
+	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+	T* _RESTRICT sum, T* _RESTRICT dp)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::sqr_sum_and_dp(x, y, n, sum, dp);
+	else
+#endif
+	{	// memCPU //
+		int i;
+		T _dp = (T)0, _sum = (T)0;
+
+#pragma omp parallel for private( i ) reduction( + : _dp, _sum )
+		for (i = 0; i < n - (n % 4); i += 4) {
+
+			_sum += x[i] * x[i] + x[i + 1] * x[i + 1] +
+				x[i + 2] * x[i + 2] + x[i + 3] * x[i + 3];
+
+			_dp += x[i] * y[i] + x[i + 1] * y[i + 1] +
+				x[i + 2] * y[i + 2] + x[i + 3] * y[i + 3];
+		}
+
+		for (i = n - (n % 4); i < n; i++) {
+			_sum += x[i] * x[i];
+			_dp += x[i] * y[i];
+		}
+
+		(*dp) = _dp;
+		(*sum) = _sum;
+	}
+}
+
+template< typename T, typename CType >
+inline void nse::sqr_sum_and_dp_ifeq(
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	T* _RESTRICT sum, T* _RESTRICT dp)
+{
+	int i;
+	T _dp = (T)0, _sum = (T)0;
+
+#pragma omp parallel for private( i ) reduction( + : _dp, _sum )
+	for (i = 0; i < n - (n % 4); i += 4) {
+
+		if (mask[i] == check) {
+			_sum += x[i] * x[i];
+			_dp += x[i] * y[i];
+		}
+		if (mask[i + 1] == check) {
+			_sum += x[i + 1] * x[i + 1];
+			_dp += x[i + 1] * y[i + 1];
+		}
+		if (mask[i + 2] == check) {
+			_sum += x[i + 2] * x[i + 2];
+			_dp += x[i + 2] * y[i + 2];
+		}
+		if (mask[i + 3] == check) {
+			_sum += x[i + 3] * x[i + 3];
+			_dp += x[i + 3] * y[i + 3];
+		}
+	}
+
+	for (i = n - (n % 4); i < n; i++) {
+		if (mask[i] == check) {
+			_sum += x[i] * x[i];
+			_dp += x[i] * y[i];
+		}
+	}
+
+	(*dp) = _dp;
+	(*sum) = _sum;
+}
+
+template< nse::memType mem, typename T >
+inline T nse::lnorm(
+	const T* _RESTRICT const x, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) return nse_gpu::lnorm(x, n);
+	else
+#endif
+	{	// memCPU //
+		int i;
+		T _norm = (T)0;
+
+#pragma omp parallel for private( i ) reduction( + : _norm )
+		for (i = 0; i < n - (n % 4); i += 4) {
+			_norm += x[i] * x[i] + x[i + 1] * x[i + 1] +
+				x[i + 2] * x[i + 2] + x[i + 3] * x[i + 3];
+		}
+
+		for (i = n - (n % 4); i < n; i++)
+			_norm += x[i] * x[i];
+
+		return sqrt(_norm);
+	}
+}
+
+template< typename T, typename CType >
+inline T nse::lnorm_ifeq(const T* _RESTRICT const x,
+	const CType* _RESTRICT const mask, const CType check, const int n)
+{
+	int i;
+	T _norm = (T)0;
+
+#pragma omp parallel for private( i ) reduction( + : _norm )
+	for (i = 0; i < n - (n % 4); i += 4) {
+		if (mask[i] == check) _norm += x[i] * x[i];
+		if (mask[i + 1] == check) _norm += x[i + 1] * x[i + 1];
+		if (mask[i + 2] == check) _norm += x[i + 2] * x[i + 2];
+		if (mask[i + 3] == check) _norm += x[i + 3] * x[i + 3];
+	}
+
+	for (i = n - (n % 4); i < n; i++)
+		if (mask[i] == check) _norm += x[i] * x[i];
+
+	return sqrt(_norm);
+}
+
+template< nse::memType mem, typename T >
+inline void nse::lnorm_and_dp(
+	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
+	T* _RESTRICT norm, T* _RESTRICT dp)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::lnorm_and_dp(x, y, n, norm, dp);
+	else
+#endif
+	{	// memCPU //
+		int i;
+		T _dp = (T)0, _norm = (T)0;
+
+#pragma omp parallel for private( i ) reduction( + : _dp, _norm )
+		for (i = 0; i < n - (n % 4); i += 4) {
+
+			_norm += x[i] * x[i] + x[i + 1] * x[i + 1] +
+				x[i + 2] * x[i + 2] + x[i + 3] * x[i + 3];
+
+			_dp += x[i] * y[i] + x[i + 1] * y[i + 1] +
+				x[i + 2] * y[i + 2] + x[i + 3] * y[i + 3];
+		}
+
+		for (i = n - (n % 4); i < n; i++) {
+			_norm += x[i] * x[i];
+			_dp += x[i] * y[i];
+		}
+
+		(*dp) = _dp;
+		(*norm) = sqrt(_norm);
+	}
+}
+
+template< typename T, typename CType >
+inline void nse::lnorm_and_dp_ifeq(
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	T* _RESTRICT norm, T* _RESTRICT dp)
+{
+	int i;
+	T _dp = (T)0, _norm = (T)0;
+
+#pragma omp parallel for private( i ) reduction( + : _dp, _norm )
+	for (i = 0; i < n - (n % 4); i += 4) {
+
+		if (mask[i] == check) {
+			_norm += x[i] * x[i];
+			_dp += x[i] * y[i];
+		}
+		if (mask[i + 1] == check) {
+			_norm += x[i + 1] * x[i + 1];
+			_dp += x[i + 1] * y[i + 1];
+		}
+		if (mask[i + 2] == check) {
+			_norm += x[i + 2] * x[i + 2];
+			_dp += x[i + 2] * y[i + 2];
+		}
+		if (mask[i + 3] == check) {
+			_norm += x[i + 3] * x[i + 3];
+			_dp += x[i + 3] * y[i + 3];
+		}
+	}
+
+	for (i = n - (n % 4); i < n; i++) {
+		if (mask[i] == check) {
+			_norm += x[i] * x[i];
+			_dp += x[i] * y[i];
+		}
+	}
+
+	(*dp) = _dp;
+	(*norm) = sqrt(_norm);
+}
+
+template< nse::memType mem, typename T >
+inline void nse::lnorm_and_sqr_sum(
+	const T* _RESTRICT const x, const int n,
+	T* _RESTRICT norm, T* _RESTRICT sum)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::lnorm_and_sqr_sum(x, n, norm, sum);
+	else
+#endif
+	{	// memCPU //
+		(*sum) = sqr_sum(x, n);
+		(*norm) = sqrt((*sum));
+	}
+}
+
+template< typename T, typename CType >
+inline void nse::lnorm_and_sqr_sum_ifeq(
+	const T* _RESTRICT const x,
+	const CType* _RESTRICT const mask, const CType check, const int n,
+	T* _RESTRICT norm, T* _RESTRICT sum)
+{
+	(*sum) = sqr_sum_ifeq(x, mask, check, n);
+	(*norm) = sqrt((*sum));
+}
+
template< nse::memType mem, typename T >
inline T nse::cnorm(
	const T* _RESTRICT const x, const int n)
{
	// C (max) norm: max |x[i]| over x[0..n)
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU) return nse_gpu::cnorm(x, n);
	else
#endif
	{	// memCPU //
#if defined(USE_OPENMP20_IN_MINMAX) && !defined(USE_AS_OPENMP31)
		// OpenMP 2.0 path: per-thread local maximum merged under a critical section
		int i;
		T _norm = (T)0, _norm_local;

#pragma omp parallel private(_norm_local, i) shared(_norm)
		{
			_norm_local = (T)0;
#pragma omp for nowait
			for (i = 0; i < n; i++) {
				if (fabs(x[i]) > _norm_local) _norm_local = fabs(x[i]);
			}

#pragma omp critical
			{
				if (_norm_local > _norm) _norm = _norm_local;
			}
		}

		return _norm;
#else
		// 4-way unrolled path; OpenMP >= 3.1 uses the 'max' reduction operator
		int i;
		T _norm1 = (T)0, _norm2 = (T)0,
			_norm3 = (T)0, _norm4 = (T)0;

#ifdef USE_AS_OPENMP31
#pragma omp parallel for private(i) reduction(max:_norm1,_norm2,_norm3,_norm4)
#endif
		for (i = 0; i < n - (n % 4); i += 4) {

			if (fabs(x[i]) > _norm1) _norm1 = fabs(x[i]);
			if (fabs(x[i + 1]) > _norm2) _norm2 = fabs(x[i + 1]);
			if (fabs(x[i + 2]) > _norm3) _norm3 = fabs(x[i + 2]);
			if (fabs(x[i + 3]) > _norm4) _norm4 = fabs(x[i + 3]);
		}

		// tail goes into _norm1; the four partial maxima are merged below
		for (i = n - (n % 4); i < n; i++) {
			if (fabs(x[i]) > _norm1) _norm1 = fabs(x[i]);
		}

		if (_norm1 < _norm3) _norm1 = _norm3;
		if (_norm2 < _norm4) _norm2 = _norm4;

		return (_norm1 > _norm2) ? _norm1 : _norm2;
#endif
	}
}
+
template< typename T, typename CType >
inline T nse::cnorm_ifeq(const T* _RESTRICT const x,
	const CType* _RESTRICT const mask, const CType check, const int n)
{
	// masked C (max) norm: max |x[i]| over entries with mask[i] == check
#if defined(USE_OPENMP20_IN_MINMAX) && !defined(USE_AS_OPENMP31)
	// OpenMP 2.0 path: per-thread local maximum merged under a critical section
	int i;
	T _norm = (T)0, _norm_local;

#pragma omp parallel private(_norm_local, i) shared(_norm)
	{
		_norm_local = (T)0;
#pragma omp for nowait
		for (i = 0; i < n; i++) {
			if (mask[i] == check) if (fabs(x[i]) > _norm_local) _norm_local = fabs(x[i]);
		}

#pragma omp critical
		{
			if (_norm_local > _norm) _norm = _norm_local;
		}
	}

	return _norm;
#else
	// 4-way unrolled path; OpenMP >= 3.1 uses the 'max' reduction operator
	int i;
	T _norm1 = (T)0, _norm2 = (T)0,
		_norm3 = (T)0, _norm4 = (T)0;

#ifdef USE_AS_OPENMP31
#pragma omp parallel for private(i) reduction(max:_norm1,_norm2,_norm3,_norm4)
#endif
	for (i = 0; i < n - (n % 4); i += 4) {

		if (mask[i] == check) if (fabs(x[i]) > _norm1) _norm1 = fabs(x[i]);
		if (mask[i + 1] == check) if (fabs(x[i + 1]) > _norm2) _norm2 = fabs(x[i + 1]);
		if (mask[i + 2] == check) if (fabs(x[i + 2]) > _norm3) _norm3 = fabs(x[i + 2]);
		if (mask[i + 3] == check) if (fabs(x[i + 3]) > _norm4) _norm4 = fabs(x[i + 3]);
	}

	// tail goes into _norm1; the four partial maxima are merged below
	for (i = n - (n % 4); i < n; i++) {
		if (mask[i] == check) if (fabs(x[i]) > _norm1) _norm1 = fabs(x[i]);
	}

	if (_norm1 < _norm3) _norm1 = _norm3;
	if (_norm2 < _norm4) _norm2 = _norm4;

	return (_norm1 > _norm2) ? _norm1 : _norm2;
#endif
}
+
template< nse::memType mem, typename T >
inline void nse::cnorm_and_dp(
	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n,
	T* _RESTRICT norm, T* _RESTRICT dp)
{
	// single pass computing (*norm) = max |x[i]| and (*dp) = dot(x, y)
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU)
		nse_gpu::cnorm_and_dp(x, y, n, norm, dp);
	else
#endif
	{	// memCPU //
#if defined(USE_OPENMP20_IN_MINMAX) && !defined(USE_AS_OPENMP31)
		// OpenMP 2.0 path: '+' reduction for the dot product,
		// critical-section merge for the maximum
		int i;
		T _norm = (T)0, _dp = (T)0, _norm_local;

#pragma omp parallel private(_norm_local, i) shared(_norm, _dp)
		{
			_norm_local = (T)0;
#pragma omp for reduction(+:_dp) nowait
			for (i = 0; i < n; i++) {
				_dp += x[i] * y[i];
				if (fabs(x[i]) > _norm_local) _norm_local = fabs(x[i]);
			}

#pragma omp critical
			{
				if (_norm_local > _norm) _norm = _norm_local;
			}
		}

		(*norm) = _norm;
		(*dp) = _dp;
#else
		// 4-way unrolled path; OpenMP >= 3.1 uses 'max' + '+' reductions
		int i;
		T _dp = (T)0,
			_norm1 = (T)0, _norm2 = (T)0,
			_norm3 = (T)0, _norm4 = (T)0;

#ifdef USE_AS_OPENMP31
#pragma omp parallel for private(i) reduction(max:_norm1,_norm2,_norm3,_norm4) reduction(+:_dp)
#endif
		for (i = 0; i < n - (n % 4); i += 4) {

			_dp += x[i] * y[i] + x[i + 1] * y[i + 1] +
				x[i + 2] * y[i + 2] + x[i + 3] * y[i + 3];

			if (fabs(x[i]) > _norm1) _norm1 = fabs(x[i]);
			if (fabs(x[i + 1]) > _norm2) _norm2 = fabs(x[i + 1]);
			if (fabs(x[i + 2]) > _norm3) _norm3 = fabs(x[i + 2]);
			if (fabs(x[i + 3]) > _norm4) _norm4 = fabs(x[i + 3]);
		}

		// tail goes into _norm1; the four partial maxima are merged below
		for (i = n - (n % 4); i < n; i++) {
			_dp += x[i] * y[i];
			if (fabs(x[i]) > _norm1) _norm1 = fabs(x[i]);
		}

		if (_norm1 < _norm3) _norm1 = _norm3;
		if (_norm2 < _norm4) _norm2 = _norm4;

		(*norm) = (_norm1 > _norm2) ? _norm1 : _norm2;
		(*dp) = _dp;
#endif
	}
}
+
template< typename T, typename CType >
inline void nse::cnorm_and_dp_ifeq(const T* _RESTRICT const x, const T* _RESTRICT const y,
	const CType* _RESTRICT const mask, const CType check, const int n,
	T* _RESTRICT norm, T* _RESTRICT dp)
{
	// masked single pass: (*norm) = max |x[i]| and (*dp) = dot(x, y)
	// over entries with mask[i] == check
#if defined(USE_OPENMP20_IN_MINMAX) && !defined(USE_AS_OPENMP31)
	// OpenMP 2.0 path: '+' reduction for the dot product,
	// critical-section merge for the maximum
	int i;
	T _norm = (T)0, _dp = (T)0, _norm_local;

#pragma omp parallel private(_norm_local, i) shared(_norm, _dp)
	{
		_norm_local = (T)0;
#pragma omp for reduction(+:_dp) nowait
		for (i = 0; i < n; i++) {
			if (mask[i] == check) {
				_dp += x[i] * y[i];
				if (fabs(x[i]) > _norm_local) _norm_local = fabs(x[i]);
			}
		}

#pragma omp critical
		{
			if (_norm_local > _norm) _norm = _norm_local;
		}
	}

	(*norm) = _norm;
	(*dp) = _dp;
#else
	// 4-way unrolled path; OpenMP >= 3.1 uses 'max' + '+' reductions
	int i;
	T _dp = (T)0,
		_norm1 = (T)0, _norm2 = (T)0,
		_norm3 = (T)0, _norm4 = (T)0;

#ifdef USE_AS_OPENMP31
#pragma omp parallel for private(i) reduction(max:_norm1,_norm2,_norm3,_norm4) reduction(+:_dp)
#endif
	for (i = 0; i < n - (n % 4); i += 4) {

		if (mask[i] == check) {
			_dp += x[i] * y[i];
			if (fabs(x[i]) > _norm1) _norm1 = fabs(x[i]);
		}
		
		if (mask[i + 1] == check) {
			_dp += x[i + 1] * y[i + 1];
			if (fabs(x[i + 1]) > _norm2) _norm2 = fabs(x[i + 1]);
		}
		
		if (mask[i + 2] == check) {
			_dp += x[i + 2] * y[i + 2];
			if (fabs(x[i + 2]) > _norm3) _norm3 = fabs(x[i + 2]);
		}
		
		if (mask[i + 3] == check) {
			_dp += x[i + 3] * y[i + 3];
			if (fabs(x[i + 3]) > _norm4) _norm4 = fabs(x[i + 3]);
		}
	}

	// tail goes into _norm1; the four partial maxima are merged below
	for (i = n - (n % 4); i < n; i++) {
		if (mask[i] == check) {
			_dp += x[i] * y[i];
			if (fabs(x[i]) > _norm1) _norm1 = fabs(x[i]);
		}
	}

	if (_norm1 < _norm3) _norm1 = _norm3;
	if (_norm2 < _norm4) _norm2 = _norm4;

	(*norm) = (_norm1 > _norm2) ? _norm1 : _norm2;
	(*dp) = _dp;
#endif
}
+
template< nse::memType mem, typename T >
inline void nse::cnorm_and_sqr_sum(
	const T* _RESTRICT const x, const int n,
	T* _RESTRICT norm, T* _RESTRICT sum)
{
	// single pass computing (*norm) = max |x[i]| and (*sum) = sum(x[i]^2)
#ifndef EXCLUDE_GPU_BRANCH
	if (mem == memGPU)
		nse_gpu::cnorm_and_sqr_sum(x, n, norm, sum);
	else
#endif
	{	// memCPU //
#if defined(USE_OPENMP20_IN_MINMAX) && !defined(USE_AS_OPENMP31)
		// OpenMP 2.0 path: '+' reduction for the sum,
		// critical-section merge for the maximum
		int i;
		T _norm = (T)0, _sum = (T)0, _norm_local;

#pragma omp parallel private(_norm_local, i) shared(_norm, _sum)
		{
			_norm_local = (T)0;
#pragma omp for reduction(+:_sum) nowait
			for (i = 0; i < n; i++) {
				_sum += x[i] * x[i];
				if (fabs(x[i]) > _norm_local) _norm_local = fabs(x[i]);
			}

#pragma omp critical
			{
				if (_norm_local > _norm) _norm = _norm_local;
			}
		}

		(*norm) = _norm;
		(*sum) = _sum;
#else
		// 4-way unrolled path; OpenMP >= 3.1 uses 'max' + '+' reductions
		int i;
		T _sum = (T)0,
			_norm1 = (T)0, _norm2 = (T)0,
			_norm3 = (T)0, _norm4 = (T)0;

#ifdef USE_AS_OPENMP31
#pragma omp parallel for private(i) reduction(max:_norm1,_norm2,_norm3,_norm4) reduction(+:_sum)
#endif
		for (i = 0; i < n - (n % 4); i += 4) {

			_sum += x[i] * x[i] + x[i + 1] * x[i + 1] +
				x[i + 2] * x[i + 2] + x[i + 3] * x[i + 3];

			if (fabs(x[i]) > _norm1) _norm1 = fabs(x[i]);
			if (fabs(x[i + 1]) > _norm2) _norm2 = fabs(x[i + 1]);
			if (fabs(x[i + 2]) > _norm3) _norm3 = fabs(x[i + 2]);
			if (fabs(x[i + 3]) > _norm4) _norm4 = fabs(x[i + 3]);
		}

		// tail goes into _norm1; the four partial maxima are merged below
		for (i = n - (n % 4); i < n; i++) {
			_sum += x[i] * x[i];
			if (fabs(x[i]) > _norm1) _norm1 = fabs(x[i]);
		}

		if (_norm1 < _norm3) _norm1 = _norm3;
		if (_norm2 < _norm4) _norm2 = _norm4;

		(*norm) = (_norm1 > _norm2) ? _norm1 : _norm2;
		(*sum) = _sum;
#endif
	}
}
+
template< typename T, typename CType >
inline void nse::cnorm_and_sqr_sum_ifeq(const T* _RESTRICT const x,
	const CType* _RESTRICT const mask, const CType check, const int n,
	T* _RESTRICT norm, T* _RESTRICT sum)
{
	// masked single pass: (*norm) = max |x[i]| and (*sum) = sum(x[i]^2)
	// over entries with mask[i] == check
#if defined(USE_OPENMP20_IN_MINMAX) && !defined(USE_AS_OPENMP31)
	// OpenMP 2.0 path: '+' reduction for the sum,
	// critical-section merge for the maximum
	int i;
	T _norm = (T)0, _sum = (T)0, _norm_local;

#pragma omp parallel private(_norm_local, i) shared(_norm, _sum)
	{
		_norm_local = (T)0;
#pragma omp for reduction(+:_sum) nowait
		for (i = 0; i < n; i++) {
			if (mask[i] == check) {
				_sum += x[i] * x[i];
				if (fabs(x[i]) > _norm_local) _norm_local = fabs(x[i]);
			}
		}

#pragma omp critical
		{
			if (_norm_local > _norm) _norm = _norm_local;
		}
	}

	(*norm) = _norm;
	(*sum) = _sum;
#else
	// 4-way unrolled path; OpenMP >= 3.1 uses 'max' + '+' reductions
	int i;
	T _sum = (T)0,
		_norm1 = (T)0, _norm2 = (T)0,
		_norm3 = (T)0, _norm4 = (T)0;

#ifdef USE_AS_OPENMP31
#pragma omp parallel for private(i) reduction(max:_norm1,_norm2,_norm3,_norm4) reduction(+:_sum)
#endif
	for (i = 0; i < n - (n % 4); i += 4) {

		if (mask[i] == check) {
			_sum += x[i] * x[i];
			if (fabs(x[i]) > _norm1) _norm1 = fabs(x[i]);
		}

		if (mask[i + 1] == check) {
			_sum += x[i + 1] * x[i + 1];
			if (fabs(x[i + 1]) > _norm2) _norm2 = fabs(x[i + 1]);
		}

		if (mask[i + 2] == check) {
			_sum += x[i + 2] * x[i + 2];
			if (fabs(x[i + 2]) > _norm3) _norm3 = fabs(x[i + 2]);
		}

		if (mask[i + 3] == check) {
			_sum += x[i + 3] * x[i + 3];
			if (fabs(x[i + 3]) > _norm4) _norm4 = fabs(x[i + 3]);
		}
	}

	// tail goes into _norm1; the four partial maxima are merged below
	for (i = n - (n % 4); i < n; i++) {
		if (mask[i] == check) {
			_sum += x[i] * x[i];
			if (fabs(x[i]) > _norm1) _norm1 = fabs(x[i]);
		}
	}

	if (_norm1 < _norm3) _norm1 = _norm3;
	if (_norm2 < _norm4) _norm2 = _norm4;

	(*norm) = (_norm1 > _norm2) ? _norm1 : _norm2;
	(*sum) = _sum;
#endif
}
+
+// l1norm(): returns sum of |x[i]|, i = 0..n-1 (L1 vector norm);
+// dispatches to the GPU implementation when mem == memGPU
+template< nse::memType mem, typename T >
+inline T nse::l1norm(
+	const T* _RESTRICT const x, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) return nse_gpu::l1norm(x, n);
+	else
+#endif
+	{	// memCPU //
+		int i;
+		T _norm = (T)0;
+
+		// main loop unrolled by 4; '+' reduction combines per-thread
+		// partial sums (floating-point summation order is not fixed)
+#pragma omp parallel for private( i ) reduction( + : _norm )
+		for (i = 0; i < n - (n % 4); i += 4) {
+			_norm += fabs(x[i]) +
+				fabs(x[i + 1]) +
+				fabs(x[i + 2]) +
+				fabs(x[i + 3]);
+		}
+
+		// serial remainder: last n % 4 elements
+		for (i = n - (n % 4); i < n; i++)
+			_norm += fabs(x[i]);
+
+		return _norm;
+	}
+}
+// ----------------------------------------------------------------------------------------- //
+
+// null(): set x[i] = 0 for i = 0..n-1 (GPU dispatch when mem == memGPU)
+template< nse::memType mem, typename T >
+inline void nse::null(
+	T* _RESTRICT x, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) nse_gpu::null(x, n);
+	else
+#endif
+	{	// memCPU //
+#pragma omp parallel shared( x )
+		{
+			int i;
+
+			// unrolled-by-4 main loop, iterations split across threads
+#pragma omp for nowait
+			for (i = 0; i < n - (n % 4); i += 4) {
+				x[i] = (T)0;
+				x[i + 1] = (T)0;
+				x[i + 2] = (T)0;
+				x[i + 3] = (T)0;
+			}
+
+			// remainder (last n % 4 elements) handled by one thread;
+			// index ranges are disjoint, so nowait introduces no race
+#pragma omp single nowait
+			for (i = n - (n % 4); i < n; i++)
+				x[i] = (T)0;
+		}
+	}
+}
+
+// null_ifeq(): set x[i] = 0 only where mask[i] == check (CPU only)
+template< typename T, typename CType >
+inline void nse::null_ifeq(T* _RESTRICT x,
+	const CType* _RESTRICT const mask, const CType check, const int n)
+{
+#pragma omp parallel shared( x )
+	{
+		int i;
+
+#pragma omp for nowait
+		for (i = 0; i < n - (n % 4); i += 4) {
+			if (mask[i] == check) x[i] = (T)0;
+			if (mask[i + 1] == check) x[i + 1] = (T)0;
+			if (mask[i + 2] == check) x[i + 2] = (T)0;
+			if (mask[i + 3] == check) x[i + 3] = (T)0;
+		}
+
+		// disjoint remainder, safe to overlap with the loop above
+#pragma omp single nowait
+		for (i = n - (n % 4); i < n; i++)
+			if (mask[i] == check) x[i] = (T)0;
+	}
+}
+
+// update(): x[i] += alpha — add a scalar to every element
+template< nse::memType mem, typename T >
+inline void nse::update(
+	T* _RESTRICT x,
+	const T alpha, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) nse_gpu::update(x, alpha, n);
+	else
+#endif
+	{	// memCPU //
+#pragma omp parallel shared( x )
+		{
+			int i;
+
+			// unrolled-by-4 main loop across threads
+#pragma omp for nowait
+			for (i = 0; i < n - (n % 4); i += 4) {
+				x[i] += alpha;
+				x[i + 1] += alpha;
+				x[i + 2] += alpha;
+				x[i + 3] += alpha;
+			}
+
+			// remainder on one thread; disjoint indices, no race
+#pragma omp single nowait
+			for (i = n - (n % 4); i < n; i++)
+				x[i] += alpha;
+		}
+	}
+}
+
+// update_ifeq(): x[i] += alpha only where mask[i] == check (CPU only)
+template< typename T, typename CType >
+inline void nse::update_ifeq(T* _RESTRICT x,
+	const T alpha,
+	const CType* _RESTRICT const mask, const CType check, const int n)
+{
+#pragma omp parallel shared( x )
+	{
+		int i;
+
+#pragma omp for nowait
+		for (i = 0; i < n - (n % 4); i += 4) {
+			if (mask[i] == check) x[i] += alpha;
+			if (mask[i + 1] == check) x[i + 1] += alpha;
+			if (mask[i + 2] == check) x[i + 2] += alpha;
+			if (mask[i + 3] == check) x[i + 3] += alpha;
+		}
+
+#pragma omp single nowait
+		for (i = n - (n % 4); i < n; i++)
+			if (mask[i] == check) x[i] += alpha;
+	}
+}
+
+// update(): x[i] += y[i] — element-wise accumulate
+template< nse::memType mem, typename T >
+inline void nse::update(
+	T* _RESTRICT x,
+	const T* _RESTRICT const y, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) nse_gpu::update(x, y, n);
+	else
+#endif
+	{	// memCPU //
+#pragma omp parallel shared( x )
+		{
+			int i;
+
+			// unrolled-by-4 main loop across threads
+#pragma omp for nowait
+			for (i = 0; i < n - (n % 4); i += 4) {
+				x[i] += y[i];
+				x[i + 1] += y[i + 1];
+				x[i + 2] += y[i + 2];
+				x[i + 3] += y[i + 3];
+			}
+
+			// remainder on one thread; disjoint indices, no race
+#pragma omp single nowait
+			for (i = n - (n % 4); i < n; i++)
+				x[i] += y[i];
+		}
+	}
+}
+
+// update(): x[i] += alpha * y[i] (axpy)
+template< nse::memType mem, typename T >
+inline void nse::update(
+	T* _RESTRICT x,
+	const T alpha, const T* _RESTRICT const y, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::update(x, alpha, y, n);
+	else
+#endif
+	{	// memCPU //
+#pragma omp parallel shared( x )
+		{
+			int i;
+
+			// unrolled-by-4 main loop across threads
+#pragma omp for nowait
+			for (i = 0; i < n - (n % 4); i += 4) {
+				x[i] += alpha * y[i];
+				x[i + 1] += alpha * y[i + 1];
+				x[i + 2] += alpha * y[i + 2];
+				x[i + 3] += alpha * y[i + 3];
+			}
+
+			// remainder on one thread; disjoint indices, no race
+#pragma omp single nowait
+			for (i = n - (n % 4); i < n; i++)
+				x[i] += alpha * y[i];
+		}
+	}
+}
+
+// update_ifeq(): x[i] += alpha * y[i] only where mask[i] == check (CPU only)
+template< typename T, typename CType >
+inline void nse::update_ifeq(T* _RESTRICT x,
+	const T alpha, const T* _RESTRICT const y,
+	const CType* _RESTRICT const mask, const CType check, const int n)
+{
+#pragma omp parallel shared( x )
+	{
+		int i;
+
+#pragma omp for nowait
+		for (i = 0; i < n - (n % 4); i += 4) {
+			if (mask[i] == check) x[i] += alpha * y[i];
+			if (mask[i + 1] == check) x[i + 1] += alpha * y[i + 1];
+			if (mask[i + 2] == check) x[i + 2] += alpha * y[i + 2];
+			if (mask[i + 3] == check) x[i + 3] += alpha * y[i + 3];
+		}
+
+#pragma omp single nowait
+		for (i = n - (n % 4); i < n; i++)
+			if (mask[i] == check) x[i] += alpha * y[i];
+	}
+}
+
+// update(): x[i] += alpha * y[i] + beta * z[i] (two-vector axpy)
+template< nse::memType mem, typename T >
+inline void nse::update(
+	T* _RESTRICT x,
+	const T alpha, const T* _RESTRICT const y,
+	const T beta, const T* _RESTRICT const z, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		nse_gpu::update(x, alpha, y, beta, z, n);
+		return;
+	}
+#endif
+
+	// memCPU //
+#pragma omp parallel for shared( x )
+	for (int idx = 0; idx < n; idx++)
+		x[idx] += alpha * y[idx] + beta * z[idx];
+}
+
+// update(): x[i] += alpha * y[i] + beta * z[i] + gamma * w[i]
+template< nse::memType mem, typename T >
+inline void nse::update(
+	T* _RESTRICT x,
+	const T alpha, const T* _RESTRICT const y,
+	const T beta, const T* _RESTRICT const z,
+	const T gamma, const T* _RESTRICT const w, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		nse_gpu::update(x, alpha, y, beta, z, gamma, w, n);
+		return;
+	}
+#endif
+
+	// memCPU //
+#pragma omp parallel for shared( x )
+	for (int idx = 0; idx < n; idx++)
+		x[idx] += alpha * y[idx] + beta * z[idx] + gamma * w[idx];
+}
+
+// update(): fused pair of axpy updates over two independent arrays:
+//   x[i] += alpha * z[i];  y[i] += beta * w[i]
+template< nse::memType mem, typename T >
+inline void nse::update(
+	T* _RESTRICT x, T* _RESTRICT y,
+	const T alpha, const T beta,
+	const T* _RESTRICT const z, const T* _RESTRICT const w, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::update(x, y, alpha, beta, z, w, n);
+	else
+#endif
+	{	// memCPU //
+#pragma omp parallel shared( x, y )
+		{
+			int i;
+
+			// x-update, unrolled by 4; nowait is safe: the y-loops
+			// below touch a different array
+#pragma omp for nowait
+			for (i = 0; i < n - (n % 4); i += 4)
+			{
+				x[i] += alpha * z[i];
+				x[i + 1] += alpha * z[i + 1];
+				x[i + 2] += alpha * z[i + 2];
+				x[i + 3] += alpha * z[i + 3];
+			}
+
+			// x remainder (disjoint index range, race-free)
+#pragma omp single nowait
+			for (i = n - (n % 4); i < n; i++)
+				x[i] += alpha * z[i];
+
+			// y-update, unrolled by 4
+#pragma omp for nowait
+			for (i = 0; i < n - (n % 4); i += 4)
+			{
+				y[i] += beta * w[i];
+				y[i + 1] += beta * w[i + 1];
+				y[i + 2] += beta * w[i + 2];
+				y[i + 3] += beta * w[i + 3];
+			}
+
+			// y remainder
+#pragma omp single nowait
+			for (i = n - (n % 4); i < n; i++)
+				y[i] += beta * w[i];
+		}
+	}
+}
+
+// update_ifeq(): masked pair of axpy updates (CPU only):
+//   where mask[i] == check: x[i] += alpha * z[i];  y[i] += beta * w[i]
+template< typename T, typename CType >
+inline void nse::update_ifeq(T* _RESTRICT x, T* _RESTRICT y,
+	const T alpha, const T beta,
+	const T* _RESTRICT const z, const T* _RESTRICT const w,
+	const CType* _RESTRICT const mask, const CType check, const int n)
+{
+#pragma omp parallel shared( x, y )
+	{
+		int i;
+
+		// x-update, unrolled by 4; nowait safe — y-loops use another array
+#pragma omp for nowait
+		for (i = 0; i < n - (n % 4); i += 4)
+		{
+			if (mask[i] == check) x[i] += alpha * z[i];
+			if (mask[i + 1] == check) x[i + 1] += alpha * z[i + 1];
+			if (mask[i + 2] == check) x[i + 2] += alpha * z[i + 2];
+			if (mask[i + 3] == check) x[i + 3] += alpha * z[i + 3];
+		}
+
+#pragma omp single nowait
+		for (i = n - (n % 4); i < n; i++)
+			if (mask[i] == check) x[i] += alpha * z[i];
+
+		// y-update, unrolled by 4
+#pragma omp for nowait
+		for (i = 0; i < n - (n % 4); i += 4)
+		{
+			if (mask[i] == check) y[i] += beta * w[i];
+			if (mask[i + 1] == check) y[i + 1] += beta * w[i + 1];
+			if (mask[i + 2] == check) y[i + 2] += beta * w[i + 2];
+			if (mask[i + 3] == check) y[i + 3] += beta * w[i + 3];
+		}
+
+#pragma omp single nowait
+		for (i = n - (n % 4); i < n; i++)
+			if (mask[i] == check) y[i] += beta * w[i];
+	}
+}
+
+#ifdef USE_EXPLICIT_SSE
+// update_sse(): SSE version of the dual update x += alpha*z, y += beta*w
+// (float: 4 lanes per iteration); _mm_load_ps/_mm_store_ps require the
+// arrays to be 16-byte aligned — assumed here, TODO confirm at call sites
+inline void nse::update_sse(
+	float* _RESTRICT x, float* _RESTRICT y,
+	const float alpha, const float beta,
+	const float* _RESTRICT const z, const float* _RESTRICT const w, const int n)
+{
+	__m128 m_z, m_w, m_x, m_y;
+	__m128 m_alpha = _mm_set_ps1(alpha);	// alpha broadcast to all 4 lanes
+	__m128 m_beta = _mm_set_ps1(beta);		// beta broadcast to all 4 lanes
+
+	int i;
+
+#pragma omp parallel for private( i, m_z, m_w, m_x, m_y ) shared( x, y, m_alpha, m_beta )
+	for (i = 0; i < n - (n % 4); i += 4) {
+		m_z = _mm_load_ps(&z[i]);
+		m_w = _mm_load_ps(&w[i]);
+		m_x = _mm_load_ps(&x[i]);
+		m_y = _mm_load_ps(&y[i]);
+
+		m_z = _mm_mul_ps(m_z, m_alpha);
+		m_w = _mm_mul_ps(m_w, m_beta);
+
+		m_x = _mm_add_ps(m_x, m_z);
+		m_y = _mm_add_ps(m_y, m_w);
+
+		_mm_store_ps(&x[i], m_x);
+		_mm_store_ps(&y[i], m_y);
+	}
+
+	// scalar remainder: last n % 4 elements
+	for (i = n - (n % 4); i < n; i++) {
+		x[i] += alpha * z[i];
+		y[i] += beta * w[i];
+	}
+}
+
+// update_sse(): SSE2 double-precision variant (2 lanes per iteration);
+// same 16-byte alignment assumption as the float version
+inline void nse::update_sse(
+	double* _RESTRICT x, double* _RESTRICT y,
+	const double alpha, const double beta,
+	const double* _RESTRICT const z, const double* _RESTRICT const w, const int n)
+{
+	__m128d m_z, m_w, m_x, m_y;
+	__m128d m_alpha = _mm_set1_pd(alpha);
+	__m128d m_beta = _mm_set1_pd(beta);
+
+	int i;
+
+#pragma omp parallel for private( i, m_z, m_w, m_x, m_y ) shared( x, y, m_alpha, m_beta )
+	for (i = 0; i < n - (n % 2); i += 2) {
+		m_z = _mm_load_pd(&z[i]);
+		m_w = _mm_load_pd(&w[i]);
+		m_x = _mm_load_pd(&x[i]);
+		m_y = _mm_load_pd(&y[i]);
+
+		m_z = _mm_mul_pd(m_z, m_alpha);
+		m_w = _mm_mul_pd(m_w, m_beta);
+
+		m_x = _mm_add_pd(m_x, m_z);
+		m_y = _mm_add_pd(m_y, m_w);
+
+		_mm_store_pd(&x[i], m_x);
+		_mm_store_pd(&y[i], m_y);
+	}
+
+	// scalar remainder: last n % 2 elements
+	for (i = n - (n % 2); i < n; i++) {
+		x[i] += alpha * z[i];
+		y[i] += beta * w[i];
+	}
+}
+#endif
+
+// assign(): x[i] = alpha — fill with a constant
+template< nse::memType mem, typename T >
+inline void nse::assign(
+	T* _RESTRICT x,
+	const T alpha, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::assign(x, alpha, n);
+	else
+#endif
+	{	// memCPU //
+#pragma omp parallel shared( x )
+		{
+			int i;
+
+			// unrolled-by-4 main loop across threads
+#pragma omp for nowait
+			for (i = 0; i < n - (n % 4); i += 4) {
+				x[i] = alpha;
+				x[i + 1] = alpha;
+				x[i + 2] = alpha;
+				x[i + 3] = alpha;
+			}
+
+			// remainder on one thread; disjoint indices, no race
+#pragma omp single nowait
+			for (i = n - (n % 4); i < n; i++)
+				x[i] = alpha;
+		}
+	}
+}
+
+// assign(): x[i] = alpha * y[i] (scaled copy)
+template< nse::memType mem, typename T >
+inline void nse::assign(
+	T* _RESTRICT x,
+	const T alpha, const T* _RESTRICT const y, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		nse_gpu::assign(x, alpha, y, n);
+		return;
+	}
+#endif
+
+	// memCPU //
+#pragma omp parallel for shared( x )
+	for (int idx = 0; idx < n; idx++)
+		x[idx] = alpha * y[idx];
+}
+
+// assign(): x[i] = alpha * y[i] + beta * z[i]
+template< nse::memType mem, typename T >
+inline void nse::assign(
+	T* _RESTRICT x,
+	const T alpha, const T* _RESTRICT const y,
+	const T beta, const T* _RESTRICT const z, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		nse_gpu::assign(x, alpha, y, beta, z, n);
+		return;
+	}
+#endif
+
+	// memCPU //
+#pragma omp parallel for shared( x )
+	for (int idx = 0; idx < n; idx++)
+		x[idx] = alpha * y[idx] + beta * z[idx];
+}
+
+// assign_ifeq(): two-term assign only where mask[i] == check (CPU only)
+template< typename T, typename CType >
+inline void nse::assign_ifeq(T* _RESTRICT x,
+	const T alpha, const T* _RESTRICT const y,
+	const T beta, const T* _RESTRICT const z,
+	const CType* _RESTRICT const mask, const CType check, const int n)
+{
+#pragma omp parallel for shared( x )
+	for (int idx = 0; idx < n; idx++)
+		if (mask[idx] == check) x[idx] = alpha * y[idx] + beta * z[idx];
+}
+
+// assign(): x[i] = alpha * y[i] + beta * z[i] + gamma * w[i]
+template< nse::memType mem, typename T >
+inline void nse::assign(
+	T* _RESTRICT x,
+	const T alpha, const T* _RESTRICT const y,
+	const T beta, const T* _RESTRICT const z,
+	const T gamma, const T* _RESTRICT const w, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		nse_gpu::assign(x, alpha, y, beta, z, gamma, w, n);
+		return;
+	}
+#endif
+
+	// memCPU //
+#pragma omp parallel for shared( x )
+	for (int idx = 0; idx < n; idx++)
+		x[idx] = alpha * y[idx] + beta * z[idx] + gamma * w[idx];
+}
+
+// assign_ifeq(): three-term assign only where mask[i] == check (CPU only)
+template< typename T, typename CType >
+inline void nse::assign_ifeq(T* _RESTRICT x,
+	const T alpha, const T* _RESTRICT const y,
+	const T beta, const T* _RESTRICT const z,
+	const T gamma, const T* _RESTRICT const w,
+	const CType* _RESTRICT const mask, const CType check, const int n)
+{
+#pragma omp parallel for shared( x )
+	for (int idx = 0; idx < n; idx++)
+		if (mask[idx] == check)
+			x[idx] = alpha * y[idx] + beta * z[idx] + gamma * w[idx];
+}
+
+// assign(): x[i] = alpha * y[i] + beta * z[i] + gamma * w[i] + delta * p[i]
+template< nse::memType mem, typename T >
+inline void nse::assign(
+	T* _RESTRICT x,
+	const T alpha, const T* _RESTRICT const y,
+	const T beta, const T* _RESTRICT const z,
+	const T gamma, const T* _RESTRICT const w,
+	const T delta, const T* _RESTRICT const p, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		nse_gpu::assign(x, alpha, y, beta, z, gamma, w, delta, p, n);
+		return;
+	}
+#endif
+
+	// memCPU //
+#pragma omp parallel for shared( x )
+	for (int idx = 0; idx < n; idx++)
+		x[idx] = alpha * y[idx] + beta * z[idx] + gamma * w[idx] + delta * p[idx];
+}
+
+// vsum(): sum[i] = x[i] + y[i]
+// (CPU-only kernels; the mem template parameter is kept for interface
+//  uniformity with the GPU-dispatching routines but is not used here)
+template< nse::memType mem, typename T >
+inline void nse::vsum(T* _RESTRICT sum,
+	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n)
+{
+#pragma omp parallel for shared( sum )
+	for (int idx = 0; idx < n; idx++)
+		sum[idx] = x[idx] + y[idx];
+}
+
+// vsum(): sum of three vectors
+template< nse::memType mem, typename T >
+inline void nse::vsum(T* _RESTRICT sum,
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const T* _RESTRICT const z, const int n)
+{
+#pragma omp parallel for shared( sum )
+	for (int idx = 0; idx < n; idx++)
+		sum[idx] = x[idx] + y[idx] + z[idx];
+}
+
+// vsum(): sum of four vectors
+template< nse::memType mem, typename T >
+inline void nse::vsum(T* _RESTRICT sum,
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const T* _RESTRICT const z, const T* _RESTRICT const p, const int n)
+{
+#pragma omp parallel for shared( sum )
+	for (int idx = 0; idx < n; idx++)
+		sum[idx] = x[idx] + y[idx] + z[idx] + p[idx];
+}
+
+// vsum(): sum of five vectors
+template< nse::memType mem, typename T >
+inline void nse::vsum(T* _RESTRICT sum,
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const T* _RESTRICT const z, const T* _RESTRICT const p,
+	const T* _RESTRICT const q, const int n)
+{
+#pragma omp parallel for shared( sum )
+	for (int idx = 0; idx < n; idx++)
+		sum[idx] = x[idx] + y[idx] + z[idx] + p[idx] + q[idx];
+}
+
+// vsum(): sum of six vectors
+template< nse::memType mem, typename T >
+inline void nse::vsum(T* _RESTRICT sum,
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const T* _RESTRICT const z, const T* _RESTRICT const p,
+	const T* _RESTRICT const q, const T* _RESTRICT const w, const int n)
+{
+#pragma omp parallel for shared( sum )
+	for (int idx = 0; idx < n; idx++)
+		sum[idx] = x[idx] + y[idx] + z[idx] + p[idx] + q[idx] + w[idx];
+}
+
+// vsum(): sum of seven vectors
+template< nse::memType mem, typename T >
+inline void nse::vsum(T* _RESTRICT sum,
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const T* _RESTRICT const z, const T* _RESTRICT const p,
+	const T* _RESTRICT const q, const T* _RESTRICT const w,
+	const T* _RESTRICT const v, const int n)
+{
+#pragma omp parallel for shared( sum )
+	for (int idx = 0; idx < n; idx++)
+		sum[idx] = x[idx] + y[idx] + z[idx] + p[idx] + q[idx] + w[idx] + v[idx];
+}
+
+// vsub(): sub[i] = x[i] - y[i]
+template< nse::memType mem, typename T >
+inline void nse::vsub(T* _RESTRICT sub,
+	const T* _RESTRICT const x, const T* _RESTRICT const y, const int n)
+{
+#pragma omp parallel for shared( sub )
+	for (int idx = 0; idx < n; idx++)
+		sub[idx] = x[idx] - y[idx];
+}
+
+// mul(): x[i] *= value — in-place scale by a scalar
+template< nse::memType mem, typename T >
+inline void nse::mul(
+	T* _RESTRICT x, const T value, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU)
+		nse_gpu::mul(x, value, n);
+	else
+#endif
+	{	// memCPU //
+#pragma omp parallel shared( x )
+		{
+			int i;
+
+			// unrolled-by-4 main loop across threads
+#pragma omp for nowait
+			for (i = 0; i < n - (n % 4); i += 4) {
+				x[i] *= value;
+				x[i + 1] *= value;
+				x[i + 2] *= value;
+				x[i + 3] *= value;
+			}
+
+			// remainder on one thread; disjoint indices, no race
+#pragma omp single nowait
+			for (i = n - (n % 4); i < n; i++)
+				x[i] *= value;
+		}
+	}
+}
+
+// mul(): y[i] *= x[i] — element-wise in-place product
+template< nse::memType mem, typename T >
+inline void nse::mul(
+	T* _RESTRICT y,
+	const T* _RESTRICT const x, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		nse_gpu::mul(y, x, n);
+		return;
+	}
+#endif
+
+	// memCPU //
+#pragma omp parallel for shared( y )
+	for (int idx = 0; idx < n; idx++)
+		y[idx] *= x[idx];
+}
+
+// mul(): y[i] = x[i] * z[i] — element-wise product
+template< nse::memType mem, typename T >
+inline void nse::mul(
+	T* _RESTRICT y,
+	const T* _RESTRICT const x, const T* _RESTRICT const z, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		nse_gpu::mul(y, x, z, n);
+		return;
+	}
+#endif
+
+	// memCPU //
+#pragma omp parallel for shared( y )
+	for (int idx = 0; idx < n; idx++)
+		y[idx] = x[idx] * z[idx];
+}
+
+
+// vdiv(): y[i] = x[i] / z[i] — element-wise quotient (CPU only;
+// no zero-check on z: divide-by-zero follows IEEE semantics for T)
+template< nse::memType mem, typename T >
+inline void nse::vdiv(
+	T* _RESTRICT y,
+	const T* _RESTRICT const x, const T* _RESTRICT const z, const int n)
+{
+#pragma omp parallel for shared( y )
+	for (int idx = 0; idx < n; idx++)
+		y[idx] = x[idx] / z[idx];
+}
+// ----------------------------------------------------------------------------------------- //
+
+// variance(): varx[i] = sqrx[i] - x[i]^2
+// (variance from the mean x and the mean-of-squares sqrx)
+template< typename T >
+inline void nse::variance(
+	T* _RESTRICT varx,
+	const T* _RESTRICT const x, const T* _RESTRICT const sqrx, const int n)
+{
+#pragma omp parallel for shared( varx )
+	for (int idx = 0; idx < n; idx++)
+		varx[idx] = sqrx[idx] - x[idx] * x[idx];
+}
+
+// deviation(): sigma[i] = sqrt(sqrx[i] - x[i]^2)
+// NOTE: a negative argument (rounding) yields NaN — see deviation_abs()
+template< typename T >
+inline void nse::deviation(
+	T* _RESTRICT sigma,
+	const T* _RESTRICT const x, const T* _RESTRICT const sqrx, const int n)
+{
+#pragma omp parallel for shared( sigma )
+	for (int idx = 0; idx < n; idx++)
+		sigma[idx] = sqrt(sqrx[idx] - x[idx] * x[idx]);
+}
+
+// deviation_abs(): sigma[i] = sqrt(|sqrx[i] - x[i]^2|)
+// (robust to slightly negative variances caused by round-off)
+template< typename T >
+inline void nse::deviation_abs(
+	T* _RESTRICT sigma,
+	const T* _RESTRICT const x, const T* _RESTRICT const sqrx, const int n)
+{
+#pragma omp parallel for shared( sigma )
+	for (int idx = 0; idx < n; idx++)
+		sigma[idx] = sqrt(fabs(sqrx[idx] - x[idx] * x[idx]));
+}
+
+// deviation(): sigma[i] = sqrt(varx[i]) — from a precomputed variance
+template< typename T >
+inline void nse::deviation(
+	T* _RESTRICT sigma,
+	const T* _RESTRICT const varx, const int n)
+{
+#pragma omp parallel for shared( sigma )
+	for (int idx = 0; idx < n; idx++)
+		sigma[idx] = sqrt(varx[idx]);
+}
+
+// max_deviation(): returns max over i of sqrt(sqrx[i] - x[i]^2);
+// 0 for an empty input. NOTE: if sqrx[i] - x[i]^2 < 0 due to round-off,
+// sqrt() yields NaN, which fails the '>' test and is silently skipped.
+template< typename T >
+inline T nse::max_deviation(
+	const T* _RESTRICT const x, const T* _RESTRICT const sqrx, const int n)
+{
+	if (n <= 0) return (T)0;
+
+	int i;
+	T _max = (T)0, sigma;
+
+#if defined(USE_OPENMP20_IN_MINMAX) && !defined(USE_AS_OPENMP31)
+	// OpenMP 2.0 path: no max-reduction clause available, so each thread
+	// keeps a firstprivate running maximum and the results are merged
+	// in a critical section
+	T _max_local = (T)0;
+
+#pragma omp parallel firstprivate(_max_local) private(i, sigma) shared(_max)
+	{
+#pragma omp for nowait
+		for (i = 0; i < n; i++) {
+			sigma = sqrt(sqrx[i] - x[i] * x[i]);
+			if (sigma > _max_local) _max_local = sigma;
+		}
+
+#pragma omp critical
+		{
+			if (_max_local > _max) _max = _max_local;
+		}
+	}
+#else
+
+	// OpenMP 3.1 path: native max reduction (serial loop otherwise)
+#ifdef USE_AS_OPENMP31
+#pragma omp parallel for private(i, sigma) reduction(max:_max)
+#endif
+	for (i = 0; i < n; i++) {
+		sigma = sqrt(sqrx[i] - x[i] * x[i]);
+		if (sigma > _max) _max = sigma;
+	}
+
+#endif
+
+	return _max;
+}
+
+// covariance(): cov[i] = xy[i] - x[i] * y[i]
+// (covariance from means x, y and the mean product xy)
+template< typename T >
+inline void nse::covariance(
+	T* _RESTRICT cov,
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const T* _RESTRICT const xy, const int n)
+{
+#pragma omp parallel for shared( cov )
+	for (int idx = 0; idx < n; idx++)
+		cov[idx] = xy[idx] - x[idx] * y[idx];
+}
+
+// skewness(): third central moment from the raw moments x, x2, x3:
+//   skew = <x^3> - 3 <x><x^2> + 2 <x>^3
+template< typename T >
+inline void nse::skewness(
+	T* _RESTRICT skew,
+	const T* _RESTRICT const x, const T* _RESTRICT const x2,
+	const T* _RESTRICT const x3, const int n)
+{
+#pragma omp parallel for shared( skew )
+	for (int idx = 0; idx < n; idx++)
+		skew[idx] = x3[idx] - (T)3.0 * x[idx] * x2[idx] + (T)2.0 * x[idx] * x[idx] * x[idx];
+}
+
+// coskewness(): central moment <x'^2 y'> from raw moments:
+//   coskew = <x^2 y> - <x^2><y> - 2 <x><xy> + 2 <x>^2 <y>
+template< typename T >
+inline void nse::coskewness(
+	T* _RESTRICT coskew,
+	const T* _RESTRICT const x, const T* _RESTRICT const y,
+	const T* _RESTRICT const x2, const T* _RESTRICT const xy,
+	const T* _RESTRICT const x2y, const int n)
+{
+#pragma omp parallel for shared( coskew )
+	for (int idx = 0; idx < n; idx++)
+		coskew[idx] = x2y[idx] - x2[idx] * y[idx]
+		- (T)2.0 * x[idx] * xy[idx] + (T)2.0 * x[idx] * x[idx] * y[idx];
+}
+// ----------------------------------------------------------------------------------------- //
+
+// matvec(): y = A * x for a dense row-major matrix A[n * n]
+template< nse::memType mem, typename T >
+inline void nse::matvec(
+	T* _RESTRICT y,
+	const T* _RESTRICT const matrix, const T* _RESTRICT const x, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		nse_gpu::matvec(y, matrix, x, n);
+		return;
+	}
+#endif
+
+#pragma omp parallel for shared( y )
+	for (int i = 0; i < n; i++) {
+		const T* _RESTRICT const row = &matrix[i * n];
+
+		T dot = (T)0;
+		for (int j = 0; j < n; j++)
+			dot += row[j] * x[j];
+		y[i] = dot;
+	}
+}
+
+// vecmat(): y = x^T * A, i.e. y[i] = sum_j A[j][i] * x[j]
+// for a dense row-major matrix A[n * n] (CPU only)
+template< typename T >
+inline void nse::vecmat(
+	T* _RESTRICT y,
+	const T* _RESTRICT const x, const T* _RESTRICT const matrix, const int n)
+{
+#pragma omp parallel for shared( y )
+	for (int i = 0; i < n; i++) {
+		T dot = (T)0;
+
+		// column i of A: stride-n access
+		for (int j = 0; j < n; j++)
+			dot += matrix[j * n + i] * x[j];
+		y[i] = dot;
+	}
+}
+
+// resvec(): res = rhs - A * x (linear-system residual), A row-major [n * n]
+template< nse::memType mem, typename T >
+inline void nse::resvec(
+	T* _RESTRICT res,
+	const T* _RESTRICT const rhs, const T* _RESTRICT const matrix,
+	const T* _RESTRICT const x, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) {
+		nse_gpu::resvec(res, rhs, matrix, x, n);
+		return;
+	}
+#endif
+
+#pragma omp parallel for shared( res )
+	for (int i = 0; i < n; i++) {
+		const T* _RESTRICT const row = &matrix[i * n];
+
+		T acc = rhs[i];
+		for (int j = 0; j < n; j++)
+			acc -= row[j] * x[j];
+		res[i] = acc;
+	}
+}
+// ----------------------------------------------------------------------------------------- //
+
+// * matrix operations * //
+// --------------------- //
+// det(): determinant of a 2x2 matrix [[a11 a12], [a21 a22]]
+template< typename T >
+inline T nse::det(
+	const T a11, const T a12,
+	const T a21, const T a22)
+{
+	const T main_diag = a11 * a22;
+	const T anti_diag = a12 * a21;
+	return main_diag - anti_diag;
+}
+
+// det(): determinant of a 3x3 matrix, cofactor expansion along row 1
+template< typename T >
+inline T nse::det(
+	const T a11, const T a12, const T a13,
+	const T a21, const T a22, const T a23,
+	const T a31, const T a32, const T a33)
+{
+	const T m11 = a22 * a33 - a23 * a32;	// minor of a11
+	const T m12 = a21 * a33 - a23 * a31;	// minor of a12
+	const T m13 = a21 * a32 - a22 * a31;	// minor of a13
+	return a11 * m11 - a12 * m12 + a13 * m13;
+}
+
+// det(): determinant of a 4x4 matrix via the six 2x2 minors of the
+// top two rows paired with the complementary minors of the bottom two
+template< typename T >
+inline T nse::det(
+	const T a11, const T a12, const T a13, const T a14,
+	const T a21, const T a22, const T a23, const T a24,
+	const T a31, const T a32, const T a33, const T a34,
+	const T a41, const T a42, const T a43, const T a44)
+{
+	const T p1 = a11 * a22 - a12 * a21, q1 = a33 * a44 - a34 * a43;
+	const T p2 = a11 * a23 - a13 * a21, q2 = a32 * a44 - a34 * a42;
+	const T p3 = a11 * a24 - a14 * a21, q3 = a32 * a43 - a33 * a42;
+	const T p4 = a12 * a23 - a13 * a22, q4 = a31 * a44 - a34 * a41;
+	const T p5 = a12 * a24 - a14 * a22, q5 = a31 * a43 - a33 * a41;
+	const T p6 = a13 * a24 - a14 * a23, q6 = a31 * a42 - a32 * a41;
+
+	return	p1 * q1 - p2 * q2 + p3 * q3
+		+ p4 * q4 - p5 * q5 + p6 * q6;
+}
+
+// det(): determinant of a dense row-major matrix[n * n];
+// n <= 4 uses the closed-form overloads above; larger n falls back to
+// recursive Laplace (cofactor) expansion along the first row — O(n!)
+// complexity, intended for small matrices only. Scratch space of size
+// (n-1)^2 is allocated per recursion level via the project helpers
+// allocate_vnull()/deallocate().
+template< typename T >
+T nse::det( // matrix[n * n] determinant
+	const T* _RESTRICT const matrix, const int n)
+{
+	if (n <= 0) return (T)0;
+	if (n == 1) return (*matrix);
+
+	if (n == 2)
+		return det(matrix[0], matrix[1],
+		matrix[2], matrix[3]);
+
+	if (n == 3)
+		return det(matrix[0], matrix[1], matrix[2],
+		matrix[3], matrix[4], matrix[5],
+		matrix[6], matrix[7], matrix[8]);
+
+	if (n == 4)
+		return det(matrix[0], matrix[1], matrix[2], matrix[3],
+		matrix[4], matrix[5], matrix[6], matrix[7],
+		matrix[8], matrix[9], matrix[10], matrix[11],
+		matrix[12], matrix[13], matrix[14], matrix[15]);
+
+	T *sub;
+	allocate_vnull(&sub, (n - 1) * (n - 1));
+	int i, j, k, sub_idx;
+	T sub_det = (T)0, sub_sign;
+
+	// expand along row 0: for each column k build the (n-1)x(n-1)
+	// submatrix with row 0 and column k removed
+	for (k = 0; k < n; k++) {
+		for (i = 1; i < n; i++) {
+			sub_idx = (i - 1) * (n - 1);
+			for (j = 0; j < n; j++) {
+				if (j == k) continue;
+
+				sub[sub_idx] = matrix[i * n + j];
+				sub_idx++;
+			}
+		}
+		// alternating sign (-1)^k of the first-row cofactor
+		sub_sign = ((k % 2) == 0) ? (T) 1.0 : -(T) 1.0;
+		sub_det += sub_sign * matrix[k] * det(sub, n - 1);
+	}
+
+	deallocate(sub);
+	return sub_det;
+}
+
+// cofactor(): signed cofactor C(ic, jc) of matrix[n * n];
+// ic, jc are 1-BASED row/column indices (see the (i + 1) == ic tests);
+// uses scratch of size (n-1)^2 from project allocate_vnull()/deallocate()
+template< typename T >
+T nse::cofactor( // matrix[n * n] cofactor(i, j) determinant
+	const int ic, const int jc, const T* _RESTRICT const matrix, const int n)
+{
+	if (n <= 0) return (T)0;
+
+	int i, j, k, sub_idx;
+
+	T *sub;
+	allocate_vnull(&sub, (n - 1) * (n - 1));
+	T sub_det = (T)0;
+	// cofactor sign: (-1)^(ic + jc)
+	T sub_sign = (((ic + jc) % 2) == 0) ? (T) 1.0 : -(T) 1.0;
+
+	// build the minor: copy all rows but ic and all columns but jc
+	k = 0;
+	for (i = 0; i < n; i++) {
+		if ((i + 1) == ic) continue;
+
+		sub_idx = k * (n - 1);
+		for (j = 0; j < n; j++) {
+			if ((j + 1) == jc) continue;
+			sub[sub_idx] = matrix[i * n + j];
+			sub_idx++;
+		}
+		k++;
+	}
+	sub_det = det(sub, n - 1);
+
+	deallocate(sub);
+	return sub_sign * sub_det;
+}
+
+// inverse(): inv_matrix = matrix^(-1) via the adjugate:
+//   inv[j][i] = cofactor(i, j) / det
+// returns false (inv_matrix untouched) when the matrix is singular;
+// O(n^2) cofactor evaluations — intended for small matrices
+template< typename T >
+bool nse::inverse( // matrix[n * n] inverse
+	T* _RESTRICT inv_matrix,
+	const T* _RESTRICT const matrix, const int n)
+{
+	const T _det = det(matrix, n);
+	if (fabs(_det) == (T)0) return false;
+
+	const T inv_det = (T)1 / _det;
+	for (int i = 0; i < n; i++) {
+		// write column i <- row i of the cofactor matrix (transpose)
+		for (int j = 0; j < n; j++)
+			inv_matrix[j * n + i] = inv_det *
+				cofactor(i + 1, j + 1, matrix, n);
+	}
+
+	return true;
+}
+// ----------------------------------------------------------------------------------------- //
+
+// * interpolation * //
+// ----------------- //
+// interp_bilinear(): bilinear interpolation on the unit square,
+// (x, y) in [0, 1]^2; vAB is the value at corner (x = A, y = B)
+template< typename T >
+inline T nse::interp_bilinear(
+	const T x, const T y,
+	const T v00, const T v10,
+	const T v01, const T v11)
+{
+	const T cx = v10 - v00;					// gradient along x
+	const T cy = v01 - v00;					// gradient along y
+	const T cxy = v00 - v10 - v01 + v11;	// cross term
+
+	return v00 + x * cx + y * cy + x * y * cxy;
+}
+
+// interp_bilinear(): bilinear interpolation on the physical cell
+// [sx, sx + dx] x [sy, sy + dy]; coordinates are normalized first
+template< typename T >
+inline T nse::interp_bilinear(
+	const T x, const T y,
+	const T sx, const T sy, const T dx, const T dy,
+	const T v00, const T v10,
+	const T v01, const T v11)
+{
+	const T xi = (x - sx) / dx;
+	const T eta = (y - sy) / dy;
+
+	return interp_bilinear(xi, eta, v00, v10, v01, v11);
+}
+
+// interp_trilinear(): trilinear interpolation on the unit cube,
+// (x, y, z) in [0, 1]^3; vABC is the value at corner (x=A, y=B, z=C)
+template< typename T >
+inline T nse::interp_trilinear(
+	const T x, const T y, const T z,
+	const T v000, const T v100,
+	const T v010, const T v110,
+	const T v001, const T v101,
+	const T v011, const T v111)
+{
+	// polynomial coefficients: constant, linear, bilinear, trilinear
+	const T a0 = v000,
+		a1 = v100 - v000,
+		a2 = v010 - v000,
+		a3 = v001 - v000,
+		a4 = v110 - v010 - v100 + v000,
+		a5 = v011 - v001 - v010 + v000,
+		a6 = v101 - v001 - v100 + v000,
+		a7 = v111 - v011 - v101 - v110 + v100 + v001 + v010 - v000;
+
+	return a0 + a1 * x + a2 * y + a3 * z +
+		a4 * x * y + a5 * y * z + a6 * x * z +
+		a7 * x * y * z;
+}
+
+// interp_trilinear(): trilinear interpolation on the physical cell
+// [sx, sx+dx] x [sy, sy+dy] x [sz, sz+dz]; coordinates normalized first
+template< typename T >
+inline T nse::interp_trilinear(
+	const T x, const T y, const T z,
+	const T sx, const T sy, const T sz,
+	const T dx, const T dy, const T dz,
+	const T v000, const T v100,
+	const T v010, const T v110,
+	const T v001, const T v101,
+	const T v011, const T v111)
+{
+	const T xi = (x - sx) / dx;
+	const T eta = (y - sy) / dy;
+	const T zeta = (z - sz) / dz;
+
+	return interp_trilinear(xi, eta, zeta,
+		v000, v100, v010, v110, v001, v101, v011, v111);
+}
+
+// interp(): piecewise-linear interpolation of the profile X(z) at pz;
+//   X[n], z[n] — profile values and node coordinates, z ASSUMED ascending
+// outside [z[0], z[n-1]] the first/last segment is extrapolated linearly;
+// returns 0 for an empty profile, X[0] for a single-node profile.
+// (The original duplicated the interpolation formula in three branches;
+// clamping the segment index unifies them: the low/high extrapolation
+// formulas coincide with the interior formula at k = 0 and k = n - 2.
+// The n <= 0 guard also prevents an out-of-bounds read for negative n.)
+template< typename T >
+inline T nse::interp(const T pz,
+	const T* _RESTRICT X, const T* _RESTRICT z, const int n)
+{
+	if (n <= 0) return (T)0;
+	if (n == 1) return X[0];
+
+	// locate segment k with pz in [z[k], z[k + 1]], clamped to the
+	// first/last segment for out-of-range pz (linear extrapolation)
+	int k = 0;
+	while ((k < n - 2) && (pz > z[k + 1])) k++;
+
+	T alpha = (z[k + 1] - pz) / (z[k + 1] - z[k]);
+	return alpha * X[k] + ((T)1.0 - alpha) * X[k + 1];
+}
+// ----------------------------------------------------------------------------------------- //
+
+// runge_kutta_o4(): classical 4th-order Runge-Kutta integration of the
+// linear damped-oscillator system
+//   y' = u,  u' = f - beta * y - alpha * u
+// with constant forcing f, starting from (y0, u0), performing niters
+// steps of size dt; final state is written to (*ynext, *unext).
+// k* advance y, l* advance u at the four RK4 stages.
+template< typename T >
+void nse::runge_kutta_o4(
+	T* _RESTRICT ynext, T* _RESTRICT unext,
+	const T y0, const T u0, const T f,
+	const T alpha, const T beta,
+
+	const T dt, const int niters)
+{
+	T y = y0, u = u0;
+
+	T k1, k2, k3, k4,
+		l1, l2, l3, l4;
+
+	for (int i = 0; i < niters; i++) {
+		// stage 1: slopes at the step start
+		k1 = dt * u;
+		l1 = dt * (f - beta * y - alpha * u);
+
+		// stages 2-3: slopes at midpoint estimates
+		k2 = dt * (u + (T) 0.5 * l1);
+		l2 = dt * (f - beta * (y + (T) 0.5 * k1) - alpha * (u + (T) 0.5 * l1));
+
+		k3 = dt * (u + (T) 0.5 * l2);
+		l3 = dt * (f - beta * (y + (T) 0.5 * k2) - alpha * (u + (T) 0.5 * l2));
+
+		// stage 4: slopes at the step end
+		k4 = dt * (u + l3);
+		l4 = dt * (f - beta * (y + k3) - alpha * (u + l3));
+
+		// weighted combination: (k1 + 2 k2 + 2 k3 + k4) / 6
+		y += ((T) 1.0 / (T) 6.0) * (k1 + (T) 2.0 * k2 + (T) 2.0 * k3 + k4);
+		u += ((T) 1.0 / (T) 6.0) * (l1 + (T) 2.0 * l2 + (T) 2.0 * l3 + l4);
+	}
+
+	(*ynext) = y;
+	(*unext) = u;
+}
+// ----------------------------------------------------------------------------------------- //
+
+// * simple damping functions * //
+// ---------------------------- //
+// linear_damping(): ramp from 0 at t = T0 to 1 at t = T1,
+// clamped to [0, 1] outside the interval
+template< typename T >
+inline T nse::linear_damping(const T t, const T T0, const T T1)
+{
+	return (t <= T0) ? (T) 0.0 :
+		(t >= T1) ? (T) 1.0 :
+		(t - T0) / (T1 - T0);
+}
+
+// exp_damping(): 0 for t <= T0, then 1 - 2^(-(t - T0) / Tperiod),
+// i.e. reaches 1/2 at t = T0 + Tperiod and tends to 1
+template< typename T >
+inline T nse::exp_damping(const T t, const T T0, const T Tperiod)
+{
+	return (t <= T0) ? (T) 0.0 :
+		(T) 1.0 - exp(-log((T)2.0) * (t - T0) / Tperiod);
+}
+// ----------------------------------------------------------------------------------------- //
+
+
+// * check for finite values * //
+// --------------------------- //
+// is_finite(): returns true iff every x[i], i = 0..n-1, is finite
+// (no NaN / +-Inf); dispatches to the GPU implementation for memGPU
+template< nse::memType mem, typename T >
+inline bool nse::is_finite(const T* _RESTRICT const x, const int n)
+{
+#ifndef EXCLUDE_GPU_BRANCH
+	if (mem == memGPU) return nse_gpu::is_finite(x, n);
+	else
+#endif
+	{
+		// memCPU //
+		int i;
+		int num_inf = 0;
+
+		// num_inf counts groups of 4 containing any non-finite value
+		// (plus single tail elements) — only zero/non-zero matters
+#pragma omp parallel for private( i ) reduction( + : num_inf )
+		for (i = 0; i < n - (n % 4); i += 4) {
+			if ((!isfinite(x[i])) || (!isfinite(x[i + 1])) ||
+				(!isfinite(x[i + 2])) || (!isfinite(x[i + 3]))) num_inf++;
+		}
+
+		// serial remainder: last n % 4 elements
+		for (i = n - (n % 4); i < n; i++)
+			if (!isfinite(x[i])) num_inf++;
+
+		return (num_inf == 0);
+	}
+}
+// ----------------------------------------------------------------------------------------- //
diff --git a/wstgrid3d.h b/wstgrid3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..794f1d34bc317a38057a5dd6b716af220bf8d666
--- /dev/null
+++ b/wstgrid3d.h
@@ -0,0 +1,2462 @@
+#pragma once
+
+// [wstgrid3d.h]: 3D wall-stretched grid
+//
+// -------------------------------------------------------------------------------------------- //
+
+#include "nse-sys.h"
+#include "grid3d.h"
+#include "vecmath.h"
+#include "mpi-vecmath.h"
+
+#ifndef EXCLUDE_GPU_BRANCH
+#include "wstgrid3d.cuh"
+#endif
+
+//#define USE_ERF_WSTGRID3D					// use ERF function for wall stretching
+
+
+namespace nse
+{
+	// * 3D wall-stretched grid: wstGrid3d< T > [ T = float, double ] * //
+	// =======================================================================
+	template< typename T, memType mem = memCPU >
+	class wstGrid3d : public Grid3d< T, mem >
+	{
+	public:
+		wstGrid3d();
+		wstGrid3d(const wstGrid3d<T, mem>& grid);
+		~wstGrid3d();
+
+
+		// grid setup: builds the -z coordinate line (tanh-stretched,
+		// stretched-to-infinity, uniform, or stretched with a uniform
+		// near-wall region) and initializes the grid; returns false on failure
+		bool set(const T x, const T y, const T z,
+			const T length, const T width, const T height,
+			const T ksi_z,
+			const int cx, const int cy, const int cz,
+			const int gcx, const int gcy, const int gcz,
+
+			const int mpi_ndim);
+
+		bool set(const T x, const T y, const T z,
+			const T length, const T width, const T height,
+			const T ksi_z,
+			const int cx, const int cy, const int cz,
+			const int gcx, const int gcy, const int gcz,
+
+			const int mpi_size_x, const int mpi_size_y, const int mpi_size_z);
+
+		bool set_inf(const T x, const T y, const T z,
+			const T length, const T width, const T height,
+			const T ksi_z,
+			const int cx, const int cy, const int cz,
+			const int gcx, const int gcy, const int gcz,
+
+			const int mpi_ndim);
+
+		bool set_inf(const T x, const T y, const T z,
+			const T length, const T width, const T height,
+			const T ksi_z,
+			const int cx, const int cy, const int cz,
+			const int gcx, const int gcy, const int gcz,
+
+			const int mpi_size_x, const int mpi_size_y, const int mpi_size_z);
+
+		bool set_uni(const T x, const T y, const T z,
+			const T length, const T width, const T height,
+			const int cx, const int cy, const int cz,
+			const int gcx, const int gcy, const int gcz,
+
+			const int mpi_ndim);
+
+		bool set_uni(const T x, const T y, const T z,
+			const T length, const T width, const T height,
+			const int cx, const int cy, const int cz,
+			const int gcx, const int gcy, const int gcz,
+
+			const int mpi_size_x, const int mpi_size_y, const int mpi_size_z);
+
+		bool set(const T x, const T y, const T z,
+			const T length, const T width, const T height,
+			const T reg_min_z, const T reg_max_z,
+			const T ksi_z,
+			const int cx, const int cy, const int reg_cz,
+			const int gcx, const int gcy, const int gcz,
+
+			const int mpi_ndim);
+
+		bool set(const T x, const T y, const T z,
+			const T length, const T width, const T height,
+			const T reg_min_z, const T reg_max_z,
+			const T ksi_z,
+			const int cx, const int cy, const int reg_cz,
+			const int gcx, const int gcy, const int gcz,
+
+			const int mpi_size_x, const int mpi_size_y, const int mpi_size_z);
+
+		bool set_inf(const T x, const T y, const T z,
+			const T length, const T width, const T height,
+			const T reg_max_z,
+			const T ksi_z,
+			const int cx, const int cy, const int reg_cz,
+			const int gcx, const int gcy, const int gcz,
+
+			const int mpi_ndim);
+
+		bool set_inf(const T x, const T y, const T z,
+			const T length, const T width, const T height,
+			const T reg_max_z,
+			const T ksi_z,
+			const int cx, const int cy, const int reg_cz,
+			const int gcx, const int gcy, const int gcz,
+
+			const int mpi_size_x, const int mpi_size_y, const int mpi_size_z);
+
+
+		// MPI global cell index [== -1 - on failure]
+		int mpi_locate_x(const T x) const;
+		int mpi_locate_y(const T y) const;
+		int mpi_locate_z(const T z) const;	// collective: all ranks must call //
+
+		// locate cell index [== -1 on failure]
+		// - include MPI & domain ghost cells [non unique MPI call] >=0
+		int locate_inc_x(const T x) const;
+		int locate_inc_y(const T y) const;
+		int locate_inc_z(const T z) const;
+
+		// interpolation (local relative to (x,y,z) position in processor domain)
+		T c_interp(const T* X, const T x, const T y, const T z) const;
+		T u_interp(const T* U, const T x, const T y, const T z) const;
+		T v_interp(const T* V, const T x, const T y, const T z) const;
+		T w_interp(const T* W, const T x, const T y, const T z) const;
+
+		// local interpolation on single MPI process 
+		// [unsafe] - not checking if coordinates [(x,y,z),(i,j,k)] are correct
+		T u_interp_local(const T* _RESTRICT const U,
+			const T x, const T y, const T z, const int i, const int j, const int k) const;
+		T v_interp_local(const T* _RESTRICT const V,
+			const T x, const T y, const T z, const int i, const int j, const int k) const;
+		T w_interp_local(const T* _RESTRICT const W,
+			const T x, const T y, const T z, const int i, const int j, const int k) const;
+
+		// batched variants: interpolate n points at once (OpenMP-parallel) //
+		void u_interp_local(T* _RESTRICT uinterp, const T* _RESTRICT const U,
+			const T* _RESTRICT const x, const T* _RESTRICT const y, const T* _RESTRICT const z,
+			const int* _RESTRICT const i, const int* _RESTRICT const j, const int* _RESTRICT const k, const int n) const;
+		void v_interp_local(T* _RESTRICT vinterp, const T* _RESTRICT const V,
+			const T* _RESTRICT const x, const T* _RESTRICT const y, const T* _RESTRICT const z,
+			const int* _RESTRICT const i, const int* _RESTRICT const j, const int* _RESTRICT const k, const int n) const;
+		void w_interp_local(T* _RESTRICT winterp, const T* _RESTRICT const W,
+			const T* _RESTRICT const x, const T* _RESTRICT const y, const T* _RESTRICT const z,
+			const int* _RESTRICT const i, const int* _RESTRICT const j, const int* _RESTRICT const k, const int n) const;
+
+
+		// profiling (C,U,V,W) -> C node array
+		// -x profile
+		void c_profile_at_yz(T* Px, const T* X, const T y, const T z) const;
+		void u_profile_at_yz(T* Px, const T* U, const T y, const T z) const;
+		void v_profile_at_yz(T* Px, const T* V, const T y, const T z) const;
+		void w_profile_at_yz(T* Px, const T* W, const T y, const T z) const;
+		// -y profile
+		void c_profile_at_xz(T* Py, const T* X, const T x, const T z) const;
+		void u_profile_at_xz(T* Py, const T* U, const T x, const T z) const;
+		void v_profile_at_xz(T* Py, const T* V, const T x, const T z) const;
+		void w_profile_at_xz(T* Py, const T* W, const T x, const T z) const;
+		// -z profile
+		void c_profile_at_xy(T* Pz, const T* X, const T x, const T y) const;
+		void u_profile_at_xy(T* Pz, const T* U, const T x, const T y) const;
+		void v_profile_at_xy(T* Pz, const T* V, const T x, const T y) const;
+		void w_profile_at_xy(T* Pz, const T* W, const T x, const T y) const;
+
+		// averaging (C,U,V,W) -> (C,U,V,W) (precise: no sub-averaging to C)
+		// U,V,W: have to assume periodic b.c's (or dirichlet at west,south,bottom)
+		template< nse_const3d::nodeType node = nse_const3d::nodeC, typename PType >	// -x average [ X(nx,ny,nz)->Pyz(ny,nz) ]
+		void average_by_x(PType* _RESTRICT Pyz, const T* _RESTRICT const X) const;
+		template< nse_const3d::nodeType node = nse_const3d::nodeC, typename PType >	// -y average [ X(nx,ny,nz)->Pxz(nx,nz) ]
+		void average_by_y(PType* _RESTRICT Pxz, const T* _RESTRICT const X) const;
+		template< nse_const3d::nodeType node = nse_const3d::nodeC, typename PType >	// -z average [ X(nx,ny,nz)->Pxy(nx,ny) ]
+		void average_by_z(PType* _RESTRICT Pxy, const T* _RESTRICT const X) const;
+		template< nse_const3d::nodeType node = nse_const3d::nodeC, typename PType >	// -yz average [ X(nx,ny,nz)->Px(nx) ]
+		void average_by_yz(PType* _RESTRICT Px, const T* _RESTRICT const X) const;
+		template< nse_const3d::nodeType node = nse_const3d::nodeC, typename PType >	// -xz average [ X(nx,ny,nz)->Py(ny) ]
+		void average_by_xz(PType* _RESTRICT Py, const T* _RESTRICT const X) const;
+		template< nse_const3d::nodeType node = nse_const3d::nodeC, typename PType >	// -xy average [ X(nx,ny,nz)->Pz(nz) ]
+		void average_by_xy(PType* _RESTRICT Pz, const T* _RESTRICT const X) const;
+		template< typename PType = T >	// -xyz average [ X(nx,ny,nz)-> return value ]
+		PType average(const T* _RESTRICT const X) const;
+
+		template< typename PType = T, typename CType >
+		PType average_ifeq(const T* _RESTRICT const X,
+			const CType* _RESTRICT const mask, const CType check) const;
+
+		// grid re-interpolation out(current grid), in(input grid)
+		void grid_reinterp(T* Xout, const T* Xin,				// local in array //
+			const nse_const3d::nodeType node, const GridId< T >& id) const;
+
+		void c_grid_reinterp(T* Xout, const T* Xin, const wstGrid3d< T, memCPU >& grid) const;
+		void u_grid_reinterp(T* Uout, const T* Uin, const wstGrid3d< T, memCPU >& grid) const;
+		void v_grid_reinterp(T* Vout, const T* Vin, const wstGrid3d< T, memCPU >& grid) const;
+		void w_grid_reinterp(T* Wout, const T* Win, const wstGrid3d< T, memCPU >& grid) const;
+
+		// GridId //
+		void set_id(GridId< T >& id) const;
+		bool check_id(const GridId< T >& id) const;
+
+		void set_id(GridId< T >& id, const nse_const3d::axisType axis) const;
+		bool check_id(const GridId< T >& id, const nse_const3d::axisType axis) const;
+
+	private:
+
+		// main grid parameters init routine, mpi communicator assumed initialized //
+		bool init_grid(
+			const T _x, const T _y, const T _z,
+			const T _length, const T _width, const T _height,
+			const T _ksi_z, const T* zline,
+			const int _cx, const int _cy, const int _cz,
+			const int _gcx, const int _gcy, const int _gcz);
+
+
+		// make stretched grid coordinates		
+		// _min + 0.5 * (_max - _min) * (1 + (tanh(nu)/tanh(_Ksi))) : -_Ksi <= nu <= _Ksi
+		int make_coordinates(
+			T** line,
+			const T _min, const T _max,
+			const int _cx, const T _Ksi);
+
+		// make stretched grid coordinates to infinity		
+		// _min + (_max - _min) * (1 + (tanh(nu)/tanh(_Ksi))) : -_Ksi <= nu <= 0
+		int make_coordinates_inf(
+			T** line,
+			const T _min, const T _max,
+			const int _cx, const T _Ksi);
+
+		// make uniform grid	
+		int make_coordinates_uni(
+			T** line,
+			const T _min, const T _max,
+			const int _cx);
+
+		// make stretched grid coordinates
+		// _reg_min + 0.5 * (_reg_max - _reg_min) * (1 + (tanh(nu)/tanh(_Ksi))) : -_Ksi <= nu <= _Ksi
+		// _min : _reg_min --- uniform grid with min step
+		// _reg_max : _max --- uniform grid with min step
+		int make_coordinates(
+			T** line,
+			const T _min, const T _max,
+			const T _reg_min, const T _reg_max,
+			const int _reg_cx, const T _Ksi);
+
+		// make stretched grid coordinates to infinity
+		// _min + (_reg - _min) * (tanh(nu)/tanh(_Ksi)) : 0 <= nu <= _Ksi
+		// _reg : _max --- uniform grid with min step
+		int make_coordinates_inf(
+			T** line,
+			const T _min, const T _max,
+			const T _reg,
+			const int _reg_cx, const T _Ksi);
+
+		// _min + 0.5 * (_half - _min) * (1 + (tanh(nu)/tanh(_Ksi))) : -_Ksi <= nu <= _Ksi
+		// _half + 0.5 * (_max - _half) * (1 + (tanh(nu)/tanh(_Ksi))) : _Ksi <= nu <= _Ksi
+		int make_coordinates_hplane(
+			T** line,
+			const T _min, const T _max,
+			const int _cx, const T _ksi);
+
+
+	public:
+
+		using Grid3d<T, mem>::mpi_com;
+
+		using Grid3d<T, mem>::size; using Grid3d<T, mem>::nx; using Grid3d<T, mem>::ny; using Grid3d<T, mem>::nz;
+		using Grid3d<T, mem>::nyz;
+		using Grid3d<T, mem>::mpi_nx; using Grid3d<T, mem>::mpi_ny; using Grid3d<T, mem>::mpi_nz;
+		using Grid3d<T, mem>::mpi_nxy; using Grid3d<T, mem>::mpi_nxz; using Grid3d<T, mem>::mpi_nyz;
+		using Grid3d<T, mem>::mpi_size;
+
+		using Grid3d<T, mem>::gcx; using Grid3d<T, mem>::gcy; using Grid3d<T, mem>::gcz;
+
+		using Grid3d<T, mem>::x; using Grid3d<T, mem>::y; using Grid3d<T, mem>::z;
+		using Grid3d<T, mem>::length; using Grid3d<T, mem>::width; using Grid3d<T, mem>::height;
+
+		using Grid3d<T, mem>::mpi_x; using Grid3d<T, mem>::mpi_y; using Grid3d<T, mem>::mpi_z;
+		using Grid3d<T, mem>::mpi_length; using Grid3d<T, mem>::mpi_width; using Grid3d<T, mem>::mpi_height;
+
+		using Grid3d<T, mem>::px; using Grid3d<T, mem>::py; using Grid3d<T, mem>::pz;	// cell-center coordinates //
+		using Grid3d<T, mem>::ex; using Grid3d<T, mem>::ey; using Grid3d<T, mem>::ez;	// cell-edge coordinates //
+
+		using Grid3d<T, mem>::i_local_coord;
+		using Grid3d<T, mem>::j_local_coord;
+		using Grid3d<T, mem>::k_local_coord;
+
+		using Grid3d<T, mem>::locate_x;
+		using Grid3d<T, mem>::locate_y;
+		using Grid3d<T, mem>::locate_z;
+
+		// -z stretching parameter (tanh argument bound; 0 for uniform -z grids)
+		T ksi_z;
+
+		// uniform -x/-y steps with precomputed half/inverse/quarter/squared factors
+		T dx, dy, dxh, dyh;
+		T dxi, dyi, dxih, dyih, dxiq, dyiq, dxiqh, dyiqh;
+		T dx2i, dy2i, dx2ih, dy2ih, dx2iq, dy2iq;
+		T dx2id, dy2id;
+
+		// per-cell -z steps and derived inverse factors [arrays of length nz]
+		T *dz, *dzh;
+		T *dzi, *dzih, *dziq;
+		T *dz2i, *dz2ih, *dz2iq;
+		T *dz2id;
+
+		// plus/minus side -z spacings and inverses — presumably center-to-center
+		// distances (interp routines use 2*dzpi as an inverse spacing); confirm
+		T *dzp, *dzm;
+		T *dzpi, *dzmi;
+		T *dzpih, *dzmih;
+
+		T *dzp2i, *dzm2i;
+
+		// MPI parameters (global) //
+		T dz_max, dz_min;
+		T dzi_max, dzi_min;
+		// ----------------------- //
+	};
+}
+
+
+namespace nse
+{
+	template< typename T, memType mem >
+	wstGrid3d< T, mem > ::wstGrid3d(
+	) : Grid3d<T, mem>(),
+
+		ksi_z((T)0)
+	{
+		// NOTE(review): step members (dx, dy, dz arrays, dz_min/max, ...) stay
+		// uninitialized until one of the set*() overloads runs — confirm callers
+	}
+
+	template< typename T, memType mem >
+	wstGrid3d< T, mem > ::wstGrid3d(
+		const wstGrid3d< T, mem >& grid)
+		: Grid3d< T, mem >(grid),
+
+		ksi_z(grid.ksi_z),
+
+		dx(grid.dx), dy(grid.dy),
+		dxh(grid.dxh), dyh(grid.dyh),
+		dxi(grid.dxi), dyi(grid.dyi),
+		dxih(grid.dxih), dyih(grid.dyih),
+		dxiq(grid.dxiq), dyiq(grid.dyiq),
+		dxiqh(grid.dxiqh), dyiqh(grid.dyiqh),
+		dx2i(grid.dx2i), dy2i(grid.dy2i),
+		dx2ih(grid.dx2ih), dy2ih(grid.dy2ih),
+		dx2iq(grid.dx2iq), dy2iq(grid.dy2iq),
+		dx2id(grid.dx2id), dy2id(grid.dy2id)
+	{
+		// deep-copy the per-cell -z arrays only if the source grid was set up;
+		// for an empty source the pointers (and dz_min/max) are left untouched
+		if (grid.size > 0)
+		{
+			allocate_vnull<mem>(&dz, nz); mcopy<mem, mem>(dz, grid.dz, nz);
+			allocate_vnull<mem>(&dzh, nz); mcopy<mem, mem>(dzh, grid.dzh, nz);
+
+			allocate_vnull<mem>(&dzi, nz); mcopy<mem, mem>(dzi, grid.dzi, nz);
+			allocate_vnull<mem>(&dzih, nz); mcopy<mem, mem>(dzih, grid.dzih, nz);
+			allocate_vnull<mem>(&dziq, nz); mcopy<mem, mem>(dziq, grid.dziq, nz);
+
+			allocate_vnull<mem>(&dz2i, nz); mcopy<mem, mem>(dz2i, grid.dz2i, nz);
+			allocate_vnull<mem>(&dz2ih, nz); mcopy<mem, mem>(dz2ih, grid.dz2ih, nz);
+			allocate_vnull<mem>(&dz2iq, nz); mcopy<mem, mem>(dz2iq, grid.dz2iq, nz);
+			allocate_vnull<mem>(&dz2id, nz); mcopy<mem, mem>(dz2id, grid.dz2id, nz);
+
+			allocate_vnull<mem>(&dzp, nz); mcopy<mem, mem>(dzp, grid.dzp, nz);
+			allocate_vnull<mem>(&dzm, nz); mcopy<mem, mem>(dzm, grid.dzm, nz);
+
+			allocate_vnull<mem>(&dzpi, nz); mcopy<mem, mem>(dzpi, grid.dzpi, nz);
+			allocate_vnull<mem>(&dzmi, nz); mcopy<mem, mem>(dzmi, grid.dzmi, nz);
+
+			allocate_vnull<mem>(&dzpih, nz); mcopy<mem, mem>(dzpih, grid.dzpih, nz);
+			allocate_vnull<mem>(&dzmih, nz); mcopy<mem, mem>(dzmih, grid.dzmih, nz);
+
+			allocate_vnull<mem>(&dzp2i, nz); mcopy<mem, mem>(dzp2i, grid.dzp2i, nz);
+			allocate_vnull<mem>(&dzm2i, nz); mcopy<mem, mem>(dzm2i, grid.dzm2i, nz);
+
+			dz_min = grid.dz_min; dz_max = grid.dz_max;
+			dzi_min = grid.dzi_min; dzi_max = grid.dzi_max;
+		}
+	}
+
+	template< typename T, memType mem >
+	wstGrid3d< T, mem > :: ~wstGrid3d(
+	)
+	{
+		// free the per-cell -z arrays only when the grid was initialized
+		// (size > 0) — matches the allocation condition in the copy ctor;
+		// base Grid3d dtor releases the rest
+		if (size > 0)
+		{
+			deallocate<mem>(dz);
+			deallocate<mem>(dzh);
+
+			deallocate<mem>(dzi);
+			deallocate<mem>(dzih);
+			deallocate<mem>(dziq);
+
+			deallocate<mem>(dz2i);
+			deallocate<mem>(dz2ih);
+			deallocate<mem>(dz2iq);
+			deallocate<mem>(dz2id);
+
+			deallocate<mem>(dzp);
+			deallocate<mem>(dzm);
+			deallocate<mem>(dzpi);
+			deallocate<mem>(dzmi);
+			deallocate<mem>(dzpih);
+			deallocate<mem>(dzmih);
+
+			deallocate<mem>(dzp2i);
+			deallocate<mem>(dzm2i);
+		}
+	}
+
+	template< typename T, memType mem >
+	bool wstGrid3d< T, mem > ::set(
+		const T _x, const T _y, const T _z,
+		const T _length, const T _width, const T _height,
+		const T _ksi_z,
+		const int _cx, const int _cy, const int _cz,
+		const int _gcx, const int _gcy, const int _gcz,
+		const int mpi_ndim)
+	{
+		// setup with a tanh-stretched -z line on [_z, _z + _height]
+		mpi_com.set(mpi_ndim);
+
+		// - init -z line coordinates
+		T *zline;
+		int znum = make_coordinates(&zline, _z, _z + _height, _cz, _ksi_z);
+		// NOTE(review): with '&&' the second check is dead code (znum < 0
+		// implies znum != _cz + 1); '||' was probably intended — confirm
+		// against make_coordinates' contract before changing
+		if ((znum < 0) && (znum != _cz + 1)) return false;
+		// assumes make_coordinates leaves zline unallocated on failure — TODO confirm
+
+		// - init grid
+		bool status = init_grid(_x, _y, _z, _length, _width, _height,
+			_ksi_z, zline, _cx, _cy, znum - 1, _gcx, _gcy, _gcz);
+
+		deallocate(zline);
+		return status;
+	}
+
+	template< typename T, memType mem >
+	bool wstGrid3d< T, mem > ::set(
+		const T _x, const T _y, const T _z,
+		const T _length, const T _width, const T _height,
+		const T _ksi_z,
+		const int _cx, const int _cy, const int _cz,
+		const int _gcx, const int _gcy, const int _gcz,
+		const int mpi_size_x, const int mpi_size_y, const int mpi_size_z)
+	{
+		// same as set(..., mpi_ndim) but with an explicit MPI topology
+		mpi_com.set(mpi_size_x, mpi_size_y, mpi_size_z);
+
+		// - init -z line coordinates
+		T *zline;
+		int znum = make_coordinates(&zline, _z, _z + _height, _cz, _ksi_z);
+		// NOTE(review): '&&' makes the znum != _cz + 1 check dead — see set() above
+		if ((znum < 0) && (znum != _cz + 1)) return false;
+
+		// - init grid
+		bool status = init_grid(_x, _y, _z, _length, _width, _height,
+			_ksi_z, zline, _cx, _cy, znum - 1, _gcx, _gcy, _gcz);
+
+		deallocate(zline);
+		return status;
+	}
+
+	template< typename T, memType mem >
+	bool wstGrid3d< T, mem > ::set_inf(
+		const T _x, const T _y, const T _z,
+		const T _length, const T _width, const T _height,
+		const T _ksi_z,
+		const int _cx, const int _cy, const int _cz,
+		const int _gcx, const int _gcy, const int _gcz,
+		const int mpi_ndim)
+	{
+		// setup with a -z line stretched toward the upper boundary ("to infinity")
+		mpi_com.set(mpi_ndim);
+
+		// - init -z line coordinates
+		T *zline;
+		int znum = make_coordinates_inf(&zline, _z, _z + _height, _cz, _ksi_z);
+		// NOTE(review): '&&' makes the znum != _cz + 1 check dead — see set() above
+		if ((znum < 0) && (znum != _cz + 1)) return false;
+
+		// - init grid
+		bool status = init_grid(_x, _y, _z, _length, _width, _height,
+			_ksi_z, zline, _cx, _cy, znum - 1, _gcx, _gcy, _gcz);
+
+		deallocate(zline);
+		return status;
+	}
+
+	template< typename T, memType mem >
+	bool wstGrid3d< T, mem > ::set_inf(
+		const T _x, const T _y, const T _z,
+		const T _length, const T _width, const T _height,
+		const T _ksi_z,
+		const int _cx, const int _cy, const int _cz,
+		const int _gcx, const int _gcy, const int _gcz,
+		const int mpi_size_x, const int mpi_size_y, const int mpi_size_z)
+	{
+		// same as set_inf(..., mpi_ndim) but with an explicit MPI topology
+		mpi_com.set(mpi_size_x, mpi_size_y, mpi_size_z);
+
+		// - init -z line coordinates
+		T *zline;
+		int znum = make_coordinates_inf(&zline, _z, _z + _height, _cz, _ksi_z);
+		// NOTE(review): '&&' makes the znum != _cz + 1 check dead — see set() above
+		if ((znum < 0) && (znum != _cz + 1)) return false;
+
+		// - init grid
+		bool status = init_grid(_x, _y, _z, _length, _width, _height,
+			_ksi_z, zline, _cx, _cy, znum - 1, _gcx, _gcy, _gcz);
+
+		deallocate(zline);
+		return status;
+	}
+
+	template< typename T, memType mem >
+	bool wstGrid3d< T, mem > ::set_uni(
+		const T _x, const T _y, const T _z,
+		const T _length, const T _width, const T _height,
+		const int _cx, const int _cy, const int _cz,
+		const int _gcx, const int _gcy, const int _gcz,
+		const int mpi_ndim)
+	{
+		// setup with a uniform -z line (ksi_z is passed to init_grid as 0)
+		mpi_com.set(mpi_ndim);
+
+		// - init -z line coordinates
+		T *zline;
+		int znum = make_coordinates_uni(&zline, _z, _z + _height, _cz);
+		// NOTE(review): '&&' makes the znum != _cz + 1 check dead — see set() above
+		if ((znum < 0) && (znum != _cz + 1)) return false;
+
+		// - init grid
+		bool status = init_grid(_x, _y, _z, _length, _width, _height,
+			(T)0, zline, _cx, _cy, znum - 1, _gcx, _gcy, _gcz);
+		// modify init grid to remove ksi_z as input
+
+		deallocate(zline);
+		return status;
+	}
+
+	template< typename T, memType mem >
+	bool wstGrid3d< T, mem > ::set_uni(
+		const T _x, const T _y, const T _z,
+		const T _length, const T _width, const T _height,
+		const int _cx, const int _cy, const int _cz,
+		const int _gcx, const int _gcy, const int _gcz,
+		const int mpi_size_x, const int mpi_size_y, const int mpi_size_z)
+	{
+		// same as set_uni(..., mpi_ndim) but with an explicit MPI topology
+		mpi_com.set(mpi_size_x, mpi_size_y, mpi_size_z);
+
+		// - init -z line coordinates
+		T *zline;
+		int znum = make_coordinates_uni(&zline, _z, _z + _height, _cz);
+		// NOTE(review): '&&' makes the znum != _cz + 1 check dead — see set() above
+		if ((znum < 0) && (znum != _cz + 1)) return false;
+
+		// - init grid
+		bool status = init_grid(_x, _y, _z, _length, _width, _height,
+			(T)0, zline, _cx, _cy, znum - 1, _gcx, _gcy, _gcz);
+		// modify init grid to remove ksi_z as input
+
+		deallocate(zline);
+		return status;
+	}
+
+	template< typename T, memType mem >
+	bool wstGrid3d< T, mem > ::set(
+		const T _x, const T _y, const T _z,
+		const T _length, const T _width, const T _height,
+		const T _reg_min_z, const T _reg_max_z,
+		const T _ksi_z,
+		const int _cx, const int _cy, const int _reg_cz,
+		const int _gcx, const int _gcy, const int _gcz,
+		const int mpi_ndim)
+	{
+		// setup with stretching inside [_reg_min_z, _reg_max_z] and uniform
+		// min-step segments outside (see make_coordinates overload)
+		mpi_com.set(mpi_ndim);
+
+		// - init -z line coordinates
+		T *zline;
+		int znum = make_coordinates(&zline, _z, _z + _height, _reg_min_z, _reg_max_z, _reg_cz, _ksi_z);
+		if (znum < 0) return false;
+
+		// - init grid
+		bool status = init_grid(_x, _y, _z, _length, _width, _height,
+			_ksi_z, zline, _cx, _cy, znum - 1, _gcx, _gcy, _gcz);
+
+		deallocate(zline);
+		return status;
+	}
+
+	template< typename T, memType mem >
+	bool wstGrid3d< T, mem > ::set(
+		const T _x, const T _y, const T _z,
+		const T _length, const T _width, const T _height,
+		const T _reg_min_z, const T _reg_max_z,
+		const T _ksi_z,
+		const int _cx, const int _cy, const int _reg_cz,
+		const int _gcx, const int _gcy, const int _gcz,
+		const int mpi_size_x, const int mpi_size_y, const int mpi_size_z)
+	{
+		// same as the regional set(..., mpi_ndim) with an explicit MPI topology
+		mpi_com.set(mpi_size_x, mpi_size_y, mpi_size_z);
+
+		// - init -z line coordinates
+		T *zline;
+		int znum = make_coordinates(&zline, _z, _z + _height, _reg_min_z, _reg_max_z, _reg_cz, _ksi_z);
+		if (znum < 0) return false;
+
+		// - init grid
+		bool status = init_grid(_x, _y, _z, _length, _width, _height,
+			_ksi_z, zline, _cx, _cy, znum - 1, _gcx, _gcy, _gcz);
+
+		deallocate(zline);
+		return status;
+	}
+
+	template< typename T, memType mem >
+	bool wstGrid3d< T, mem > ::set_inf(
+		const T _x, const T _y, const T _z,
+		const T _length, const T _width, const T _height,
+		const T _reg_max_z,
+		const T _ksi_z,
+		const int _cx, const int _cy, const int _reg_cz,
+		const int _gcx, const int _gcy, const int _gcz,
+		const int mpi_ndim)
+	{
+		// setup stretched up to _reg_max_z, uniform min-step segment above
+		mpi_com.set(mpi_ndim);
+
+		// - init -z line coordinates
+		T *zline;
+		int znum = make_coordinates_inf(&zline, _z, _z + _height, _reg_max_z, _reg_cz, _ksi_z);
+		if (znum < 0) return false;
+
+		// - init grid
+		bool status = init_grid(_x, _y, _z, _length, _width, _height,
+			_ksi_z, zline, _cx, _cy, znum - 1, _gcx, _gcy, _gcz);
+
+		deallocate(zline);
+		return status;
+	}
+
+	template< typename T, memType mem >
+	bool wstGrid3d< T, mem > ::set_inf(
+		const T _x, const T _y, const T _z,
+		const T _length, const T _width, const T _height,
+		const T _reg_max_z,
+		const T _ksi_z,
+		const int _cx, const int _cy, const int _reg_cz,
+		const int _gcx, const int _gcy, const int _gcz,
+		const int mpi_size_x, const int mpi_size_y, const int mpi_size_z)
+	{
+		// same as the regional set_inf(..., mpi_ndim) with an explicit MPI topology
+		mpi_com.set(mpi_size_x, mpi_size_y, mpi_size_z);
+
+		// - init -z line coordinates
+		T *zline;
+		int znum = make_coordinates_inf(&zline, _z, _z + _height, _reg_max_z, _reg_cz, _ksi_z);
+		if (znum < 0) return false;
+
+		// - init grid
+		bool status = init_grid(_x, _y, _z, _length, _width, _height,
+			_ksi_z, zline, _cx, _cy, znum - 1, _gcx, _gcy, _gcz);
+
+		deallocate(zline);
+		return status;
+	}
+
+	template< typename T, memType mem >
+	int wstGrid3d<T, mem>::mpi_locate_x(const T _x) const
+	{
+		// scan interior MPI-global cells along -x; cell i occupies
+		// [mpi_x + (i - gcx) * dx, mpi_x + (i - gcx + 1) * dx]
+		T cell_min = mpi_x;
+		for (int i = gcx; i < mpi_nx - gcx; i++) {
+			const T cell_max = cell_min + dx;
+			if ((_x >= cell_min) && (_x <= cell_max)) return i;
+			cell_min = cell_max;
+		}
+
+		return -1;	// not within the global -x extent //
+	}
+
+	template< typename T, memType mem >
+	int wstGrid3d<T, mem>::mpi_locate_y(const T _y) const
+	{
+		// scan interior MPI-global cells along -y; cell j occupies
+		// [mpi_y + (j - gcy) * dy, mpi_y + (j - gcy + 1) * dy]
+		T cell_min = mpi_y;
+		for (int j = gcy; j < mpi_ny - gcy; j++) {
+			const T cell_max = cell_min + dy;
+			if ((_y >= cell_min) && (_y <= cell_max)) return j;
+			cell_min = cell_max;
+		}
+
+		return -1;	// not within the global -y extent //
+	}
+
+	template< typename T, memType mem >
+	int wstGrid3d< T, mem > ::mpi_locate_z(const T _z) const
+	{
+		// MPI-collective: every rank in mpi_com.comm must call this
+		int k = locate_z(_z);
+
+		// highest z-rank that located the point locally [-1 if no rank did]
+		int p_rank = (k >= 0) ? mpi_com.rank_z : -1;
+		mpi_allreduce(&p_rank, MPI_MAX, mpi_com.comm);
+		if (p_rank == -1) return -1;
+
+		// ranks below the owner contribute their interior -z cell count,
+		// the owner contributes the local index k, ranks above contribute 0
+		int shz = (mpi_com.rank_z < p_rank) ? nz - 2 * gcz : 0;
+		if (mpi_com.rank_z == p_rank) shz = k;
+
+		mpi_allreduce(&shz, MPI_SUM, mpi_com.comm);
+		// the sum over the full communicator counts each z-layer
+		// size_x * size_y times — divide it back out
+		shz /= (mpi_com.size_x * mpi_com.size_y);
+
+		return shz;
+	}
+
+	template< typename T, memType mem >
+	int wstGrid3d<T, mem>::locate_inc_x(const T _x) const
+	{
+		// scan all local -x cells, ghost cells included: the first cell edge
+		// lies gcx uniform steps to the left of the local domain origin x
+		T edge = x - gcx * dx;
+		for (int i = 0; i < nx; i++) {
+			if ((_x >= edge) && (_x <= edge + dx)) return i;
+			edge += dx;
+		}
+
+		return -1;	// outside the local grid (ghosts included) //
+	}
+
+	template< typename T, memType mem >
+	int wstGrid3d<T, mem>::locate_inc_y(const T _y) const
+	{
+		// scan all local -y cells, ghost cells included: the first cell edge
+		// lies gcy uniform steps below the local domain origin y
+		T edge = y - gcy * dy;
+		for (int j = 0; j < ny; j++) {
+			if ((_y >= edge) && (_y <= edge + dy)) return j;
+			edge += dy;
+		}
+
+		return -1;	// outside the local grid (ghosts included) //
+	}
+
+	template< typename T, memType mem >
+	int wstGrid3d<T, mem>::locate_inc_z(const T _z) const
+	{
+		// scan all local -z cells (ghosts included); cell k spans
+		// [pz[k] - dzh[k], pz[k] + dzh[k]] around its center pz[k]
+		for (int k = 0; k < nz; k++) {
+			const T zmin = pz[k] - dzh[k];
+			const T zmax = pz[k] + dzh[k];
+			if ((_z >= zmin) && (_z <= zmax)) return k;
+		}
+
+		return -1;	// outside the local grid (ghosts included) //
+	}
+
+	template< typename T, memType mem >
+	T wstGrid3d< T, mem > ::c_interp(const T* X, const T _px, const T _py, const T _pz) const
+	{
+		// trilinear interpolation of a cell-centered (C node) field at (_px,_py,_pz);
+		// returns 0 when the point is outside this process' local domain
+		int i = locate_x(_px), j = locate_y(_py), k = locate_z(_pz);
+		T p_value = (T) 0.0;
+
+		if ((i >= 0) && (j >= 0) && (k >= 0)) {
+
+			// pick the lower-left-bottom node of the interpolation cube and
+			// the normalized offsets within it (z uses the stretched spacing)
+			int ipos = (_px < px[i]) ? i - 1 : i;	// -x line setup //
+			T xinterp = (_px - px[ipos]) * dxi;
+
+			int jpos = (_py < py[j]) ? j - 1 : j;	// -y line setup //
+			T yinterp = (_py - py[jpos]) * dyi;
+
+			int kpos = (_pz < pz[k]) ? k - 1 : k;	// -z line setup //
+			T zinterp = (_pz - pz[kpos]) * (T)2.0 * dzpi[kpos];
+
+			int index = ipos * nyz + jpos * nz + kpos;
+
+			p_value = interp_trilinear(xinterp, yinterp, zinterp,
+				X[index], X[index + nyz],
+				X[index + nz], X[index + nyz + nz],
+				X[index + 1], X[index + nyz + 1],
+				X[index + nz + 1], X[index + nyz + nz + 1]);
+		}
+
+		return p_value;
+	}
+
+	template< typename T, memType mem >
+	T wstGrid3d< T, mem > ::u_interp(const T* U, const T _px, const T _py, const T _pz) const
+	{
+		// trilinear interpolation of a U node (x-face) field at (_px,_py,_pz);
+		// returns 0 when the point is outside this process' local domain
+		int i = locate_x(_px), j = locate_y(_py), k = locate_z(_pz);
+		T p_value = (T) 0.0;
+
+		if ((i >= 0) && (j >= 0) && (k >= 0)) {
+
+			// -x offset measured from the cell's west face (px - dxh)
+			int ipos = i;							// -x line setup //
+			T xinterp = (_px - (px[ipos] - dxh)) * dxi;
+
+			int jpos = (_py < py[j]) ? j - 1 : j;	// -y line setup //
+			T yinterp = (_py - py[jpos]) * dyi;
+
+			int kpos = (_pz < pz[k]) ? k - 1 : k;	// -z line setup //
+			T zinterp = (_pz - pz[kpos]) * (T)2.0 * dzpi[kpos];
+
+			int index = ipos * nyz + jpos * nz + kpos;
+
+			p_value = interp_trilinear(xinterp, yinterp, zinterp,
+				U[index], U[index + nyz],
+				U[index + nz], U[index + nyz + nz],
+				U[index + 1], U[index + nyz + 1],
+				U[index + nz + 1], U[index + nyz + nz + 1]);
+		}
+
+		return p_value;
+	}
+
+	template< typename T, memType mem >
+	T wstGrid3d< T, mem > ::v_interp(const T* V, const T _px, const T _py, const T _pz) const
+	{
+		// trilinear interpolation of a V node (y-face) field at (_px,_py,_pz);
+		// returns 0 when the point is outside this process' local domain
+		int i = locate_x(_px), j = locate_y(_py), k = locate_z(_pz);
+		T p_value = (T) 0.0;
+
+		if ((i >= 0) && (j >= 0) && (k >= 0)) {
+
+			int ipos = (_px < px[i]) ? i - 1 : i;	// -x line setup //
+			T xinterp = (_px - px[ipos]) * dxi;
+
+			// -y offset measured from the cell's south face (py - dyh)
+			int jpos = j;							// -y line setup //
+			T yinterp = (_py - (py[jpos] - dyh)) * dyi;
+
+			int kpos = (_pz < pz[k]) ? k - 1 : k;	// -z line setup //
+			T zinterp = (_pz - pz[kpos]) * (T)2.0 * dzpi[kpos];
+
+			int index = ipos * nyz + jpos * nz + kpos;
+
+			p_value = interp_trilinear(xinterp, yinterp, zinterp,
+				V[index], V[index + nyz],
+				V[index + nz], V[index + nyz + nz],
+				V[index + 1], V[index + nyz + 1],
+				V[index + nz + 1], V[index + nyz + nz + 1]);
+		}
+
+		return p_value;
+	}
+
+	template< typename T, memType mem >
+	T wstGrid3d< T, mem > ::w_interp(const T* W, const T _px, const T _py, const T _pz) const
+	{
+		// trilinear interpolation of a W node (z-face) field at (_px,_py,_pz);
+		// returns 0 when the point is outside this process' local domain
+		int i = locate_x(_px), j = locate_y(_py), k = locate_z(_pz);
+		T p_value = (T) 0.0;
+
+		if ((i >= 0) && (j >= 0) && (k >= 0)) {
+
+			int ipos = (_px < px[i]) ? i - 1 : i;	// -x line setup //
+			T xinterp = (_px - px[ipos]) * dxi;
+
+			int jpos = (_py < py[j]) ? j - 1 : j;	// -y line setup //
+			T yinterp = (_py - py[jpos]) * dyi;
+
+			// -z offset measured from the cell's bottom face, normalized by
+			// the local stretched step dz[k] (via dzi[k])
+			int kpos = k;							// -z line setup //
+			T zinterp = (_pz - (pz[kpos] - dzh[kpos])) * dzi[kpos];
+
+			int index = ipos * nyz + jpos * nz + kpos;
+
+			p_value = interp_trilinear(xinterp, yinterp, zinterp,
+				W[index], W[index + nyz],
+				W[index + nz], W[index + nyz + nz],
+				W[index + 1], W[index + nyz + 1],
+				W[index + nz + 1], W[index + nyz + nz + 1]);
+		}
+
+		return p_value;
+	}
+
+	template< typename T, memType mem >
+	T wstGrid3d< T, mem > ::u_interp_local(const T* _RESTRICT const U, 
+		const T _px, const T _py, const T _pz,
+		const int i, const int j, const int k) const
+	{
+		// [unsafe] single-process U node interpolation: the caller guarantees
+		// (i,j,k) is the cell containing (_px,_py,_pz) — no bounds checks here
+		int ipos = i;							// -x line setup //
+		T xinterp = (_px - (px[ipos] - dxh)) * dxi;
+
+		int jpos = (_py < py[j]) ? j - 1 : j;	// -y line setup //
+		T yinterp = (_py - py[jpos]) * dyi;
+
+		int kpos = (_pz < pz[k]) ? k - 1 : k;	// -z line setup //
+		T zinterp = (_pz - pz[kpos]) * (T)2.0 * dzpi[kpos];
+
+		int index = ipos * nyz + jpos * nz + kpos;
+
+		return interp_trilinear(xinterp, yinterp, zinterp,
+			U[index], U[index + nyz],
+			U[index + nz], U[index + nyz + nz],
+			U[index + 1], U[index + nyz + 1],
+			U[index + nz + 1], U[index + nyz + nz + 1]);
+	}
+
+	template< typename T, memType mem >
+	T wstGrid3d< T, mem > ::v_interp_local(const T* _RESTRICT const V, 
+		const T _px, const T _py, const T _pz,
+		const int i, const int j, const int k) const
+	{
+		// [unsafe] single-process V node interpolation — same contract as
+		// u_interp_local, with the face offset applied along -y instead
+		int ipos = (_px < px[i]) ? i - 1 : i;	// -x line setup //
+		T xinterp = (_px - px[ipos]) * dxi;
+
+		int jpos = j;							// -y line setup //
+		T yinterp = (_py - (py[jpos] - dyh)) * dyi;
+
+		int kpos = (_pz < pz[k]) ? k - 1 : k;	// -z line setup //
+		T zinterp = (_pz - pz[kpos]) * (T)2.0 * dzpi[kpos];
+
+		int index = ipos * nyz + jpos * nz + kpos;
+
+		return interp_trilinear(xinterp, yinterp, zinterp,
+			V[index], V[index + nyz],
+			V[index + nz], V[index + nyz + nz],
+			V[index + 1], V[index + nyz + 1],
+			V[index + nz + 1], V[index + nyz + nz + 1]);
+	}
+
+	template< typename T, memType mem >
+	T wstGrid3d< T, mem > ::w_interp_local(const T* _RESTRICT const W, 
+		const T _px, const T _py, const T _pz,
+		const int i, const int j, const int k) const
+	{
+		// [unsafe] single-process W node interpolation — the -z offset is
+		// measured from the cell's bottom face using the stretched step dz[k]
+		int ipos = (_px < px[i]) ? i - 1 : i;	// -x line setup //
+		T xinterp = (_px - px[ipos]) * dxi;
+
+		int jpos = (_py < py[j]) ? j - 1 : j;	// -y line setup //
+		T yinterp = (_py - py[jpos]) * dyi;
+
+		int kpos = k;							// -z line setup //
+		T zinterp = (_pz - (pz[kpos] - dzh[kpos])) * dzi[kpos];
+
+		int index = ipos * nyz + jpos * nz + kpos;
+
+		return interp_trilinear(xinterp, yinterp, zinterp,
+			W[index], W[index + nyz],
+			W[index + nz], W[index + nyz + nz],
+			W[index + 1], W[index + nyz + 1],
+			W[index + nz + 1], W[index + nyz + nz + 1]);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem >::u_interp_local(T* _RESTRICT uinterp, const T* _RESTRICT const U,
+		const T* _RESTRICT const _px, const T* _RESTRICT const _py, const T* _RESTRICT const _pz,
+		const int* _RESTRICT const i, const int* _RESTRICT const j, const int* _RESTRICT const k, const int n) const
+	{
+		int m;
+		int ipos, jpos, kpos, index;
+		T xinterp, yinterp, zinterp;
+
+#pragma omp parallel for private( m, ipos, jpos, kpos, index, xinterp, yinterp, zinterp ) shared(uinterp)
+		for (m = 0; m < n; m++)
+		{
+			ipos = i[m];									// -x line setup //
+			xinterp = (_px[m] - (px[ipos] - dxh)) * dxi;
+
+			jpos = (_py[m] < py[j[m]]) ? j[m] - 1 : j[m];	// -y line setup //
+			yinterp = (_py[m] - py[jpos]) * dyi;
+
+			kpos = (_pz[m] < pz[k[m]]) ? k[m] - 1 : k[m];	// -z line setup //
+			zinterp = (_pz[m] - pz[kpos]) * (T)2.0 * dzpi[kpos];
+
+			index = ipos * nyz + jpos * nz + kpos;
+
+			uinterp[m] = interp_trilinear(xinterp, yinterp, zinterp,
+				U[index], U[index + nyz],
+				U[index + nz], U[index + nyz + nz],
+				U[index + 1], U[index + nyz + 1],
+				U[index + nz + 1], U[index + nyz + nz + 1]);
+		}
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem >::v_interp_local(T* _RESTRICT vinterp, const T* _RESTRICT const V,
+		const T* _RESTRICT const _px, const T* _RESTRICT const _py, const T* _RESTRICT const _pz,
+		const int* _RESTRICT const i, const int* _RESTRICT const j, const int* _RESTRICT const k, const int n) const
+	{
+		// Trilinear interpolation of the V (y-face) field to n local points;
+		// mirrors u_interp_local with the -y direction face-centered instead of -x.
+		int m;
+		int ipos, jpos, kpos, index;
+		T xinterp, yinterp, zinterp;
+
+#pragma omp parallel for private( m, ipos, jpos, kpos, index, xinterp, yinterp, zinterp ) shared(vinterp)
+		for (m = 0; m < n; m++)
+		{
+			ipos = (_px[m] < px[i[m]]) ? i[m] - 1 : i[m];	// -x line setup: snap to lower neighboring center //
+			xinterp = (_px[m] - px[ipos]) * dxi;
+
+			jpos = j[m];									// -y line setup: V lives on y-faces, py[jpos] - dyh is the face coordinate //
+			yinterp = (_py[m] - (py[jpos] - dyh)) * dyi;
+
+			kpos = (_pz[m] < pz[k[m]]) ? k[m] - 1 : k[m];	// -z line setup (stretched grid) //
+			zinterp = (_pz[m] - pz[kpos]) * (T)2.0 * dzpi[kpos];
+
+			index = ipos * nyz + jpos * nz + kpos;
+
+			vinterp[m] = interp_trilinear(xinterp, yinterp, zinterp,
+				V[index], V[index + nyz],
+				V[index + nz], V[index + nyz + nz],
+				V[index + 1], V[index + nyz + 1],
+				V[index + nz + 1], V[index + nyz + nz + 1]);
+		}
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem >::w_interp_local(T* _RESTRICT winterp, const T* _RESTRICT const W,
+		const T* _RESTRICT const _px, const T* _RESTRICT const _py, const T* _RESTRICT const _pz,
+		const int* _RESTRICT const i, const int* _RESTRICT const j, const int* _RESTRICT const k, const int n) const
+	{
+		// Trilinear interpolation of the W (z-face) field to n local points;
+		// mirrors u_interp_local with the -z direction face-centered instead of -x.
+		int m;
+		int ipos, jpos, kpos, index;
+		T xinterp, yinterp, zinterp;
+
+#pragma omp parallel for private( m, ipos, jpos, kpos, index, xinterp, yinterp, zinterp ) shared(winterp)
+		for (m = 0; m < n; m++)
+		{
+			ipos = (_px[m] < px[i[m]]) ? i[m] - 1 : i[m];	// -x line setup: snap to lower neighboring center //
+			xinterp = (_px[m] - px[ipos]) * dxi;
+
+			jpos = (_py[m] < py[j[m]]) ? j[m] - 1 : j[m];	// -y line setup: snap to lower neighboring center //
+			yinterp = (_py[m] - py[jpos]) * dyi;
+
+			kpos = k[m];									// -z line setup: W lives on z-faces, pz[kpos] - dzh[kpos] is the face coordinate //
+			zinterp = (_pz[m] - (pz[kpos] - dzh[kpos])) * dzi[kpos];
+
+			index = ipos * nyz + jpos * nz + kpos;
+
+			winterp[m] = interp_trilinear(xinterp, yinterp, zinterp,
+				W[index], W[index + nyz],
+				W[index + nz], W[index + nyz + nz],
+				W[index + 1], W[index + nyz + 1],
+				W[index + nz + 1], W[index + nyz + nz + 1]);
+		}
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::c_profile_at_yz(T* Px, const T* X, const T _py, const T _pz) const
+	{
+		// Extract an -x profile of the cell-centered field X at fixed (_py, _pz).
+		// Only the rank whose subdomain interior contains the point interpolates;
+		// the result is then broadcast to all ranks of the -yz communicator.
+		int i, j = locate_y(_py), k = locate_z(_pz);
+		int index, host_rank = -1;
+
+		null(Px, nx);
+		if ((j >= gcy) && (j < ny - gcy) && (k >= gcz) && (k < nz - gcz)) {
+
+			int jpos = (_py < py[j]) ? j : j + 1;	// upper -y neighbor of the interpolation cell //
+			int kpos = (_pz < pz[k]) ? k : k + 1;	// upper -z neighbor //
+
+			// trilinear interpolation == bilinear interpolation ( _px == _cx )
+#pragma omp parallel for private( i, index ) shared( Px, jpos, kpos )
+			for (i = gcx; i < nx - gcx; i++) {
+				index = i * nyz + jpos * nz + kpos;
+
+				Px[i] = interp_bilinear(
+					_py, _pz, py[jpos - 1], pz[kpos - 1],
+					dy, dzh[kpos - 1] + dzh[kpos],
+					X[index - nz - 1], X[index - 1],
+					X[index - nz], X[index]);
+			}
+
+			MPI_Comm_rank(mpi_com.comm_yz, &host_rank);
+		}
+
+		// host_rank >= 0 only on the owning rank; MAX-reduce identifies it globally //
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_yz);
+		if (host_rank >= 0)
+			mpi_broadcast(Px, nx, host_rank, mpi_com.comm_yz);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::u_profile_at_yz(T* Px, const T* X, const T _py, const T _pz) const
+	{
+		// Extract an -x profile of the U (x-face) field X at fixed (_py, _pz);
+		// full trilinear interpolation since the profile samples fall between x-faces.
+		int i, j = locate_y(_py), k = locate_z(_pz);
+		int index, host_rank = -1;
+
+		null(Px, nx);
+		if ((j >= gcy) && (j < ny - gcy) && (k >= gcz) && (k < nz - gcz)) {
+
+			int jpos = (_py < py[j]) ? j : j + 1;	// upper -y neighbor of the interpolation cell //
+			int kpos = (_pz < pz[k]) ? k : k + 1;	// upper -z neighbor //
+
+#pragma omp parallel for private( i, index ) shared( Px, jpos, kpos )
+			for (i = gcx; i < nx - gcx; i++) {
+				index = i * nyz + jpos * nz + kpos;
+
+				Px[i] = interp_trilinear(
+					px[i], _py, _pz, px[i] - dxh, py[jpos - 1], pz[kpos - 1],
+					dx, dy, dzh[kpos - 1] + dzh[kpos],
+					X[index - nz - 1], X[index + nyz - nz - 1],
+					X[index - 1], X[index + nyz - 1],
+					X[index - nz], X[index + nyz - nz],
+					X[index], X[index + nyz]);
+			}
+
+			MPI_Comm_rank(mpi_com.comm_yz, &host_rank);
+		}
+
+		// locate the owning rank and broadcast its profile //
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_yz);
+		if (host_rank >= 0)
+			mpi_broadcast(Px, nx, host_rank, mpi_com.comm_yz);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::v_profile_at_yz(T* Px, const T* X, const T _py, const T _pz) const
+	{
+		// Extract an -x profile of the V (y-face) field X at fixed (_py, _pz).
+		int i, j = locate_y(_py), k = locate_z(_pz);
+		int index, host_rank = -1;
+
+		null(Px, nx);
+		if ((j >= gcy) && (j < ny - gcy) && (k >= gcz) && (k < nz - gcz)) {
+
+			int jpos = j;							// V is face-centered in -y: interpolate within cell j //
+			int kpos = (_pz < pz[k]) ? k : k + 1;	// upper -z neighbor //
+
+			// trilinear interpolation == bilinear interpolation ( _px == _cx )
+#pragma omp parallel for private( i, index ) shared( Px, jpos, kpos )
+			for (i = gcx; i < nx - gcx; i++) {
+				index = i * nyz + jpos * nz + kpos;
+
+				Px[i] = interp_bilinear(
+					_py, _pz, py[jpos] - dyh, pz[kpos - 1], dy, dzh[kpos - 1] + dzh[kpos],
+					X[index - 1], X[index + nz - 1],
+					X[index], X[index + nz]);
+			}
+
+			MPI_Comm_rank(mpi_com.comm_yz, &host_rank);
+		}
+
+		// locate the owning rank and broadcast its profile //
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_yz);
+		if (host_rank >= 0)
+			mpi_broadcast(Px, nx, host_rank, mpi_com.comm_yz);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::w_profile_at_yz(T* Px, const T* X, const T _py, const T _pz) const
+	{
+		// Extract an -x profile of the W (z-face) field X at fixed (_py, _pz).
+		int i, j = locate_y(_py), k = locate_z(_pz);
+		int index, host_rank = -1;
+
+		null(Px, nx);
+		if ((j >= gcy) && (j < ny - gcy) && (k >= gcz) && (k < nz - gcz)) {
+
+			int jpos = (_py < py[j]) ? j : j + 1;	// upper -y neighbor //
+			int kpos = k;							// W is face-centered in -z: interpolate within cell k //
+
+			// trilinear interpolation == bilinear interpolation ( _px == _cx )
+#pragma omp parallel for private( i, index ) shared( Px, jpos, kpos )
+			for (i = gcx; i < nx - gcx; i++) {
+				index = i * nyz + jpos * nz + kpos;
+
+				Px[i] = interp_bilinear(
+					_py, _pz, py[jpos - 1], pz[kpos] - dzh[kpos], dy, dz[kpos],
+					X[index - nz], X[index],
+					X[index - nz + 1], X[index + 1]);
+			}
+
+			MPI_Comm_rank(mpi_com.comm_yz, &host_rank);
+		}
+
+		// locate the owning rank and broadcast its profile //
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_yz);
+		if (host_rank >= 0)
+			mpi_broadcast(Px, nx, host_rank, mpi_com.comm_yz);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::c_profile_at_xz(T* Py, const T* X, const T _px, const T _pz) const
+	{
+		// Extract a -y profile of the cell-centered field X at fixed (_px, _pz);
+		// owning rank interpolates, result broadcast over the -xz communicator.
+		int i = locate_x(_px), j, k = locate_z(_pz);
+		int index, host_rank = -1;
+
+		null(Py, ny);
+		if ((i >= gcx) && (i < nx - gcx) && (k >= gcz) && (k < nz - gcz)) {
+
+			int ipos = (_px < px[i]) ? i : i + 1;	// upper -x neighbor //
+			int kpos = (_pz < pz[k]) ? k : k + 1;	// upper -z neighbor //
+
+			// trilinear interpolation == bilinear interpolation ( _py == _cy )
+#pragma omp parallel for private( j, index ) shared( Py, ipos, kpos )
+			for (j = gcy; j < ny - gcy; j++) {
+				index = ipos * nyz + j * nz + kpos;
+
+				Py[j] = interp_bilinear(
+					_px, _pz, px[ipos - 1], pz[kpos - 1], dx, dzh[kpos - 1] + dzh[kpos],
+					X[index - nyz - 1], X[index - 1],
+					X[index - nyz], X[index]);
+			}
+
+			MPI_Comm_rank(mpi_com.comm_xz, &host_rank);
+		}
+
+		// locate the owning rank and broadcast its profile //
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_xz);
+		if (host_rank >= 0)
+			mpi_broadcast(Py, ny, host_rank, mpi_com.comm_xz);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::u_profile_at_xz(T* Py, const T* X, const T _px, const T _pz) const
+	{
+		// Extract a -y profile of the U (x-face) field X at fixed (_px, _pz).
+		int i = locate_x(_px), j, k = locate_z(_pz);
+		int index, host_rank = -1;
+
+		null(Py, ny);
+		if ((i >= gcx) && (i < nx - gcx) && (k >= gcz) && (k < nz - gcz)) {
+
+			int ipos = i;							// U is face-centered in -x: interpolate within cell i //
+			int kpos = (_pz < pz[k]) ? k : k + 1;	// upper -z neighbor //
+
+			// trilinear interpolation == bilinear interpolation ( _py == _cy )
+#pragma omp parallel for private( j, index ) shared( Py, ipos, kpos )
+			for (j = gcy; j < ny - gcy; j++) {
+				index = ipos * nyz + j * nz + kpos;
+
+				Py[j] = interp_bilinear(
+					_px, _pz, px[ipos] - dxh, pz[kpos - 1], dx, dzh[kpos - 1] + dzh[kpos],
+					X[index - 1], X[index + nyz - 1],
+					X[index], X[index + nyz]);
+			}
+
+			MPI_Comm_rank(mpi_com.comm_xz, &host_rank);
+		}
+
+		// locate the owning rank and broadcast its profile //
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_xz);
+		if (host_rank >= 0)
+			mpi_broadcast(Py, ny, host_rank, mpi_com.comm_xz);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::v_profile_at_xz(T* Py, const T* X, const T _px, const T _pz) const
+	{
+		// Extract a -y profile of the V (y-face) field X at fixed (_px, _pz);
+		// full trilinear interpolation since the samples fall between y-faces.
+		int i = locate_x(_px), j, k = locate_z(_pz);
+		int index, host_rank = -1;
+
+		null(Py, ny);
+		if ((i >= gcx) && (i < nx - gcx) && (k >= gcz) && (k < nz - gcz)) {
+
+			int ipos = (_px < px[i]) ? i : i + 1;	// upper -x neighbor //
+			int kpos = (_pz < pz[k]) ? k : k + 1;	// upper -z neighbor //
+
+#pragma omp parallel for private( j, index ) shared( Py, ipos, kpos )
+			for (j = gcy; j < ny - gcy; j++) {
+				index = ipos * nyz + j * nz + kpos;
+
+				Py[j] = interp_trilinear(
+					_px, py[j], _pz, px[ipos - 1], py[j] - dyh, pz[kpos - 1],
+					dx, dy, dzh[kpos - 1] + dzh[kpos],
+					X[index - nyz - 1], X[index - 1],
+					X[index - nyz + nz - 1], X[index + nz - 1],
+					X[index - nyz], X[index],
+					X[index - nyz + nz], X[index + nz]);
+			}
+
+			MPI_Comm_rank(mpi_com.comm_xz, &host_rank);
+		}
+
+		// locate the owning rank and broadcast its profile //
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_xz);
+		if (host_rank >= 0)
+			mpi_broadcast(Py, ny, host_rank, mpi_com.comm_xz);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::w_profile_at_xz(T* Py, const T* X, const T _px, const T _pz) const
+	{
+		// Extract a -y profile of the W (z-face) field X at fixed (_px, _pz).
+		int i = locate_x(_px), j, k = locate_z(_pz);
+		int index, host_rank = -1;
+
+		null(Py, ny);
+		if ((i >= gcx) && (i < nx - gcx) && (k >= gcz) && (k < nz - gcz)) {
+
+			int ipos = (_px < px[i]) ? i : i + 1;	// upper -x neighbor //
+			int kpos = k;							// W is face-centered in -z: interpolate within cell k //
+
+			// trilinear interpolation == bilinear interpolation ( _py == _cy )
+#pragma omp parallel for private( j, index ) shared( Py, ipos, kpos )
+			for (j = gcy; j < ny - gcy; j++) {
+				index = ipos * nyz + j * nz + kpos;
+
+				Py[j] = interp_bilinear(
+					_px, _pz, px[ipos - 1], pz[kpos] - dzh[kpos], dx, dz[kpos],
+					X[index - nyz], X[index],
+					X[index - nyz + 1], X[index + 1]);
+			}
+
+			MPI_Comm_rank(mpi_com.comm_xz, &host_rank);
+		}
+
+		// locate the owning rank and broadcast its profile //
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_xz);
+		if (host_rank >= 0)
+			mpi_broadcast(Py, ny, host_rank, mpi_com.comm_xz);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::c_profile_at_xy(T* Pz, const T* X, const T _px, const T _py) const
+	{
+		// Extract a -z profile of the cell-centered field X at fixed (_px, _py);
+		// owning rank interpolates, result broadcast over the -xy communicator.
+		int i = locate_x(_px), j = locate_y(_py), k;
+		int index, host_rank = -1;
+
+		null(Pz, nz);
+		if ((i >= gcx) && (i < nx - gcx) && (j >= gcy) && (j < ny - gcy)) {
+
+			int ipos = (_px < px[i]) ? i : i + 1;	// upper -x neighbor //
+			int jpos = (_py < py[j]) ? j : j + 1;	// upper -y neighbor //
+
+			// trilinear interpolation == bilinear interpolation ( _pz == _cz )
+#pragma omp parallel for private( k, index ) shared( Pz, ipos, jpos )
+			for (k = gcz; k < nz - gcz; k++) {
+				index = ipos * nyz + jpos * nz + k;
+
+				Pz[k] = interp_bilinear(
+					_px, _py, px[ipos - 1], py[jpos - 1], dx, dy,
+					X[index - nyz - nz], X[index - nz],
+					X[index - nyz], X[index]);
+			}
+
+			MPI_Comm_rank(mpi_com.comm_xy, &host_rank);
+		}
+
+		// locate the owning rank and broadcast its profile //
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_xy);
+		if (host_rank >= 0)
+			mpi_broadcast(Pz, nz, host_rank, mpi_com.comm_xy);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::u_profile_at_xy(T* Pz, const T* X, const T _px, const T _py) const
+	{
+		// Extract a -z profile of the U (x-face) field X at fixed (_px, _py).
+		int i = locate_x(_px), j = locate_y(_py), k;
+		int index, host_rank = -1;
+
+		null(Pz, nz);
+		if ((i >= gcx) && (i < nx - gcx) && (j >= gcy) && (j < ny - gcy)) {
+
+			int ipos = i;							// U is face-centered in -x: interpolate within cell i //
+			int jpos = (_py < py[j]) ? j : j + 1;	// upper -y neighbor //
+
+			// trilinear interpolation == bilinear interpolation ( _pz == _cz )
+#pragma omp parallel for private( k, index ) shared( Pz, ipos, jpos )
+			for (k = gcz; k < nz - gcz; k++) {
+				index = ipos * nyz + jpos * nz + k;
+
+				Pz[k] = interp_bilinear(
+					_px, _py, px[ipos] - dxh, py[jpos - 1], dx, dy,
+					X[index - nz], X[index + nyz - nz],
+					X[index], X[index + nyz]);
+			}
+
+			MPI_Comm_rank(mpi_com.comm_xy, &host_rank);
+		}
+
+		// locate the owning rank and broadcast its profile //
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_xy);
+		if (host_rank >= 0)
+			mpi_broadcast(Pz, nz, host_rank, mpi_com.comm_xy);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::v_profile_at_xy(T* Pz, const T* X, const T _px, const T _py) const
+	{
+		// Extract a -z profile of the V (y-face) field X at fixed (_px, _py).
+		int i = locate_x(_px), j = locate_y(_py), k;
+		int index, host_rank = -1;
+
+		null(Pz, nz);
+		if ((i >= gcx) && (i < nx - gcx) && (j >= gcy) && (j < ny - gcy)) {
+
+			int ipos = (_px < px[i]) ? i : i + 1;	// upper -x neighbor //
+			int jpos = j;							// V is face-centered in -y: interpolate within cell j //
+
+			// trilinear interpolation == bilinear interpolation ( _pz == _cz )
+#pragma omp parallel for private( k, index ) shared( Pz, ipos, jpos )
+			for (k = gcz; k < nz - gcz; k++) {
+				index = ipos * nyz + jpos * nz + k;
+
+				Pz[k] = interp_bilinear(
+					_px, _py, px[ipos - 1], py[jpos] - dyh, dx, dy,
+					X[index - nyz], X[index],
+					X[index - nyz + nz], X[index + nz]);
+			}
+
+			MPI_Comm_rank(mpi_com.comm_xy, &host_rank);
+		}
+
+		// locate the owning rank and broadcast its profile //
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_xy);
+		if (host_rank >= 0)
+			mpi_broadcast(Pz, nz, host_rank, mpi_com.comm_xy);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::w_profile_at_xy(T* Pz, const T* X, const T _px, const T _py) const
+	{
+		// Extract a -z profile of the W (z-face) field X at fixed (_px, _py);
+		// full trilinear interpolation since the samples fall between z-faces.
+		int i = locate_x(_px), j = locate_y(_py), k;
+		int index, host_rank = -1;
+
+		null(Pz, nz);
+		if ((i >= gcx) && (i < nx - gcx) && (j >= gcy) && (j < ny - gcy)) {
+
+			int ipos = (_px < px[i]) ? i : i + 1;	// upper -x neighbor //
+			int jpos = (_py < py[j]) ? j : j + 1;	// upper -y neighbor //
+
+#pragma omp parallel for private( k, index ) shared( Pz, ipos, jpos )
+			for (k = gcz; k < nz - gcz; k++) {
+				index = ipos * nyz + jpos * nz + k;
+
+				Pz[k] = interp_trilinear(
+					_px, _py, pz[k], px[ipos - 1], py[jpos - 1], pz[k] - dzh[k],
+					dx, dy, dz[k],
+					X[index - nyz - nz], X[index - nz],
+					X[index - nyz], X[index],
+					X[index - nyz - nz + 1], X[index - nz + 1],
+					X[index - nyz + 1], X[index + 1]);
+			}
+
+			MPI_Comm_rank(mpi_com.comm_xy, &host_rank);
+		}
+
+		// locate the owning rank and broadcast its profile //
+		mpi_allreduce(&host_rank, MPI_MAX, mpi_com.comm_xy);
+		if (host_rank >= 0)
+			mpi_broadcast(Pz, nz, host_rank, mpi_com.comm_xy);
+	}
+
+	template< typename T, memType mem >
+	template< nse_const3d::nodeType node, typename PType >
+	void wstGrid3d< T, mem > ::average_by_x(PType* _RESTRICT Pyz, const T* _RESTRICT const X) const
+	{
+		// Average X over the -x interior; the resulting -yz plane is stored in Pyz.
+		// Node types staggered in -y (-z) extend the output range by one layer.
+		const int shy = (
+			(node == nse_const3d::nodeV) ||
+			(node == nse_const3d::nodeUV) ||
+			(node == nse_const3d::nodeVW) ||
+			(node == nse_const3d::nodeUVW)) ? 1 : 0;
+		const int shz = (
+			(node == nse_const3d::nodeW) ||
+			(node == nse_const3d::nodeUW) ||
+			(node == nse_const3d::nodeVW) ||
+			(node == nse_const3d::nodeUVW)) ? 1 : 0;
+
+		// uniform -x grid: plain arithmetic mean over the global interior size //
+		const PType i_div_x = (PType)1.0 / (mpi_nx - 2 * gcx);
+
+		null(Pyz, ny * nz);
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for shared( Pyz ) collapse( 2 )
+#else
+#pragma omp parallel for shared( Pyz )
+#endif
+		for (int j = gcy; j < ny - gcy + shy; j++) {
+			for (int k = gcz; k < nz - gcz + shz; k++) {
+
+				PType line_sum = (PType)0;
+				int idx = gcx * nyz + j * nz + k;
+				for (int i = gcx; i < nx - gcx; i++, idx += nyz)
+					line_sum += (PType)X[idx];
+
+				Pyz[j * nz + k] = line_sum * i_div_x;
+			}
+		}
+
+		// combine partial sums from the -x decomposed subdomains //
+		mpi_allreduce_vec(Pyz, ny * nz, MPI_SUM, mpi_com.comm_x);
+	}
+
+	template< typename T, memType mem >
+	template< nse_const3d::nodeType node, typename PType >
+	void wstGrid3d< T, mem > ::average_by_y(PType* _RESTRICT Pxz, const T* _RESTRICT const X) const
+	{
+		// Average X over the -y interior; the resulting -xz plane is stored in Pxz.
+		// Node types staggered in -x (-z) extend the output range by one layer.
+		const int shx = (
+			(node == nse_const3d::nodeU) ||
+			(node == nse_const3d::nodeUV) ||
+			(node == nse_const3d::nodeUW) ||
+			(node == nse_const3d::nodeUVW)) ? 1 : 0;
+		const int shz = (
+			(node == nse_const3d::nodeW) ||
+			(node == nse_const3d::nodeUW) ||
+			(node == nse_const3d::nodeVW) ||
+			(node == nse_const3d::nodeUVW)) ? 1 : 0;
+
+		// uniform -y grid: plain arithmetic mean over the global interior size //
+		const PType i_div_y = (PType)1.0 / (mpi_ny - 2 * gcy);
+
+		null(Pxz, nx * nz);
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for shared( Pxz ) collapse( 2 )
+#else
+#pragma omp parallel for shared( Pxz )
+#endif
+		for (int i = gcx; i < nx - gcx + shx; i++) {
+			for (int k = gcz; k < nz - gcz + shz; k++) {
+
+				PType line_sum = (PType)0;
+				int idx = i * nyz + gcy * nz + k;
+				for (int j = gcy; j < ny - gcy; j++, idx += nz)
+					line_sum += (PType)X[idx];
+
+				Pxz[i * nz + k] = line_sum * i_div_y;
+			}
+		}
+
+		// combine partial sums from the -y decomposed subdomains //
+		mpi_allreduce_vec(Pxz, nx * nz, MPI_SUM, mpi_com.comm_y);
+	}
+
+	template< typename T, memType mem >
+	template< nse_const3d::nodeType node, typename PType >
+	void wstGrid3d< T, mem > ::average_by_z(PType* _RESTRICT Pxy, const T* _RESTRICT const X) const
+	{
+		// Average X over the -z interior; the resulting -xy plane is stored in Pxy.
+		// The -z grid is stretched, so the sum is dz-weighted and normalized by
+		// the full domain height. Staggered nodes extend the output by one layer.
+		const int shx = (
+			(node == nse_const3d::nodeU) ||
+			(node == nse_const3d::nodeUV) ||
+			(node == nse_const3d::nodeUW) ||
+			(node == nse_const3d::nodeUVW)) ? 1 : 0;
+		const int shy = (
+			(node == nse_const3d::nodeV) ||
+			(node == nse_const3d::nodeUV) ||
+			(node == nse_const3d::nodeVW) ||
+			(node == nse_const3d::nodeUVW)) ? 1 : 0;
+
+		const PType i_div_z = (PType)1.0 / (PType)mpi_height;
+
+		null(Pxy, nx * ny);
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for shared( Pxy ) collapse( 2 )
+#else
+#pragma omp parallel for shared( Pxy )
+#endif
+		for (int i = gcx; i < nx - gcx + shx; i++) {
+			for (int j = gcy; j < ny - gcy + shy; j++) {
+
+				PType column_sum = (PType)0;
+				int idx = i * nyz + j * nz + gcz;
+				for (int k = gcz; k < nz - gcz; k++, idx++)
+					column_sum += (PType)X[idx] * (PType)dz[k];
+
+				Pxy[i * ny + j] = column_sum * i_div_z;
+			}
+		}
+
+		// combine partial sums from the -z decomposed subdomains //
+		mpi_allreduce_vec(Pxy, nx * ny, MPI_SUM, mpi_com.comm_z);
+	}
+
+	template< typename T, memType mem >
+	template< nse_const3d::nodeType node, typename PType >
+	void wstGrid3d< T, mem > ::average_by_yz(PType* _RESTRICT Px, const T* _RESTRICT const X) const
+	{
+		// Average X over the -yz interior; the resulting -x profile is stored in Px.
+		// The -z sum is dz-weighted (stretched grid); normalization combines the
+		// domain height and the global -y interior size.
+		const int shx = (
+			(node == nse_const3d::nodeU) ||
+			(node == nse_const3d::nodeUV) ||
+			(node == nse_const3d::nodeUW) ||
+			(node == nse_const3d::nodeUVW)) ? 1 : 0;
+
+		const PType i_div_yz = ((PType)1.0 / mpi_height) *
+			((PType)1.0 / (mpi_ny - 2 * gcy));
+
+		null(Px, nx);
+
+#pragma omp parallel for shared( Px )
+		for (int i = gcx; i < nx - gcx + shx; i++) {
+
+			PType plane_sum = (PType)0;
+			int row = i * nyz + gcy * nz;
+			for (int j = gcy; j < ny - gcy; j++, row += nz)
+				for (int k = gcz; k < nz - gcz; k++)
+					plane_sum += (PType)X[row + k] * (PType)dz[k];
+
+			Px[i] = plane_sum * i_div_yz;
+		}
+
+		// combine partial sums from the -yz decomposed subdomains //
+		mpi_allreduce_vec(Px, nx, MPI_SUM, mpi_com.comm_yz);
+	}
+
+	template< typename T, memType mem >
+	template< nse_const3d::nodeType node, typename PType >
+	void wstGrid3d< T, mem > ::average_by_xz(PType* _RESTRICT Py, const T* _RESTRICT const X) const
+	{
+		// Average X over the -xz interior; the resulting -y profile is stored in Py.
+		// The -z sum is dz-weighted (stretched grid); normalization combines the
+		// domain height and the global -x interior size.
+		const int shy = (
+			(node == nse_const3d::nodeV) ||
+			(node == nse_const3d::nodeUV) ||
+			(node == nse_const3d::nodeVW) ||
+			(node == nse_const3d::nodeUVW)) ? 1 : 0;
+
+		const PType i_div_xz = ((PType)1.0 / mpi_height) *
+			((PType)1.0 / (mpi_nx - 2 * gcx));
+
+		null(Py, ny);
+
+#pragma omp parallel for shared( Py )
+		for (int j = gcy; j < ny - gcy + shy; j++) {
+
+			PType plane_sum = (PType)0;
+			int row = gcx * nyz + j * nz;
+			for (int i = gcx; i < nx - gcx; i++, row += nyz)
+				for (int k = gcz; k < nz - gcz; k++)
+					plane_sum += (PType)X[row + k] * (PType)dz[k];
+
+			Py[j] = plane_sum * i_div_xz;
+		}
+
+		// combine partial sums from the -xz decomposed subdomains //
+		mpi_allreduce_vec(Py, ny, MPI_SUM, mpi_com.comm_xz);
+	}
+
+	template< typename T, memType mem >
+	template< nse_const3d::nodeType node, typename PType >
+	void wstGrid3d< T, mem > ::average_by_xy(PType* _RESTRICT Pz, const T* _RESTRICT const X) const
+	{
+		// Average X over the -xy interior; the resulting -z profile is stored in Pz.
+		// Two OpenMP reduction strategies are selected at compile time:
+		//  - default: blocked per-thread partial sums merged in a critical section;
+		//  - USE_OMP_PAR_REDUCE_IN_AVG_XY: one shared per-thread buffer of size nz.
+		const int shz = (
+			(node == nse_const3d::nodeW) ||
+			(node == nse_const3d::nodeUW) || 
+			(node == nse_const3d::nodeVW) || 
+			(node == nse_const3d::nodeUVW)) ? 1 : 0;
+
+		const PType i_div_xy = (PType)1.0 / ((mpi_nx - 2 * gcx) * (mpi_ny - 2 * gcy));
+		int i, j, k, index;
+
+#ifndef USE_OMP_PAR_REDUCE_IN_AVG_XY
+		// process the -z axis in blocks so each thread's partial sums fit on the stack //
+		const int kblock_size = 128;
+
+#pragma omp parallel private( i, j, k, index ) shared( Pz )
+		{
+			PType Pz_local[kblock_size];	// per-thread partial sums for the current -z block //
+			int kbeg, kend;
+
+#pragma omp for
+			for (k = 0; k < nz; k++)
+				Pz[k] = (PType)0;
+
+			for (kbeg = gcz; kbeg < nz - gcz + shz; kbeg += kblock_size) {
+				kend = kbeg + kblock_size - 1;
+				if (kend > nz - gcz + shz - 1) kend = nz - gcz + shz - 1;
+
+				for (k = 0; k <= kend - kbeg; k++)
+					Pz_local[k] = (PType)0;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for nowait collapse( 2 )
+#else
+#pragma omp for nowait
+#endif
+				for (i = gcx; i < nx - gcx; i++) {
+					for (j = gcy; j < ny - gcy; j++) {
+						index = i * nyz + j * nz;
+						for (k = kbeg; k <= kend; k++)
+							Pz_local[k - kbeg] += (PType)X[index + k];
+					}
+				}
+
+				// merge this thread's block sums; critical section serializes writers //
+#pragma omp critical
+				for (k = kbeg; k <= kend; k++) {
+					Pz[k] += Pz_local[k - kbeg];
+				}
+
+			}
+
+			// all partial sums must be merged before normalization //
+#pragma omp barrier
+
+#pragma omp for nowait
+			for (k = gcz; k < nz - gcz + shz; k++)
+				Pz[k] *= i_div_xy;
+		}
+
+#else
+		// alternative: each thread accumulates into its own nz-sized slice of a shared buffer //
+		PType *Pz_shared;
+		int max_threads = omp_get_max_threads();
+		int buf_id = memStx::get_buf(&Pz_shared, nz * max_threads);
+
+		null(Pz, nz);
+		null(Pz_shared, nz * max_threads);
+
+#pragma omp parallel private(i, j, k, index) shared( Pz, Pz_shared )
+		{
+			PType pz_value;
+			int tid = omp_get_thread_num();
+			int nthreads = omp_get_num_threads();
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp for collapse( 2 )
+#else
+#pragma omp for
+#endif
+			for (i = gcx; i < nx - gcx; i++) {
+				for (j = gcy; j < ny - gcy; j++) {
+					index = i * nyz + j * nz;
+					for (k = gcz; k < nz - gcz + shz; k++) {
+						Pz_shared[tid * nz + k] += (PType)X[index + k];
+					}
+				}
+			}
+
+			// reduce the per-thread slices into Pz and normalize //
+#pragma omp for nowait
+			for (k = gcz; k < nz - gcz + shz; k++)
+			{
+				pz_value = (PType)0;
+				for (i = 0; i < nthreads; i++) {
+					pz_value += Pz_shared[i * nz + k];
+				}
+
+				Pz[k] = pz_value * i_div_xy;
+			}
+		}
+
+		memStx::free_buf(buf_id);
+#endif
+
+		// combine partial sums from the -xy decomposed subdomains //
+		mpi_allreduce_vec(Pz, nz, MPI_SUM, mpi_com.comm_xy);
+	}
+
+	template< typename T, memType mem >
+	template< typename PType >
+	PType wstGrid3d< T, mem > ::average(const T* _RESTRICT const X) const
+	{
+		// Global dz-weighted volume average of the cell-centered field X over the
+		// full MPI domain interior (uniform in -x, -y; stretched in -z).
+		const PType i_div_xyz = ((PType)1.0 / (PType)mpi_height) *
+			((PType)1.0 / ((mpi_nx - 2 * gcx) * (mpi_ny - 2 * gcy)));
+		int i, j, k, index;
+#ifndef USE_OPENMP_2D_CYCLE
+		int shidx;	// row base index, advanced incrementally to avoid re-multiplication //
+#endif
+
+		PType avg = (PType)0;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, index ) reduction( + : avg ) collapse( 2 )
+		for (i = gcx; i < nx - gcx; i++)
+		{
+			for (j = gcy; j < ny - gcy; j++)
+			{
+				index = i * nyz + j * nz + gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, index ) reduction( + : avg )
+		for (i = gcx; i < nx - gcx; i++)
+		{
+			shidx = i * nyz + gcy * nz + gcz;
+			for (j = gcy; j < ny - gcy; j++, shidx += nz)
+			{
+				index = shidx;
+#endif
+				for (k = gcz; k < nz - gcz; k++, index++) {
+					avg += (PType)X[index] * (PType)dz[k];
+				}
+			}
+		}
+
+		avg *= i_div_xyz;
+		// sum local subdomain contributions to the global mean //
+		return mpi_allreduce(avg, MPI_SUM, mpi_com.comm);
+	}
+
+	template< typename T, memType mem  >
+	template< typename PType, typename CType >
+	PType wstGrid3d< T, mem > ::average_ifeq(const T* _RESTRICT const X,
+		const CType* _RESTRICT const mask, const CType check) const
+	{
+		// Conditional volume average of X over cells where mask[index] == check:
+		// accumulates the cell-volume-weighted sum and the masked volume locally,
+		// then normalizes by the globally reduced volume.
+		int i, j, k, index;
+#ifndef USE_OPENMP_2D_CYCLE
+		int shidx;	// row base index, advanced incrementally to avoid re-multiplication //
+#endif
+
+		PType avg = (PType)0;
+		PType vol = (PType)0;
+
+#ifdef USE_OPENMP_2D_CYCLE
+#pragma omp parallel for private( i, j, k, index ) reduction( + : avg, vol ) collapse( 2 )
+		for (i = gcx; i < nx - gcx; i++)
+		{
+			for (j = gcy; j < ny - gcy; j++)
+			{
+				index = i * nyz + j * nz + gcz;
+#else
+#pragma omp parallel for private( i, j, k, shidx, index ) reduction( + : avg, vol )
+		for (i = gcx; i < nx - gcx; i++)
+		{
+			shidx = i * nyz + gcy * nz + gcz;
+			for (j = gcy; j < ny - gcy; j++, shidx += nz)
+			{
+				index = shidx;
+#endif
+				for (k = gcz; k < nz - gcz; k++, index++) {
+					if (mask[index] == check) {
+						// cast every factor to PType so avg and vol accumulate
+						// in the same precision (previously dx, dy were left in T) //
+						avg += (PType)X[index] * (PType)dz[k] * (PType)dx * (PType)dy;
+						vol += (PType)dz[k] * (PType)dx * (PType)dy;
+					}
+				}
+			}
+		}
+
+		// vol is reduced globally first so the local avg is normalized by the
+		// total masked volume; the final SUM then yields the global average //
+		mpi_allreduce(&vol, MPI_SUM, mpi_com.comm);
+		if (vol > (PType)0) avg = avg / vol;
+
+		return mpi_allreduce(avg, MPI_SUM, mpi_com.comm);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem >::grid_reinterp(T* Xout, const T* Xin,
+		const nse_const3d::nodeType node, const GridId< T >& id) const
+	{
+		// Re-interpolate Xin (defined on the grid described by id) onto this grid,
+		// writing the result to Xout; dispatches on the staggering node type.
+		// NOTE: only C, U, V, W nodes are handled; other node types are ignored.
+
+		// constructing new interpolation grid using GridId //
+		T _z, _height;
+		id.domain_dim(3, &_z, &_height);
+
+		T _ksi_z = id.domain_spec(0);	// -z stretching parameter of the source grid //
+
+		int _gcx, _gcy, _gcz, _nx, _ny, _nz;
+		id.grid_dim(1, &_nx, &_gcx);
+		id.grid_dim(2, &_ny, &_gcy);
+		id.grid_dim(3, &_nz, &_gcz);
+
+		int _cx = _nx - 2 * _gcx,
+			_cy = _ny - 2 * _gcy,
+			_cz = _nz - 2 * _gcz;
+
+		wstGrid3d< T, memCPU > grid_interp;
+		if (!grid_interp.set(
+			mpi_x, mpi_y, _z,
+			mpi_length, mpi_width, _height,
+			_ksi_z,
+
+			_cx, _cy, _cz,
+			_gcx, _gcy, _gcz,
+
+			mpi_com.size_x, mpi_com.size_y, mpi_com.size_z)) return;	// bail out if the source grid cannot be set up //
+
+		switch (node)	// re-interpolation //
+		{
+		case nse_const3d::nodeC: c_grid_reinterp(Xout, Xin, grid_interp); break;
+		case nse_const3d::nodeU: u_grid_reinterp(Xout, Xin, grid_interp); break;
+		case nse_const3d::nodeV: v_grid_reinterp(Xout, Xin, grid_interp); break;
+		case nse_const3d::nodeW: w_grid_reinterp(Xout, Xin, grid_interp); break;
+		default: break;
+		}
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::c_grid_reinterp(T* Xout, const T* Xin, 
+		const wstGrid3d< T, memCPU >& grid) const
+	{
+		// Re-interpolate the cell-centered field Xin from 'grid' onto this grid:
+		// for each global (i,j) column, every rank interpolates its locally owned
+		// part, partial results are summed over the communicator, and each rank
+		// stores the column values that fall inside its subdomain.
+		int i, j, k, ip, jp, kp;
+		T _px, _py, _pz;
+
+		T *mpi_pz;
+		allocate_vnull(&mpi_pz, mpi_nz);
+
+		// gather global -z cell-center coordinates on rank 0 and broadcast //
+		Grid3d< T, mem >::mpi_gather_center_coord(mpi_pz, 0, nse_const3d::axisZ);
+		mpi_broadcast(mpi_pz, mpi_nz, 0, mpi_com.comm);
+
+		T *pval, *mpi_pval;
+		allocate_vnull(&pval, &mpi_pval, mpi_nz);
+
+		for (i = gcx; i < mpi_nx - gcx; i++)
+		{
+			ip = i_local_coord(i);				// define local -i coordinate
+			_px = mpi_x + (i - gcx) * dx + dxh;	// define global -x coordinate
+			for (j = gcy; j < mpi_ny - gcy; j++)
+			{
+				jp = j_local_coord(j);				// define local -j coordinate
+				_py = mpi_y + (j - gcy) * dy + dyh;	// define global -y coordinate
+
+													// -z interpolation on input grid
+				for (k = gcz; k < mpi_nz - gcz; k++) {
+					_pz = mpi_pz[k];		// define global -z coordinate
+					pval[k] = grid.c_interp(Xin, _px, _py, _pz);
+				}
+
+				// sum rank-local interpolation results into the full column //
+				mpi_allreduce_vec(pval, mpi_pval, mpi_nz, MPI_SUM, mpi_com.comm);
+
+				for (k = gcz; k < mpi_nz - gcz; k++) {
+					kp = k_local_coord(k);	// define local -k coordinate
+					if ((ip >= 0) && (jp >= 0) && (kp >= 0))
+						Xout[ip * nyz + jp * nz + kp] = mpi_pval[k];
+				}
+			}
+		}
+
+		deallocate(pval, mpi_pval);
+		deallocate(mpi_pz);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::u_grid_reinterp(T* Uout, const T* Uin, 
+		const wstGrid3d< T, memCPU >& grid) const
+	{
+		// Re-interpolate the U (x-face) field Uin from 'grid' onto this grid;
+		// same column-wise allreduce scheme as c_grid_reinterp, with the -x
+		// coordinate taken at the face (no +dxh offset).
+		int i, j, k, ip, jp, kp;
+		T _px, _py, _pz;
+
+		T *mpi_pz;
+		allocate_vnull(&mpi_pz, mpi_nz);
+
+		// gather global -z cell-center coordinates on rank 0 and broadcast //
+		Grid3d< T, mem >::mpi_gather_center_coord(mpi_pz, 0, nse_const3d::axisZ);
+		mpi_broadcast(mpi_pz, mpi_nz, 0, mpi_com.comm);
+
+		T *pval, *mpi_pval;
+		allocate_vnull(&pval, &mpi_pval, mpi_nz);
+
+		for (i = gcx; i < mpi_nx - gcx; i++)
+		{
+			ip = i_local_coord(i);				// define local -i coordinate
+			_px = mpi_x + (i - gcx) * dx;		// define global -x coordinate (x-face)
+			for (j = gcy; j < mpi_ny - gcy; j++)
+			{
+				jp = j_local_coord(j);				// define local -j coordinate
+				_py = mpi_y + (j - gcy) * dy + dyh;	// define global -y coordinate
+
+													// -z interpolation on input grid
+				for (k = gcz; k < mpi_nz - gcz; k++) {
+					_pz = mpi_pz[k];		// define global -z coordinate
+					pval[k] = grid.u_interp(Uin, _px, _py, _pz);
+				}
+
+				// sum rank-local interpolation results into the full column //
+				mpi_allreduce_vec(pval, mpi_pval, mpi_nz, MPI_SUM, mpi_com.comm);
+
+				for (k = gcz; k < mpi_nz - gcz; k++) {
+					kp = k_local_coord(k);	// define local -k coordinate
+					if ((ip >= 0) && (jp >= 0) && (kp >= 0))
+						Uout[ip * nyz + jp * nz + kp] = mpi_pval[k];
+				}
+			}
+		}
+
+		deallocate(pval, mpi_pval);
+		deallocate(mpi_pz);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::v_grid_reinterp(T* Vout, const T* Vin, 
+		const wstGrid3d< T, memCPU >& grid) const
+	{
+		// Re-interpolate the V (y-face) field Vin from 'grid' onto this grid;
+		// same column-wise allreduce scheme as c_grid_reinterp, with the -y
+		// coordinate taken at the face (no +dyh offset).
+		int i, j, k, ip, jp, kp;
+		T _px, _py, _pz;
+
+		T *mpi_pz;
+		allocate_vnull(&mpi_pz, mpi_nz);
+
+		// gather global -z cell-center coordinates on rank 0 and broadcast //
+		Grid3d< T, mem >::mpi_gather_center_coord(mpi_pz, 0, nse_const3d::axisZ);
+		mpi_broadcast(mpi_pz, mpi_nz, 0, mpi_com.comm);
+
+		T *pval, *mpi_pval;
+		allocate_vnull(&pval, &mpi_pval, mpi_nz);
+
+		for (i = gcx; i < mpi_nx - gcx; i++)
+		{
+			ip = i_local_coord(i);				// define local -i coordinate
+			_px = mpi_x + (i - gcx) * dx + dxh;	// define global -x coordinate
+			for (j = gcy; j < mpi_ny - gcy; j++)
+			{
+				jp = j_local_coord(j);				// define local -j coordinate
+				_py = mpi_y + (j - gcy) * dy;		// define global -y coordinate (y-face)
+
+													// -z interpolation on input grid
+				for (k = gcz; k < mpi_nz - gcz; k++) {
+					_pz = mpi_pz[k];		// define global -z coordinate
+					pval[k] = grid.v_interp(Vin, _px, _py, _pz);
+				}
+
+				// sum rank-local interpolation results into the full column //
+				mpi_allreduce_vec(pval, mpi_pval, mpi_nz, MPI_SUM, mpi_com.comm);
+
+				for (k = gcz; k < mpi_nz - gcz; k++) {
+					kp = k_local_coord(k);	// define local -k coordinate
+					if ((ip >= 0) && (jp >= 0) && (kp >= 0))
+						Vout[ip * nyz + jp * nz + kp] = mpi_pval[k];
+				}
+			}
+		}
+
+		deallocate(pval, mpi_pval);
+		deallocate(mpi_pz);
+	}
+
+	template< typename T, memType mem >
+	void wstGrid3d< T, mem > ::w_grid_reinterp(T* Wout, const T* Win, 
+		const wstGrid3d< T, memCPU >& grid) const
+	{
+		// Re-interpolate the W (z-face) field Win from 'grid' onto this grid;
+		// same column-wise allreduce scheme as c_grid_reinterp, with the -z
+		// coordinate taken at the face (cell center minus the half step dzh).
+		int i, j, k, ip, jp, kp;
+		T _px, _py, _pz;
+
+		T *mpi_pz, *mpi_dzh;
+		allocate_vnull(&mpi_pz, mpi_nz);
+		allocate_vnull(&mpi_dzh, mpi_nz);
+
+		// gather global -z cell-center coordinates and half steps, then broadcast //
+		Grid3d< T, mem >::mpi_gather_center_coord(mpi_pz, 0, nse_const3d::axisZ);
+		mpi_broadcast(mpi_pz, mpi_nz, 0, mpi_com.comm);
+
+		Grid3d< T, mem >::mpi_gather(mpi_dzh, dzh, 0, nse_const3d::axisZ);
+		mpi_broadcast(mpi_dzh, mpi_nz, 0, mpi_com.comm);
+
+		T *pval, *mpi_pval;
+		allocate_vnull(&pval, &mpi_pval, mpi_nz);
+
+		for (i = gcx; i < mpi_nx - gcx; i++)
+		{
+			ip = i_local_coord(i);				// define local -i coordinate
+			_px = mpi_x + (i - gcx) * dx + dxh;	// define global -x coordinate
+			for (j = gcy; j < mpi_ny - gcy; j++)
+			{
+				jp = j_local_coord(j);				// define local -j coordinate
+				_py = mpi_y + (j - gcy) * dy + dyh;	// define global -y coordinate
+
+													// -z interpolation on input grid
+				for (k = gcz; k < mpi_nz - gcz; k++) {
+					_pz = mpi_pz[k] - mpi_dzh[k];	// define global -z coordinate (z-face)
+					pval[k] = grid.w_interp(Win, _px, _py, _pz);
+				}
+
+				// sum rank-local interpolation results into the full column //
+				mpi_allreduce_vec(pval, mpi_pval, mpi_nz, MPI_SUM, mpi_com.comm);
+
+				for (k = gcz; k < mpi_nz - gcz; k++) {
+					kp = k_local_coord(k);			// define local -k coordinate
+					if ((ip >= 0) && (jp >= 0) && (kp >= 0))
+						Wout[ip * nyz + jp * nz + kp] = mpi_pval[k];
+				}
+			}
+		}
+
+		deallocate(pval, mpi_pval);
+		deallocate(mpi_pz); deallocate(mpi_dzh);
+	}
+
	// Initialize a 3D grid: uniform steps in -x, -y, externally prescribed
	// (stretched) node line in -z.
	//
	//   _x, _y            : global domain origin in -x, -y
	//   _length, _width   : global domain extents in -x, -y
	//   _ksi_z            : -z stretching parameter (stored only; the actual
	//                       -z steps come from zline)
	//   zline             : global -z node coordinates; indexed up to
	//                       [shz - gcz + nz + 1] here, so it must carry one
	//                       extra node past each interior MPI boundary
	//   _cx, _cy, _cz     : global cell counts
	//   _gcx, _gcy, _gcz  : ghost cell widths
	//
	// note: _z and _height are not used -- the local -z origin and height are
	// derived from zline instead (TODO confirm this is intentional)
	// returns: always true (no failure paths)
	template< typename T, memType mem >
	bool wstGrid3d< T, mem > ::init_grid(
		const T _x, const T _y, const T _z,
		const T _length, const T _width, const T _height,
		const T _ksi_z, const T* zline,
		const int _cx, const int _cy, const int _cz,
		const int _gcx, const int _gcy, const int _gcz)
	{
		// -x, -y steps //
		dx = _length / _cx; dy = _width / _cy;

		// precomputed step combinations (halves, inverses, squares, ...)
		dxh = (T) 0.5 * dx; dyh = (T) 0.5 * dy;
		dxi = _cx / _length; dyi = _cy / _width;
		dxih = (T) 0.5 * dxi; dyih = (T) 0.5 * dyi;
		dxiq = (T) 0.25 * dxi; dyiq = (T) 0.25 * dyi;
		dxiqh = (T) 0.125 * dxi; dyiqh = (T) 0.125 * dyi;
		dx2i = dxi * dxi; dy2i = dyi * dyi;
		dx2ih = (T) 0.5 * dx2i; dy2ih = (T) 0.5 * dy2i;
		dx2iq = (T) 0.25 * dx2i; dy2iq = (T) 0.25 * dy2i;
		dx2id = (T) 2.0 * dx2i; dy2id = (T) 2.0 * dy2i;

		// local (per-rank) interior sizes in -x, -y //
		nx = par_local_size(_cx, mpi_com.rank_x, mpi_com.size_x);
		gcx = _gcx;

		ny = par_local_size(_cy, mpi_com.rank_y, mpi_com.size_y);
		gcy = _gcy;

		int shx = _cx / mpi_com.size_x,
			shy = _cy / mpi_com.size_y;

		// local -x origin: skip the cells of preceding ranks
		// (the first _cx % size_x ranks each carry one remainder cell)
		x = _x;
		for (int k = 0; k < mpi_com.rank_x; k++) {
			x += shx * dx;
			if (k < _cx % mpi_com.size_x) x += dx;
		}

		y = _y;
		for (int k = 0; k < mpi_com.rank_y; k++) {
			y += shy * dy;
			if (k < _cy % mpi_com.size_y) y += dy;
		}

		// local extents (interior only), then widen by ghost layers //
		length = nx * dx;
		width = ny * dy;

		nx += 2 * gcx;
		ny += 2 * gcy;

		// base grid parameters: x, y //
		allocate_vnull(&px, nx); allocate_vnull(&py, ny);
		allocate_vnull(&ex, nx); allocate_vnull(&ey, ny);
		for (int i = 0; i < nx; i++) {
			ex[i] = x + (i - gcx) * dx;	// cell edge
			px[i] = ex[i] + dxh;		// cell center
		}
		for (int j = 0; j < ny; j++) {
			ey[j] = y + (j - gcy) * dy;
			py[j] = ey[j] + dyh;
		}

		// - z steps
		ksi_z = _ksi_z;

		// - init -z step constants
		nz = par_local_size(_cz, mpi_com.rank_z, mpi_com.size_z);

		// shz: global index of this rank's first interior -z cell
		int pshz = _cz / mpi_com.size_z;
		int shz = 0;
		for (int k = 0; k < mpi_com.rank_z; k++) {
			shz += pshz;
			if (k < _cz % mpi_com.size_z) shz++;
		}

		gcz = _gcz; nz += 2 * gcz;

		// - z step constants allocation
		allocate_vnull(&pz, nz);
		allocate_vnull(&ez, nz);

		allocate_vnull(&dz, nz);
		allocate_vnull(&dzh, nz);

		allocate_vnull(&dzi, nz);
		allocate_vnull(&dzih, nz);
		allocate_vnull(&dziq, nz);

		allocate_vnull(&dz2i, nz);
		allocate_vnull(&dz2ih, nz);
		allocate_vnull(&dz2iq, nz);
		allocate_vnull(&dz2id, nz);

		allocate_vnull(&dzp, nz);
		allocate_vnull(&dzm, nz);
		allocate_vnull(&dzpi, nz);
		allocate_vnull(&dzmi, nz);
		allocate_vnull(&dzpih, nz);
		allocate_vnull(&dzmih, nz);
		allocate_vnull(&dzp2i, nz);
		allocate_vnull(&dzm2i, nz);

		// ghost cells needing extrapolation exist only at the physical
		// (first/last rank) -z boundaries; interior MPI ghosts read zline
		int mpi_pcz = (mpi_com.rank_z == 0) ? gcz : 0;
		int mpi_ncz = (mpi_com.rank_z == mpi_com.size_z - 1) ? gcz : 0;

		// fill -z steps and coordinates from zline //
		for (int k = mpi_pcz; k < nz - mpi_ncz; k++) {
			dz[k] = zline[shz - gcz + k + 1] - zline[shz - gcz + k];

			dzh[k] = (T) 0.5 * dz[k];         dzi[k] = (T) 1.0 / dz[k];
			dzih[k] = (T) 0.5 * dzi[k];       dziq[k] = (T) 0.25 * dzi[k];
			dz2i[k] = dzi[k] * dzi[k];        dz2ih[k] = (T) 0.5 * dz2i[k];
			dz2iq[k] = (T) 0.25 * dz2i[k];    dz2id[k] = (T) 2.0 * dz2i[k];

			pz[k] = zline[shz - gcz + k] + dzh[k];	// cell center
			ez[k] = zline[shz - gcz + k];			// cell edge
		}

		// extrapolate ghost cells below the bottom boundary (constant step) //
		for (int k = mpi_pcz - 1; k >= 0; k--) {
			dz[k] = dz[k + 1];

			dzh[k] = dzh[k + 1];        dzi[k] = dzi[k + 1];
			dzih[k] = dzih[k + 1];      dziq[k] = dziq[k + 1];
			dz2i[k] = dz2i[k + 1];      dz2ih[k] = dz2ih[k + 1];
			dz2iq[k] = dz2iq[k + 1];    dz2id[k] = dz2id[k + 1];

			pz[k] = pz[k + 1] - dzh[k + 1] - dzh[k];
			ez[k] = ez[k + 1] - dz[k];
		}
		// extrapolate ghost cells above the top boundary (constant step) //
		for (int k = nz - mpi_ncz; k < nz; k++) {
			dz[k] = dz[k - 1];

			dzh[k] = dzh[k - 1];        dzi[k] = dzi[k - 1];
			dzih[k] = dzih[k - 1];      dziq[k] = dziq[k - 1];
			dz2i[k] = dz2i[k - 1];      dz2ih[k] = dz2ih[k - 1];
			dz2iq[k] = dz2iq[k - 1];    dz2id[k] = dz2id[k - 1];

			pz[k] = pz[k - 1] + dzh[k - 1] + dzh[k];
			ez[k] = ez[k - 1] + dz[k - 1];
		}

		// two-cell steps: dzp[k] = dz[k] + dz[k+1], dzm[k] = dz[k] + dz[k-1] //
		for (int k = 0; k < nz - 1; k++)
			dzp[k] = dz[k] + dz[k + 1];
		for (int k = 1; k < nz; k++)
			dzm[k] = dz[k] + dz[k - 1];

		// boundary values: mirror at physical walls, read zline across
		// interior MPI boundaries //
		if (mpi_com.rank_z == mpi_com.size_z - 1)
			dzp[nz - 1] = (T) 2.0 * dz[nz - 1];
		else
			dzp[nz - 1] = dz[nz - 1] +
			zline[shz - gcz + nz + 1] - zline[shz - gcz + nz];

		if (mpi_com.rank_z == 0)
			dzm[0] = (T) 2.0 * dz[0];
		else
			dzm[0] = dz[0] +
			zline[shz - gcz] - zline[shz - gcz - 1];

		// derived inverse two-cell step constants //
		for (int k = 0; k < nz; k++) {
			dzpi[k] = (T) 1.0 / dzp[k];
			dzmi[k] = (T) 1.0 / dzm[k];

			dzpih[k] = (T) 0.5 * dzpi[k];
			dzmih[k] = (T) 0.5 * dzmi[k];

			dzp2i[k] = (T) 2.0 * dzpi[k] * dzi[k];
			dzm2i[k] = (T) 2.0 * dzmi[k] * dzi[k];
		}

		// local -z origin and interior extent, taken from zline //
		z = zline[shz];
		height = zline[shz + nz - 2 * gcz] - zline[shz];

		// min/max -z step across the -z communicator (includes ghost cells)
		dz_min = nse::mpi_min(dz, nz, mpi_com.comm_z);
		dz_max = nse::mpi_max(dz, nz, mpi_com.comm_z);

		dzi_min = (T)1 / dz_min;
		dzi_max = (T)1 / dz_max;

		// grid cell number constants //
		nyz = ny * nz;
		size = nx * ny * nz;

		// MPI domain parameters //
		mpi_x = mpi_allreduce(x, MPI_MIN, mpi_com.comm_x);
		mpi_y = mpi_allreduce(y, MPI_MIN, mpi_com.comm_y);
		mpi_z = mpi_allreduce(z, MPI_MIN, mpi_com.comm_z);

		mpi_length = mpi_allreduce(length, MPI_SUM, mpi_com.comm_x);
		mpi_width = mpi_allreduce(width, MPI_SUM, mpi_com.comm_y);
		mpi_height = mpi_allreduce(height, MPI_SUM, mpi_com.comm_z);

		// MPI grid parameters //
		mpi_nx = mpi_allreduce(nx - 2 * gcx, MPI_SUM, mpi_com.comm_x) + 2 * gcx;
		mpi_ny = mpi_allreduce(ny - 2 * gcy, MPI_SUM, mpi_com.comm_y) + 2 * gcy;
		mpi_nz = mpi_allreduce(nz - 2 * gcz, MPI_SUM, mpi_com.comm_z) + 2 * gcz;

		mpi_nxy = mpi_nx * mpi_ny;
		mpi_nxz = mpi_nx * mpi_nz;
		mpi_nyz = mpi_ny * mpi_nz;
		mpi_size = mpi_nx * mpi_ny * mpi_nz;

		return true;
	}
+
+
+	// _min + 0.5 * (_max - _min) * (1 + (tanh(nu)/tanh(_Ksi))) : -_Ksi <= nu <= _Ksi
+	template< typename T, memType mem >
+	int wstGrid3d<T, mem>::make_coordinates(
+		T** line,
+		const T _min, const T _max,
+		const int _cx, const T _Ksi)
+	{
+		allocate_vnull(line, _cx + 1);
+		T dnu = ((T)2.0 * _Ksi) / _cx;
+
+		T pnu = -_Ksi;
+		for (int k = 0; k <= _cx; k++) {
+#ifdef USE_ERF_WSTGRID3D
+			(*line)[k] = _min + (T) 0.5 * (_max - _min) * ((T) 1.0 + (erf(pnu) / erf(_Ksi)));
+#else
+			(*line)[k] = _min + (T) 0.5 * (_max - _min) * ((T) 1.0 + (tanh(pnu) / tanh(_Ksi)));
+#endif
+			pnu += dnu;
+		}
+
+		return _cx + 1;
+	}
+
+	// _min + (_max - _min) * (1 + (tanh(nu)/tanh(_Ksi))) : -_Ksi <= nu <= 0
+	template< typename T, memType mem >
+	int wstGrid3d<T, mem>::make_coordinates_inf(
+		T** line,
+		const T _min, const T _max,
+		const int _cx, const T _Ksi)
+	{
+		allocate_vnull(line, _cx + 1);
+		T dnu = (_Ksi) / _cx;
+
+		T pnu = -_Ksi;
+		for (int k = 0; k <= _cx; k++) {
+#ifdef USE_ERF_WSTGRID3D
+			(*line)[k] = _min + (_max - _min) * ((T) 1.0 + (erf(pnu) / erf(_Ksi)));
+#else
+			(*line)[k] = _min + (_max - _min) * ((T) 1.0 + (tanh(pnu) / tanh(_Ksi)));
+#endif
+			pnu += dnu;
+		}
+
+		return _cx + 1;
+	}
+
+	// uniform grid
+	template< typename T, memType mem >
+	int wstGrid3d<T, mem>::make_coordinates_uni(
+		T** line,
+		const T _min, const T _max,
+		const int _cx)
+	{
+		allocate_vnull(line, _cx + 1);
+		T dx = (_max - _min) / _cx;
+
+		for (int k = 0; k <= _cx; k++) 
+		{
+			(*line)[k] = _min + k * dx;
+		}
+
+		return _cx + 1;
+	}
+
+	// _reg_min + 0.5 * (_reg_max - _reg_min) * (1 + (tanh(nu)/tanh(_Ksi))) : -_Ksi <= nu <= _Ksi
+	// _min : _reg_min --- uniform grid with min step
+	// _reg_max : _max --- uniform grid with min step
+	template< typename T, memType mem >
+	int wstGrid3d<T, mem>::make_coordinates(
+		T** line,
+		const T _min, const T _max,
+		const T _reg_min, const T _reg_max,
+		const int _reg_cx, const T _Ksi)
+	{
+		T* reg_line;
+		allocate_vnull(&reg_line, _reg_cx + 1);
+
+		T dnu = ((T)2.0 * _Ksi) / _reg_cx;
+		T pnu = -_Ksi;
+
+		for (int k = 0; k <= _reg_cx; k++) {
+#ifdef USE_ERF_WSTGRID3D
+			reg_line[k] = _reg_min + (T) 0.5 * (_reg_max - _reg_min) * ((T) 1.0 + (erf(pnu) / erf(_Ksi)));
+#else
+			reg_line[k] = _reg_min + (T) 0.5 * (_reg_max - _reg_min) * ((T) 1.0 + (tanh(pnu) / tanh(_Ksi)));
+#endif
+			pnu += dnu;
+		}
+
+		T dwest = reg_line[1] - reg_line[0];
+		T deast = reg_line[_reg_cx] - reg_line[_reg_cx - 1];
+
+		int nwest = ((int)((_reg_min - _min) / dwest)) + 1;
+		int neast = ((int)((_max - _reg_max) / deast)) + 1;
+
+		if (_reg_min == _min) nwest = 0;
+		if (_reg_max == _max) neast = 0;
+
+		if (nwest) dwest = (_reg_min - _min) / nwest;
+		if (neast) deast = (_max - _reg_max) / neast;
+
+		allocate_vnull(line, _reg_cx + 1 + nwest + neast);
+		T px = _min;
+		for (int k = 0; k < nwest; k++) {
+			(*line)[k] = px;
+			px += dwest;
+		}
+
+		memcpy(&((*line)[nwest]), reg_line, (_reg_cx + 1) * sizeof(T));
+
+		px = (*line)[nwest + _reg_cx] + deast;
+		for (int k = nwest + _reg_cx + 1; k < nwest + _reg_cx + 1 + neast; k++) {
+			(*line)[k] = px;
+			px += deast;
+		}
+
+		deallocate(reg_line);
+		return nwest + _reg_cx + 1 + neast;
+	}
+
+	// make stretched grid coordinates to infinity
+	// _min + (_reg - _min) * (tanh(nu)/tanh(_Ksi)) : 0 <= nu <= _Ksi
+	// _reg : _max --- uniform grid with min step
+	template< typename T, memType mem >
+	int wstGrid3d<T, mem>::make_coordinates_inf(
+		T** line,
+		const T _min, const T _max,
+		const T _reg,
+		const int _reg_cx, const T _Ksi)
+	{
+		T* reg_line;
+		allocate_vnull(&reg_line, _reg_cx + 1);
+
+		T dnu = _Ksi / _reg_cx;
+		T pnu = (T)0;
+
+		for (int k = 0; k <= _reg_cx; k++) {
+#ifdef USE_ERF_WSTGRID3D
+			reg_line[k] = _min + (_reg - _min) * (erf(pnu) / erf(_Ksi));
+#else
+			reg_line[k] = _min + (_reg - _min) * (tanh(pnu) / tanh(_Ksi));
+#endif
+			pnu += dnu;
+		}
+
+		T deast = reg_line[_reg_cx] - reg_line[_reg_cx - 1];
+
+		int neast = ((int)((_max - _reg) / deast)) + 1;
+
+		if (_reg == _max) neast = 0;
+
+		if (neast) deast = (_max - _reg) / neast;
+
+		allocate_vnull(line, _reg_cx + 1 + neast);
+
+		memcpy(&((*line)[0]), reg_line, (_reg_cx + 1) * sizeof(T));
+
+		T px = (*line)[_reg_cx] + deast;
+		for (int k = _reg_cx + 1; k < _reg_cx + 1 + neast; k++) {
+			(*line)[k] = px;
+			px += deast;
+		}
+
+		deallocate(reg_line);
+
+		return _reg_cx + 1 + neast;
+	}
+
+	// _min + 0.5 * (_half - _min) * (1 + (tanh(nu)/tanh(_Ksi))) : -_Ksi <= nu <= _Ksi
+	// _half + 0.5 * (_max - _half) * (1 + (tanh(nu)/tanh(_Ksi))) : _Ksi <= nu <= _Ksi
+	template< typename T, memType mem >
+	int wstGrid3d< T, mem > ::make_coordinates_hplane(
+		T** line,
+		const T _min, const T _max,
+		const int _cx, const T _Ksi)
+	{
+		int hcx = _cx / 2;
+		T half = (T)0.5 * (_max - _min);
+
+		allocate_vnull(line, 2 * hcx + 1);
+		T dnu = ((T)2.0 * _Ksi) / hcx;
+
+		T pnu = -_Ksi;
+		for (int k = 0; k <= hcx; k++) {
+#ifdef USE_ERF_WSTGRID3D
+			(*line)[k] = _min + (T) 0.5 * (half - _min) * ((T) 1.0 + (erf(pnu) / erf(_Ksi)));
+#else
+			(*line)[k] = _min + (T) 0.5 * (half - _min) * ((T) 1.0 + (tanh(pnu) / tanh(_Ksi)));
+#endif
+			pnu += dnu;
+		}
+
+		pnu = -_Ksi + dnu;
+		for (int k = hcx + 1; k <= 2 * hcx; k++) {
+#ifdef USE_ERF_WSTGRID3D
+			(*line)[k] = half + (T) 0.5 * (_max - half) * ((T) 1.0 + (erf(pnu) / erf(_Ksi)));
+#else
+			(*line)[k] = half + (T) 0.5 * (_max - half) * ((T) 1.0 + (tanh(pnu) / tanh(_Ksi)));
+#endif
+			pnu += dnu;
+		}
+
+		return 2 * hcx + 1;
+	}
+
+	template< typename T, memType mem >	// GridId setup //
+	void wstGrid3d< T, mem >::set_id(GridId< T >& id) const
+	{
+		Grid3d<T>::set_id(id);
+		id.set_grid_type(2);	// non-uniform grid data flag (wall stretched)
+
+								// domain specifiers //
+#ifdef USE_CXX_11
+		id.set_domain_specs({ ksi_z });
+#else
+		T dspecs[1] = { ksi_z };
+		id.set_domain_specs(1, dspecs);
+#endif
+	}
+
+	template< typename T, memType mem >	// GridId check //
+	bool wstGrid3d< T, mem >::check_id(const GridId< T >& id) const
+	{
+		return (Grid3d<T>::check_id(id) && (id.grid_type() == 2));	// non-uniform data (wall stretched)
+	}
+
+
+	template< typename T, memType mem >	// GridId setup //
+	void wstGrid3d< T, mem >::set_id(GridId< T >& id, const nse_const3d::axisType axis) const
+	{
+		Grid3d<T>::set_id(id, axis);
+		if ((axis == nse_const3d::axisX) || 
+			(axis == nse_const3d::axisY) || 
+			(axis == nse_const3d::axisXY)) 
+		{
+			id.set_grid_type(0);	// uniform grid data flag
+			return;
+		}
+
+		id.set_grid_type(2);	// non-uniform grid data flag (wall stretched)
+
+		// domain specifiers //
+#ifdef USE_CXX_11
+		id.set_domain_specs({ ksi_z });
+#else
+		T dspecs[1] = { ksi_z };
+		id.set_domain_specs(1, dspecs);
+#endif
+	}
+
+	template< typename T, memType mem >	// GridId check //
+	bool wstGrid3d< T, mem >::check_id(const GridId< T >& id, const nse_const3d::axisType axis) const
+	{
+		if ((axis == nse_const3d::axisX) || 
+			(axis == nse_const3d::axisY) || 
+			(axis == nse_const3d::axisXY)) 
+		{
+			return (Grid3d<T>::check_id(id, axis) && (id.grid_type() == 0));	// uniform data
+		}
+
+		return (Grid3d<T>::check_id(id, axis) && (id.grid_type() == 2));	// non-uniform data (wall stretched)
+	}
+
+}