#include "plutils.h"

#include <stdlib.h>
#include <string.h>

// parLIB buffers [definition]
// -------------------------------------------------------------------------- //
// plbuf[k]: memory block owned by pool slot k (allocated with malloc() in get_plbuf()).
void *plbuf[MAX_PL_BUFS];
// plbuf_size[k]: size in bytes that was requested for plbuf[k]; 0 - nothing allocated.
int plbuf_size[MAX_PL_BUFS];
// plbuf_status[k]: 0 - slot is free, 1 - slot is handed out by get_plbuf()
// and has not been released by free_plbuf() yet.
int plbuf_status[MAX_PL_BUFS];

// index of the slot at which get_plbuf() starts looking for a free buffer
int plbuf_ptr;
// -------------------------------------------------------------------------- //

// using loop_count pragma expectations
// ----------------------------------------------------------------------------------- //
// Expected trip counts handed to '#pragma loop_count' in the 4D/5D copy kernels
// (J* - j loop, K* - k loop, Q* - q loop). The pragmas are emitted only when
// __INTEL_COMPILER is defined.
#define CP_EXPECT_5D_J1		21			// number of vertical levels[1]
#define CP_EXPECT_5D_J2		73			// number of vertical levels[2]
#define CP_EXPECT_5D_K1		1			// number of variables[1]
#define CP_EXPECT_5D_K2		2			// number of variables[2]
#define CP_EXPECT_5D_K3		5			// number of variables[3]
#define CP_EXPECT_5D_K4		10			// number of variables[4]
#define CP_EXPECT_5D_Q1		1			// number of time slices[1]
#define CP_EXPECT_5D_Q2		2			// number of time slices[2]
// ----------------------------------------------------------------------------------- //


// buffer memory interface
// ----------------------------------------------------------------------------------- //
// Reset the pool bookkeeping: every slot gets size 0 and status 0 (free),
// and the free-slot search restarts from slot 0.
// The plbuf[] pointers themselves are not modified and nothing is freed here.
void init_plbuf()
{
	memset(plbuf_size, 0, sizeof(plbuf_size));
	memset(plbuf_status, 0, sizeof(plbuf_status));

	plbuf_ptr = 0;
}

// Release all memory held by the pool and reset its bookkeeping.
// Every slot with a non-zero recorded size is freed, its pointer is cleared and
// its size set to 0; all slots are marked free and the search index returns to 0.
void deinit_plbuf()
{
	int k;
	for (k = 0; k < MAX_PL_BUFS; k++) {
		if (plbuf_size[k] > 0) {
			free(plbuf[k]);
			// clear the pointer: get_plbuf() returns plbuf[k] as is whenever the
			// requested size does not exceed plbuf_size[k], so a stale address
			// must not survive the free()
			plbuf[k] = NULL;
			plbuf_size[k] = 0;
		}
		plbuf_status[k] = 0;
	}
	plbuf_ptr = 0;
}


// Obtain a buffer of at least 'msize' bytes.
//
// The pool is scanned from plbuf_ptr for the first free slot. The slot memory is
// reused when it is already large enough, otherwise it is reallocated to 'msize'.
// When no free slot exists a plain malloc() block is returned instead.
//
//   msize - requested size in bytes
//   id    - [out] handle to pass to free_plbuf():
//             0 .. MAX_PL_BUFS-1 : pool slot index
//             MAX_PL_BUFS        : block is outside the pool (free_plbuf() calls free())
//             -1                 : pool slot allocation failed (free_plbuf() ignores it)
//
// Returns the buffer pointer, or NULL when malloc() fails.
void* get_plbuf(int msize, int* id)
{
	int k;

	for (k = plbuf_ptr; k < MAX_PL_BUFS; k++) {
		if (plbuf_status[k]) continue;		// slot is in use

		if (msize > plbuf_size[k]) {
			// slot is too small: replace its memory block
			if (plbuf_size[k] > 0) free(plbuf[k]);

			plbuf[k] = malloc(msize);
			if (plbuf[k] == NULL) {
				// keep size and pointer consistent, otherwise a later smaller
				// request would be served with the NULL pointer; the slot
				// stays free so that it can be retried
				plbuf_size[k] = 0;
				(*id) = -1;
				return NULL;
			}
			plbuf_size[k] = msize;
		}

		plbuf_status[k] = 1;
		plbuf_ptr = k + 1;		// the search resumes after this slot

		(*id) = k;
		return plbuf[k];
	}

	// no free buffer found:
	(*id) = MAX_PL_BUFS;
	return malloc(msize);
}

// Give back a buffer obtained from get_plbuf().
//
//   ptr - buffer pointer; used only when the block is outside the pool
//   id  - handle returned by get_plbuf():
//           id < 0            : nothing to do
//           id >= MAX_PL_BUFS : block is not a pool slot, it is passed to free()
//           otherwise         : slot 'id' is marked free (its memory is kept)
void free_plbuf(void* ptr, int id)
{
	if (id >= 0) {
		if (id < MAX_PL_BUFS) {
			plbuf_status[id] = 0;
			// make sure the next search starts no later than the released slot
			if (plbuf_ptr > id) plbuf_ptr = id;
		}
		else {
			free(ptr);
		}
	}
}
// ----------------------------------------------------------------------------------- //

// 1D copy
// ----------------------------------------------------------------------------------- //
// Copy 'nx' bytes from 'a' into 'buf'.
// Blocks shorter than MIN_MEMCPY_BLOCK are copied byte by byte,
// anything else goes through memcpy().
static inline void copy_to_buffer_1d(char* _RESTRICT buf, const char* _RESTRICT const a,
	const int nx)
{
	int pos;

	if (nx >= MIN_MEMCPY_BLOCK) {
		memcpy(buf, a, nx * sizeof(char));
		return;
	}

	pos = 0;
	while (pos < nx) {
		buf[pos] = a[pos];
		pos++;
	}
}

// Copy 'nx' bytes from 'buf' into 'a'.
// Blocks shorter than MIN_MEMCPY_BLOCK are copied byte by byte,
// anything else goes through memcpy().
static inline void copy_from_buffer_1d(char* _RESTRICT a, const char* _RESTRICT const buf,
	const int nx)
{
	int pos;

	if (nx >= MIN_MEMCPY_BLOCK) {
		memcpy(a, buf, nx * sizeof(char));
		return;
	}

	pos = 0;
	while (pos < nx) {
		a[pos] = buf[pos];
		pos++;
	}
}
// ----------------------------------------------------------------------------------- //

// 2D copy
// ----------------------------------------------------------------------------------- //
// Pack a 2D block of 'a' into the contiguous buffer 'buf'.
//
//   nx  - row length in bytes
//   ny  - number of rows
//   shx - distance in bytes between the starts of consecutive rows in 'a'
//
// Rows shorter than MIN_MEMCPY_BLOCK are copied byte by byte, longer rows with memcpy().
// Offsets are kept in 64 bits: the accumulated row offset can exceed INT_MAX
// even when 'shx' itself fits in an int.
static inline void copy_to_buffer_2d(char* _RESTRICT buf, const char* _RESTRICT const a,
	const int nx, const int ny,
	const int shx)
{
	int i, j;
	long long apos = 0, bpos = 0;	// byte offsets in 'a' and 'buf'

	if (nx < MIN_MEMCPY_BLOCK)
	{
		for (j = 0; j < ny; j++, apos += shx, bpos += nx)
		for (i = 0; i < nx; i++) {
			buf[bpos + i] = a[apos + i];
		}
	}
	else
	{
		const size_t nbx = nx * sizeof(char);
		for (j = 0; j < ny; j++, apos += shx, bpos += nx) {
			memcpy(&buf[bpos], &a[apos], nbx);
		}
	}
}

// Unpack the contiguous buffer 'buf' into a 2D block of 'a'.
//
//   nx  - row length in bytes
//   ny  - number of rows
//   shx - distance in bytes between the starts of consecutive rows in 'a'
//
// Rows shorter than MIN_MEMCPY_BLOCK are copied byte by byte, longer rows with memcpy().
// Offsets are kept in 64 bits: the accumulated row offset can exceed INT_MAX
// even when 'shx' itself fits in an int.
static inline void copy_from_buffer_2d(char* _RESTRICT a, const char* _RESTRICT const buf,
	const int nx, const int ny,
	const int shx)
{
	int i, j;
	long long apos = 0, bpos = 0;	// byte offsets in 'a' and 'buf'

	if (nx < MIN_MEMCPY_BLOCK)
	{
		for (j = 0; j < ny; j++, apos += shx, bpos += nx)
		for (i = 0; i < nx; i++) {
			a[apos + i] = buf[bpos + i];
		}
	}
	else
	{
		const size_t nbx = nx * sizeof(char);
		for (j = 0; j < ny; j++, apos += shx, bpos += nx) {
			memcpy(&a[apos], &buf[bpos], nbx);
		}
	}
}
// ----------------------------------------------------------------------------------- //

// 3D copy
// ----------------------------------------------------------------------------------- //
// Pack a 3D block of 'a' into the contiguous buffer 'buf'.
//
//   nx       - row length in bytes
//   ny, nz   - number of rows per plane, number of planes
//   shx      - distance in bytes between consecutive rows in 'a'
//   shxy     - distance in bytes between consecutive planes in 'a'
//
// Rows shorter than MIN_MEMCPY_BLOCK are copied byte by byte, longer rows with memcpy().
// Offsets are kept in 64 bits so that 'k * shxy' and the running offsets
// cannot overflow int on arrays larger than INT_MAX bytes.
static inline void copy_to_buffer_3d(char* _RESTRICT buf, const char* _RESTRICT const a,
	const int nx, const int ny, const int nz,
	const int shx, const int shxy)
{
	int i, j, k;
	long long apos, bpos = 0;	// byte offsets in 'a' and 'buf'

	if (nx < MIN_MEMCPY_BLOCK)
	{
		for (k = 0; k < nz; k++)
		{
			apos = (long long)k * shxy;
			for (j = 0; j < ny; j++, apos += shx, bpos += nx)
			for (i = 0; i < nx; i++) {
				buf[bpos + i] = a[apos + i];
			}
		}
	}
	else
	{
		const size_t nbx = nx * sizeof(char);

		for (k = 0; k < nz; k++)
		{
			apos = (long long)k * shxy;
			for (j = 0; j < ny; j++, apos += shx, bpos += nx) {
				memcpy(&buf[bpos], &a[apos], nbx);
			}
		}
	}
}

// Unpack the contiguous buffer 'buf' into a 3D block of 'a'.
//
//   nx       - row length in bytes
//   ny, nz   - number of rows per plane, number of planes
//   shx      - distance in bytes between consecutive rows in 'a'
//   shxy     - distance in bytes between consecutive planes in 'a'
//
// Rows shorter than MIN_MEMCPY_BLOCK are copied byte by byte, longer rows with memcpy().
// Offsets are kept in 64 bits so that 'k * shxy' and the running offsets
// cannot overflow int on arrays larger than INT_MAX bytes.
static inline void copy_from_buffer_3d(char* _RESTRICT a, const char* _RESTRICT const buf,
	const int nx, const int ny, const int nz,
	const int shx, const int shxy)
{
	int i, j, k;
	long long apos, bpos = 0;	// byte offsets in 'a' and 'buf'

	if (nx < MIN_MEMCPY_BLOCK)
	{
		for (k = 0; k < nz; k++)
		{
			apos = (long long)k * shxy;
			for (j = 0; j < ny; j++, apos += shx, bpos += nx)
			for (i = 0; i < nx; i++) {
				a[apos + i] = buf[bpos + i];
			}
		}
	}
	else
	{
		const size_t nbx = nx * sizeof(char);

		for (k = 0; k < nz; k++)
		{
			apos = (long long)k * shxy;
			for (j = 0; j < ny; j++, apos += shx, bpos += nx) {
				memcpy(&a[apos], &buf[bpos], nbx);
			}
		}
	}
}
// ----------------------------------------------------------------------------------- //

// 4D copy
// ----------------------------------------------------------------------------------- //
// Pack a 4D block of 'a' into the contiguous buffer 'buf'.
//
//   nx           - row length in bytes
//   ny, nz, np   - extents of the 2nd, 3rd and 4th dimensions
//   shx          - distance in bytes between consecutive rows in 'a'
//   shxy, shxyz  - distance in bytes between consecutive entries of the
//                  3rd and 4th dimensions in 'a'
//
// Rows shorter than MIN_MEMCPY_BLOCK are copied byte by byte, longer rows with memcpy().
// Offsets are kept in 64 bits so that 'p * shxyz' and the running offsets
// cannot overflow int on arrays larger than INT_MAX bytes.
static inline void copy_to_buffer_4d(char* _RESTRICT buf, const char* _RESTRICT const a,
	const int nx, const int ny, const int nz, const int np,
	const int shx, const int shxy, const int shxyz)
{
	int i, j, k, p;
	long long off_p, apos, bpos = 0;	// byte offsets in 'a' (off_p, apos) and 'buf'

	if (nx < MIN_MEMCPY_BLOCK)
	{
		for (p = 0; p < np; p++)
		{
			off_p = (long long)p * shxyz;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
#endif
			for (k = 0; k < nz; k++, off_p += shxy)
			{
				apos = off_p;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
#endif
				for (j = 0; j < ny; j++, apos += shx, bpos += nx)
				for (i = 0; i < nx; i++) {
					buf[bpos + i] = a[apos + i];
				}
			}
		}
	}
	else
	{
		const size_t nbx = nx * sizeof(char);

		for (p = 0; p < np; p++)
		{
			off_p = (long long)p * shxyz;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
#endif
			for (k = 0; k < nz; k++, off_p += shxy)
			{
				apos = off_p;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
#endif
				for (j = 0; j < ny; j++, apos += shx, bpos += nx) {
					memcpy(&buf[bpos], &a[apos], nbx);
				}
			}
		}
	}
}

// Unpack the contiguous buffer 'buf' into a 4D block of 'a'.
//
//   nx           - row length in bytes
//   ny, nz, np   - extents of the 2nd, 3rd and 4th dimensions
//   shx          - distance in bytes between consecutive rows in 'a'
//   shxy, shxyz  - distance in bytes between consecutive entries of the
//                  3rd and 4th dimensions in 'a'
//
// Rows shorter than MIN_MEMCPY_BLOCK are copied byte by byte, longer rows with memcpy().
// Offsets are kept in 64 bits so that 'p * shxyz' and the running offsets
// cannot overflow int on arrays larger than INT_MAX bytes.
static inline void copy_from_buffer_4d(char* _RESTRICT a, const char* _RESTRICT const buf,
	const int nx, const int ny, const int nz, const int np,
	const int shx, const int shxy, const int shxyz)
{
	int i, j, k, p;
	long long off_p, apos, bpos = 0;	// byte offsets in 'a' (off_p, apos) and 'buf'

	if (nx < MIN_MEMCPY_BLOCK)
	{
		for (p = 0; p < np; p++)
		{
			off_p = (long long)p * shxyz;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
#endif
			for (k = 0; k < nz; k++, off_p += shxy)
			{
				apos = off_p;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
#endif
				for (j = 0; j < ny; j++, apos += shx, bpos += nx)
				for (i = 0; i < nx; i++) {
					a[apos + i] = buf[bpos + i];
				}
			}
		}
	}
	else
	{
		const size_t nbx = nx * sizeof(char);

		for (p = 0; p < np; p++)
		{
			off_p = (long long)p * shxyz;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
#endif
			for (k = 0; k < nz; k++, off_p += shxy)
			{
				apos = off_p;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
#endif
				for (j = 0; j < ny; j++, apos += shx, bpos += nx) {
					memcpy(&a[apos], &buf[bpos], nbx);
				}
			}
		}
	}
}
// ----------------------------------------------------------------------------------- //

// 5D copy
// ----------------------------------------------------------------------------------- //
// Pack a 5D block of 'a' into the contiguous buffer 'buf'.
//
//   nx               - row length in bytes
//   ny, nz, np, nq   - extents of the 2nd .. 5th dimensions
//   shx              - distance in bytes between consecutive rows in 'a'
//   shxy, shxyz,
//   shxyzp           - distance in bytes between consecutive entries of the
//                      3rd, 4th and 5th dimensions in 'a'
//
// Rows shorter than MIN_MEMCPY_BLOCK are copied byte by byte, longer rows with memcpy().
// Offsets are kept in 64 bits so that 'q * shxyzp' and the running offsets
// cannot overflow int on arrays larger than INT_MAX bytes.
static inline void copy_to_buffer_5d(char* _RESTRICT buf, const char* _RESTRICT const a,
	const int nx, const int ny, const int nz, const int np, const int nq,
	const int shx, const int shxy, const int shxyz, const int shxyzp)
{
	int i, j, k, p, q;
	long long off_q, off_p, apos, bpos = 0;	// byte offsets in 'a' (off_q, off_p, apos) and 'buf'

	if (nx < MIN_MEMCPY_BLOCK)
	{
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_Q1, CP_EXPECT_5D_Q2)
#endif
		for (q = 0; q < nq; q++)
		{
			off_q = (long long)q * shxyzp;
			for (p = 0; p < np; p++, off_q += shxyz)
			{
				off_p = off_q;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
#endif
				for (k = 0; k < nz; k++, off_p += shxy)
				{
					apos = off_p;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
#endif
					for (j = 0; j < ny; j++, apos += shx, bpos += nx)
					for (i = 0; i < nx; i++) {
						buf[bpos + i] = a[apos + i];
					}
				}
			}
		}
	}
	else
	{
		const size_t nbx = nx * sizeof(char);

#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_Q1, CP_EXPECT_5D_Q2)
#endif
		for (q = 0; q < nq; q++)
		{
			off_q = (long long)q * shxyzp;
			for (p = 0; p < np; p++, off_q += shxyz)
			{
				off_p = off_q;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
#endif
				for (k = 0; k < nz; k++, off_p += shxy)
				{
					apos = off_p;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
#endif
					for (j = 0; j < ny; j++, apos += shx, bpos += nx) {
						memcpy(&buf[bpos], &a[apos], nbx);
					}
				}
			}
		}
	}
}

// Unpack the contiguous buffer 'buf' into a 5D block of 'a'.
//
//   nx               - row length in bytes
//   ny, nz, np, nq   - extents of the 2nd .. 5th dimensions
//   shx              - distance in bytes between consecutive rows in 'a'
//   shxy, shxyz,
//   shxyzp           - distance in bytes between consecutive entries of the
//                      3rd, 4th and 5th dimensions in 'a'
//
// Rows shorter than MIN_MEMCPY_BLOCK are copied byte by byte, longer rows with memcpy().
// Offsets are kept in 64 bits so that 'q * shxyzp' and the running offsets
// cannot overflow int on arrays larger than INT_MAX bytes.
static inline void copy_from_buffer_5d(char* _RESTRICT a, const char* _RESTRICT const buf,
	const int nx, const int ny, const int nz, const int np, const int nq,
	const int shx, const int shxy, const int shxyz, const int shxyzp)
{
	int i, j, k, p, q;
	long long off_q, off_p, apos, bpos = 0;	// byte offsets in 'a' (off_q, off_p, apos) and 'buf'

	if (nx < MIN_MEMCPY_BLOCK)
	{
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_Q1, CP_EXPECT_5D_Q2)
#endif
		for (q = 0; q < nq; q++)
		{
			off_q = (long long)q * shxyzp;
			for (p = 0; p < np; p++, off_q += shxyz)
			{
				off_p = off_q;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
#endif
				for (k = 0; k < nz; k++, off_p += shxy)
				{
					apos = off_p;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
#endif
					for (j = 0; j < ny; j++, apos += shx, bpos += nx)
					for (i = 0; i < nx; i++) {
						a[apos + i] = buf[bpos + i];
					}
				}
			}
		}
	}
	else
	{
		const size_t nbx = nx * sizeof(char);

#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_Q1, CP_EXPECT_5D_Q2)
#endif
		for (q = 0; q < nq; q++)
		{
			off_q = (long long)q * shxyzp;
			for (p = 0; p < np; p++, off_q += shxyz)
			{
				off_p = off_q;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
#endif
				for (k = 0; k < nz; k++, off_p += shxy)
				{
					apos = off_p;
#if defined(__INTEL_COMPILER)
#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
#endif
					for (j = 0; j < ny; j++, apos += shx, bpos += nx) {
						memcpy(&a[apos], &buf[bpos], nbx);
					}
				}
			}
		}
	}
}
// ----------------------------------------------------------------------------------- //


// 6D copy
// ----------------------------------------------------------------------------------- //
// Pack a 6D block of 'a' into the contiguous buffer 'buf'.
//
//   nx                   - row length in bytes
//   ny, nz, np, nq, ns   - extents of the 2nd .. 6th dimensions
//   shx                  - distance in bytes between consecutive rows in 'a'
//   shxy, shxyz,
//   shxyzp, shxyzpq      - distance in bytes between consecutive entries of the
//                          3rd, 4th, 5th and 6th dimensions in 'a'
//
// Rows shorter than MIN_MEMCPY_BLOCK are copied byte by byte, longer rows with memcpy().
// Offsets are kept in 64 bits so that 's * shxyzpq + q * shxyzp' and the running
// offsets cannot overflow int on arrays larger than INT_MAX bytes.
static inline void copy_to_buffer_6d(char* _RESTRICT buf, const char* _RESTRICT const a,
	const int nx, const int ny, const int nz, const int np, const int nq, const int ns,
	const int shx, const int shxy, const int shxyz, const int shxyzp, const int shxyzpq)
{
	int i, j, k, p, q, s;
	long long off_q, off_p, apos, bpos = 0;	// byte offsets in 'a' (off_q, off_p, apos) and 'buf'

	if (nx < MIN_MEMCPY_BLOCK)
	{
		for (s = 0; s < ns; s++)
		for (q = 0; q < nq; q++)
		{
			off_q = (long long)s * shxyzpq + (long long)q * shxyzp;
			for (p = 0; p < np; p++, off_q += shxyz)
			{
				off_p = off_q;
				for (k = 0; k < nz; k++, off_p += shxy)
				{
					apos = off_p;
					for (j = 0; j < ny; j++, apos += shx, bpos += nx)
					for (i = 0; i < nx; i++) {
						buf[bpos + i] = a[apos + i];
					}
				}
			}
		}
	}
	else
	{
		const size_t nbx = nx * sizeof(char);

		for (s = 0; s < ns; s++)
		for (q = 0; q < nq; q++)
		{
			off_q = (long long)s * shxyzpq + (long long)q * shxyzp;
			for (p = 0; p < np; p++, off_q += shxyz)
			{
				off_p = off_q;
				for (k = 0; k < nz; k++, off_p += shxy)
				{
					apos = off_p;
					for (j = 0; j < ny; j++, apos += shx, bpos += nx) {
						memcpy(&buf[bpos], &a[apos], nbx);
					}
				}
			}
		}
	}
}

// Unpack the contiguous buffer 'buf' into a 6D block of 'a'.
//
//   nx                   - row length in bytes
//   ny, nz, np, nq, ns   - extents of the 2nd .. 6th dimensions
//   shx                  - distance in bytes between consecutive rows in 'a'
//   shxy, shxyz,
//   shxyzp, shxyzpq      - distance in bytes between consecutive entries of the
//                          3rd, 4th, 5th and 6th dimensions in 'a'
//
// Rows shorter than MIN_MEMCPY_BLOCK are copied byte by byte, longer rows with memcpy().
// Offsets are kept in 64 bits so that 's * shxyzpq + q * shxyzp' and the running
// offsets cannot overflow int on arrays larger than INT_MAX bytes.
static inline void copy_from_buffer_6d(char* _RESTRICT a, const char* _RESTRICT const buf,
	const int nx, const int ny, const int nz, const int np, const int nq, const int ns,
	const int shx, const int shxy, const int shxyz, const int shxyzp, const int shxyzpq)
{
	int i, j, k, p, q, s;
	long long off_q, off_p, apos, bpos = 0;	// byte offsets in 'a' (off_q, off_p, apos) and 'buf'

	if (nx < MIN_MEMCPY_BLOCK)
	{
		for (s = 0; s < ns; s++)
		for (q = 0; q < nq; q++)
		{
			off_q = (long long)s * shxyzpq + (long long)q * shxyzp;
			for (p = 0; p < np; p++, off_q += shxyz)
			{
				off_p = off_q;
				for (k = 0; k < nz; k++, off_p += shxy)
				{
					apos = off_p;
					for (j = 0; j < ny; j++, apos += shx, bpos += nx)
					for (i = 0; i < nx; i++) {
						a[apos + i] = buf[bpos + i];
					}
				}
			}
		}
	}
	else
	{
		const size_t nbx = nx * sizeof(char);

		for (s = 0; s < ns; s++)
		for (q = 0; q < nq; q++)
		{
			off_q = (long long)s * shxyzpq + (long long)q * shxyzp;
			for (p = 0; p < np; p++, off_q += shxyz)
			{
				off_p = off_q;
				for (k = 0; k < nz; k++, off_p += shxy)
				{
					apos = off_p;
					for (j = 0; j < ny; j++, apos += shx, bpos += nx) {
						memcpy(&a[apos], &buf[bpos], nbx);
					}
				}
			}
		}
	}
}
// ----------------------------------------------------------------------------------- //


// COPY-TO
// ----------------------------------------------------------------------------------- //
// Pack an 'ndims'-dimensional block of 'a' into the contiguous buffer 'buf'.
//
//   ndims  - number of dimensions; nothing is done unless 1 <= ndims <= MAX_PARLIB_MP_DIMS
//            (only the cases 1 .. 6 are implemented)
//   msgdim - block extents in elements, msgdim[0 .. ndims-1]
//   stride - array extents in elements, stride[0 .. ndims-2]; byte strides are
//            built as cumulative products starting from stride[0] * fsize
//   fsize  - element size in bytes
void copy_to_buffer(char* _RESTRICT buf, const char* _RESTRICT const a,
	const int ndims,
	const int* _RESTRICT const msgdim,
	const int* _RESTRICT const stride,
	const int fsize)
{
	if ((ndims < 1) || (ndims > MAX_PARLIB_MP_DIMS)) return;

	switch (ndims)
	{
	case 1:
		copy_to_buffer_1d(buf, a, msgdim[0] * fsize);
		break;

	case 2:
		copy_to_buffer_2d(buf, a, msgdim[0] * fsize, msgdim[1],
			stride[0] * fsize);
		break;

	case 3:
	{
		const int row_bytes = msgdim[0] * fsize;
		const int sh1 = stride[0] * fsize;
		const int sh2 = stride[1] * sh1;

		copy_to_buffer_3d(buf, a, row_bytes, msgdim[1], msgdim[2],
			sh1, sh2);
		break;
	}

	case 4:
	{
		const int row_bytes = msgdim[0] * fsize;
		const int sh1 = stride[0] * fsize;
		const int sh2 = stride[1] * sh1;
		const int sh3 = stride[2] * sh2;

		copy_to_buffer_4d(buf, a, row_bytes, msgdim[1], msgdim[2], msgdim[3],
			sh1, sh2, sh3);
		break;
	}

	case 5:
	{
		const int row_bytes = msgdim[0] * fsize;
		const int sh1 = stride[0] * fsize;
		const int sh2 = stride[1] * sh1;
		const int sh3 = stride[2] * sh2;
		const int sh4 = stride[3] * sh3;

		copy_to_buffer_5d(buf, a, row_bytes, msgdim[1], msgdim[2], msgdim[3], msgdim[4],
			sh1, sh2, sh3, sh4);
		break;
	}

	case 6:
	{
		const int row_bytes = msgdim[0] * fsize;
		const int sh1 = stride[0] * fsize;
		const int sh2 = stride[1] * sh1;
		const int sh3 = stride[2] * sh2;
		const int sh4 = stride[3] * sh3;
		const int sh5 = stride[4] * sh4;

		copy_to_buffer_6d(buf, a, row_bytes, msgdim[1], msgdim[2], msgdim[3], msgdim[4], msgdim[5],
			sh1, sh2, sh3, sh4, sh5);
		break;
	}

	default:
		// more than 6 dimensions: no kernel available, nothing is copied
		break;
	}
}
// ----------------------------------------------------------------------------------- //

// COPY-FROM
// ----------------------------------------------------------------------------------- //
// Unpack the contiguous buffer 'buf' into an 'ndims'-dimensional block of 'a'.
//
//   ndims  - number of dimensions; nothing is done unless 1 <= ndims <= MAX_PARLIB_MP_DIMS
//            (only the cases 1 .. 6 are implemented)
//   msgdim - block extents in elements, msgdim[0 .. ndims-1]
//   stride - array extents in elements, stride[0 .. ndims-2]; byte strides are
//            built as cumulative products starting from stride[0] * fsize
//   fsize  - element size in bytes
void copy_from_buffer(char* _RESTRICT a, const char* _RESTRICT const buf,
	const int ndims,
	const int* _RESTRICT const msgdim,
	const int* _RESTRICT const stride,
	const int fsize)
{
	if ((ndims < 1) || (ndims > MAX_PARLIB_MP_DIMS)) return;

	switch (ndims)
	{
	case 1:
		copy_from_buffer_1d(a, buf, msgdim[0] * fsize);
		break;

	case 2:
		copy_from_buffer_2d(a, buf, msgdim[0] * fsize, msgdim[1],
			stride[0] * fsize);
		break;

	case 3:
	{
		const int row_bytes = msgdim[0] * fsize;
		const int sh1 = stride[0] * fsize;
		const int sh2 = stride[1] * sh1;

		copy_from_buffer_3d(a, buf, row_bytes, msgdim[1], msgdim[2],
			sh1, sh2);
		break;
	}

	case 4:
	{
		const int row_bytes = msgdim[0] * fsize;
		const int sh1 = stride[0] * fsize;
		const int sh2 = stride[1] * sh1;
		const int sh3 = stride[2] * sh2;

		copy_from_buffer_4d(a, buf, row_bytes, msgdim[1], msgdim[2], msgdim[3],
			sh1, sh2, sh3);
		break;
	}

	case 5:
	{
		const int row_bytes = msgdim[0] * fsize;
		const int sh1 = stride[0] * fsize;
		const int sh2 = stride[1] * sh1;
		const int sh3 = stride[2] * sh2;
		const int sh4 = stride[3] * sh3;

		copy_from_buffer_5d(a, buf, row_bytes, msgdim[1], msgdim[2], msgdim[3], msgdim[4],
			sh1, sh2, sh3, sh4);
		break;
	}

	case 6:
	{
		const int row_bytes = msgdim[0] * fsize;
		const int sh1 = stride[0] * fsize;
		const int sh2 = stride[1] * sh1;
		const int sh3 = stride[2] * sh2;
		const int sh4 = stride[3] * sh3;
		const int sh5 = stride[4] * sh4;

		copy_from_buffer_6d(a, buf, row_bytes, msgdim[1], msgdim[2], msgdim[3], msgdim[4], msgdim[5],
			sh1, sh2, sh3, sh4, sh5);
		break;
	}

	default:
		// more than 6 dimensions: no kernel available, nothing is copied
		break;
	}
}
// ----------------------------------------------------------------------------------- //