diff --git a/ParLib.src/Makefile b/ParLib.src/Makefile
index 71c1bfb5dbc3f33c2c15df2b04000f2e15841d6e..c81ae8c1ba1da7017c8c85e3a3bc6badec10d390 100644
--- a/ParLib.src/Makefile
+++ b/ParLib.src/Makefile
@@ -1,81 +1,83 @@
-.SUFFIXES:      .c .o .a
-
-AR          = ar crl
-RANLIB      = ranlib
-
-INSTALLDIR	= $(HOME)/ParLib.v1.1
-#INSTALLDIR	= $(HOME)
-
-
-default: all
-all: setvars libparlib.a libparlibf.a
-
-#
-# For INM's HP-cluster
-#
-   CC 			= mpicc
-   LIBS			= -lmpich
-#   LIBPATH		= -L/opt/mpich-gm/lib
-#   INCPATH		= -I/opt/mpich-gm/include
-   DEFINES		= -DFORTRANUNDERSCORE
-
-setvars:
-
-libparlib.a: bexchange.o transpose.o
-	$(AR) $@ $?
-	$(RANLIB) $@
-
-libparlibf.a: parlibf.o bexchangef.o transposef.o
-	$(AR) $@ $?
-	$(RANLIB) $@
-
-.c.o:
-		$(CC) $(INCPATH) -c $(DEFINES) $<
-
-clean:
-		rm -f *.o *.a
-
-rebuild: clean all
-
-install: libparlib.a libparlibf.a
-	@if !(test -d $(INSTALLDIR)) ; then \
-	   echo Creating directory $(INSTALLDIR)...; \
-	   mkdir $(INSTALLDIR) ; \
-	fi
-	@if !(test -d $(INSTALLDIR)/lib) ; then \
-	   echo Creating directory $(INSTALLDIR)/lib...; \
-	   mkdir $(INSTALLDIR)/lib ; \
-	fi
-	cp $? $(INSTALLDIR)/lib
-	@if !(test -d $(INSTALLDIR)/include) ; then \
-	   echo Creating directory $(INSTALLDIR)/include...; \
-	   mkdir $(INSTALLDIR)/include ; \
-	fi
-	cp -r parlib.h parlibf.h $(INSTALLDIR)/include
-	@if !(test -d $(INSTALLDIR)/man) ; then \
-	   echo Creating directory $(INSTALLDIR)/man...; \
-	   mkdir $(INSTALLDIR)/man ; \
-	fi
-	@if !(test -d $(INSTALLDIR)/man/man3) ; then \
-	   echo Creating directory $(INSTALLDIR)/man/man3...; \
-	   mkdir $(INSTALLDIR)/man/man3 ; \
-	fi
-	cp -r man/* $(INSTALLDIR)/man/man3
-	ln -s $(INSTALLDIR)/man/man3/P_BExchange_init.3 \
-      $(INSTALLDIR)/man/man3/P_BExchange_start.3
-	ln -s $(INSTALLDIR)/man/man3/P_BExchange_init.3 \
-      $(INSTALLDIR)/man/man3/P_BExchange_end.3
-	ln -s $(INSTALLDIR)/man/man3/P_BExchange_init.3 \
-      $(INSTALLDIR)/man/man3/P_BExchange_free.3
-	ln -s $(INSTALLDIR)/man/man3/P_Transpose_init.3 \
-      $(INSTALLDIR)/man/man3/P_Transpose_start.3
-	ln -s $(INSTALLDIR)/man/man3/P_Transpose_init.3 \
-      $(INSTALLDIR)/man/man3/P_Transpose_end.3
-	ln -s $(INSTALLDIR)/man/man3/P_Transpose_init.3 \
-      $(INSTALLDIR)/man/man3/P_Transpose_free.3
-
-parlibf.o: parlib.h
-bexchange.o: parlib.h
-bexchangef.o: parlib.h
-transpose.o: parlib.h
-transposef.o: parlib.h
+.SUFFIXES:      .c .o .a
+
+AR          = ar crl
+RANLIB      = ranlib
+
+INSTALLDIR	= $(HOME)/ParLib.v2.1
+#INSTALLDIR	= $(HOME)
+
+
+default: all
+all: setvars libparlib.a libparlibf.a
+
+#
+# For INM's HP-cluster
+#
+   CC 			= mpicc -restrict -no-ansi-alias
+   LIBS			= -lmpich
+#   LIBPATH		= -L/opt/mpich-gm/lib
+#   INCPATH		= -I/opt/mpich-gm/include
+   DEFINES		= -DFORTRANUNDERSCORE
+
+setvars:
+
+libparlib.a: parlib.o plutils.o bexchange.o transpose.o
+	$(AR) $@ $?
+	$(RANLIB) $@
+
+libparlibf.a: parlibf.o bexchangef.o transposef.o
+	$(AR) $@ $?
+	$(RANLIB) $@
+
+.c.o:
+		$(CC) $(INCPATH) -c $(DEFINES) $<
+
+clean:
+		rm -f *.o *.a
+
+rebuild: clean all
+
+install: libparlib.a libparlibf.a
+	@if !(test -d $(INSTALLDIR)) ; then \
+	   echo Creating directory $(INSTALLDIR)...; \
+	   mkdir $(INSTALLDIR) ; \
+	fi
+	@if !(test -d $(INSTALLDIR)/lib) ; then \
+	   echo Creating directory $(INSTALLDIR)/lib...; \
+	   mkdir $(INSTALLDIR)/lib ; \
+	fi
+	cp $? $(INSTALLDIR)/lib
+	@if !(test -d $(INSTALLDIR)/include) ; then \
+	   echo Creating directory $(INSTALLDIR)/include...; \
+	   mkdir $(INSTALLDIR)/include ; \
+	fi
+	cp -r parlib.h parlibf.h $(INSTALLDIR)/include
+	@if !(test -d $(INSTALLDIR)/man) ; then \
+	   echo Creating directory $(INSTALLDIR)/man...; \
+	   mkdir $(INSTALLDIR)/man ; \
+	fi
+	@if !(test -d $(INSTALLDIR)/man/man3) ; then \
+	   echo Creating directory $(INSTALLDIR)/man/man3...; \
+	   mkdir $(INSTALLDIR)/man/man3 ; \
+	fi
+	cp -r man/* $(INSTALLDIR)/man/man3
+	ln -s $(INSTALLDIR)/man/man3/P_BExchange_init.3 \
+      $(INSTALLDIR)/man/man3/P_BExchange_start.3
+	ln -s $(INSTALLDIR)/man/man3/P_BExchange_init.3 \
+      $(INSTALLDIR)/man/man3/P_BExchange_end.3
+	ln -s $(INSTALLDIR)/man/man3/P_BExchange_init.3 \
+      $(INSTALLDIR)/man/man3/P_BExchange_free.3
+	ln -s $(INSTALLDIR)/man/man3/P_Transpose_init.3 \
+      $(INSTALLDIR)/man/man3/P_Transpose_start.3
+	ln -s $(INSTALLDIR)/man/man3/P_Transpose_init.3 \
+      $(INSTALLDIR)/man/man3/P_Transpose_end.3
+	ln -s $(INSTALLDIR)/man/man3/P_Transpose_init.3 \
+      $(INSTALLDIR)/man/man3/P_Transpose_free.3
+
+plutils.o: plutils.h
+parlib.o: parlib.h plutils.h
+parlibf.o: parlib.h plutils.h
+bexchange.o: parlib.h plutils.h
+bexchangef.o: parlib.h plutils.h
+transpose.o: parlib.h plutils.h
+transposef.o: parlib.h plutils.h
diff --git a/ParLib.src/bexchange.c b/ParLib.src/bexchange.c
index 4daf989adf5459c29d2503f4fcf51c10a258127a..52e66f75a38440bcb7a6ec47bd8068f205417308 100644
--- a/ParLib.src/bexchange.c
+++ b/ParLib.src/bexchange.c
@@ -1,209 +1,914 @@
-#include "parlib.h"
-
-/*  
- *  Error codes:
- *     0 - success
- *     1 - nonpositive number of dimensions
- *     2 - wrong communicated dimension
- *     3 - negative boundary width
- *     4 - nonpositive dimension
- *     5 - boundary width exceeds the array block length
- */
-int P_BExchange_init ( ndims, stride, blklen, bdim, overlap, datatype, 
-		comm, period, bexchange )
-   int				ndims, *stride, *blklen, bdim, overlap[2], period;
-   MPI_Datatype	datatype;
-   MPI_Comm			comm;
-	BExchange		*bexchange;
-{
-	int				nproc, iproc, direct, idim, sendproc[2], recvproc[2];
-	int				count, strd, sbind[2], rbind[2], send[2], recv[2];
-   MPI_Aint			fsize;
-   MPI_Datatype	oldtype, btype[2];
-   MPI_Request		sreq[2], rreq[2]; 
-
-/*
- * Check input parameters
- */
-   if ( ndims < 1 ) {return 1;}
-   if ( bdim < 1 || bdim > ndims ) {return 2;}
-   if ( overlap[0] == 0 && overlap[1] == 0 ) {return 0;} /* success */
-   for ( idim = 0; idim < ndims; idim++ ) {
-      if ( stride[idim] <= 0 ) {return 4;}
-   }
-   for ( direct = 0; direct < 2; direct++ ) {
-      if ( overlap[direct] < 0 ) {return 3;}
-      if ( overlap[direct] > blklen[bdim-1] ) {return 5;}
-   }
-/*
- * Define the number of processors in the group and the rank
- */
-   MPI_Comm_size ( comm, &nproc );
-   if ( nproc == 0 ) {return 0;} /* success */
-   MPI_Comm_rank ( comm, &iproc );
-   if ( iproc == MPI_UNDEFINED ) {return 0;} /* the process does not belong to the group */
-   sendproc[0] = ( iproc == 0 ? nproc-1 : iproc-1 );
-   recvproc[0] = ( iproc == nproc-1 ? 0 : iproc+1 );
-	sendproc[1] = recvproc[0];
-	recvproc[1] = sendproc[0];
-   send[0] = iproc > 0 || period; 
-   recv[0] = iproc < nproc-1 || period;
-   send[1] = recv[0];
-   recv[1] = send[0];
-   MPI_Type_extent ( datatype, &fsize );
-/*
- * Define data types for the boundaries
- */
-    for ( direct = 0; direct < 2; direct++ ) {
-      if ( overlap[direct] > 0 ) {
-         oldtype = datatype;
-         strd = 1;
-         for ( idim = 0; idim < ndims; idim++ ) {
-            if ( idim+1 == bdim ) {
-               count = overlap[direct];
-            } else {
-               count = blklen[idim];
-            }
-            MPI_Type_hvector ( count, 1, strd * fsize, oldtype, 
-					&btype[direct] );
-            if ( idim > 0 ) {
-               MPI_Type_free ( &oldtype );
-            }
-            oldtype = btype[direct];
-            strd = strd * stride[idim];
-         }
-         MPI_Type_commit ( &btype[direct] );
-      }
-   }
-/*
- * Determine the begining of boundaries
- */
-   strd = 1;
-   for ( idim = 0; idim < bdim - 1; idim++ ) {
-      strd = strd * stride[idim];
-   }
-   sbind[0] = 0;
-   rbind[0] = blklen[bdim-1]*strd;
-   sbind[1] = (blklen[bdim-1]-overlap[1])*strd;
-   rbind[1] = -overlap[1]*strd;
-
-   for ( direct = 0; direct < 2; direct++ ) {
-      bexchange->overlap[direct] = overlap[direct];
-      bexchange->send[direct] = send[direct];
-      bexchange->recv[direct] = recv[direct];
-      bexchange->btype[direct] = btype[direct];
-      bexchange->sendproc[direct] = sendproc[direct];
-      bexchange->recvproc[direct] = recvproc[direct];
-      bexchange->sbind[direct] = sbind[direct];
-      bexchange->rbind[direct] = rbind[direct];
-	}
-	bexchange->comm = comm;
-	bexchange->fsize = fsize;
-	return 0;
-}
-
-int P_BExchange_start ( a, bexchange )
-   void				*a; 
-	BExchange		*bexchange;
-{
-	int				direct, overlap[2], send[2], recv[2], btype[2]; 
-	int				sendproc[2], recvproc[2], sbind[2], rbind[2];
-	MPI_Comm			comm;
-   MPI_Request		sreq[2], rreq[2]; 
-   MPI_Aint			fsize;
-	char				*ach = (char *) a;
-
-   for ( direct = 0; direct < 2; direct++ ) {
-      overlap[direct] = bexchange->overlap[direct];
-      send[direct] = bexchange->send[direct];
-      recv[direct] = bexchange->recv[direct];
-      btype[direct] = bexchange->btype[direct];
-      sendproc[direct] = bexchange->sendproc[direct];
-      recvproc[direct] = bexchange->recvproc[direct];
-      sbind[direct] = bexchange->sbind[direct];
-      rbind[direct] = bexchange->rbind[direct];
-   }
-	comm = bexchange->comm;
-	fsize = bexchange->fsize;
-
-   for ( direct = 0; direct < 2; direct++ ) {
-      if ( overlap[direct] > 0 ) {
-         if ( send[direct] ) {
-            MPI_Isend ( ach+sbind[direct]*fsize, 1, btype[direct], 
-					sendproc[direct], 0, comm, &sreq[direct] );
-         }
-         if ( recv[direct] ) {
-            MPI_Irecv ( ach+rbind[direct]*fsize, 1, btype[direct], 
-					recvproc[direct], 0, comm, &rreq[direct] );
-         }
-      }
-   }
-   for ( direct = 0; direct < 2; direct++ ) {
-      bexchange->sreq[direct]=sreq[direct];
-      bexchange->rreq[direct]=rreq[direct];
-   }
-	return 0;
-}
-
-int P_BExchange_end ( bexchange ) 
-	BExchange    *bexchange;
-{
-   MPI_Status		status;
-   int 				direct, overlap[2], send[2], recv[2];
-   MPI_Request		sreq[2], rreq[2]; 
-
-   for ( direct = 0; direct < 2; direct++ ) {
-      overlap[direct] = bexchange->overlap[direct];
-      send[direct] = bexchange->send[direct];
-      recv[direct] = bexchange->recv[direct];
-      sreq[direct] = bexchange->sreq[direct];
-      rreq[direct] = bexchange->rreq[direct];
-   }
-
-   for ( direct = 0; direct < 2; direct++ ) {
-      if ( overlap[direct] > 0 ) {
-         if ( send[direct] ) {
-            MPI_Wait ( &sreq[direct], &status );
-         }
-         if ( recv[direct] ) {
-            MPI_Wait ( &rreq[direct], &status );
-         }
-      }
-   }
-	return 0;
-}
-
-int P_BExchange_free ( bexchange ) 
-	BExchange    *bexchange;
-{
-   int direct, overlap[2];
-   MPI_Datatype btype[2];
-   for ( direct = 0; direct < 2; direct++ ) {
-      overlap[direct] = bexchange->overlap[direct];
-      btype[direct] = bexchange->btype[direct];
-	}
-   for ( direct = 0; direct < 2; direct++ ) {
-      if ( overlap[direct] > 0 ) {
-         MPI_Type_free ( &btype[direct] );
-      }
-   }
-	return 0;
-}
-
-int P_BExchange ( a, ndims, stride, blklen, bdim, overlap, datatype,
-		comm, period )
-   void *a; 
-   MPI_Datatype datatype;
-   int ndims, *stride, *blklen, bdim, overlap[2]; 
-   int period;
-   MPI_Comm comm;
-{
-	BExchange bexchange;
-	int ierr;
-   if ( ierr = P_BExchange_init ( ndims, stride, blklen, bdim, overlap, 
-		datatype, comm, period, &bexchange ) != 0 ) { return ierr; }
-   P_BExchange_start ( a, &bexchange );
-   P_BExchange_end ( &bexchange );
-   P_BExchange_free ( &bexchange );
-   return 0;
-}
+#include "parlib.h"
+#include "plutils.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+/*  
+ *  Error codes:
+ *     0 - success
+ *     1 - nonpositive number of dimensions
+ *     2 - wrong communicated dimension
+ *     3 - negative boundary width
+ *     4 - nonpositive dimension
+ *     5 - boundary width exceeds the array block length
+ *     6 - number of dimensions exceeds maximum value (only for MP - manual packing)
+ *     999 - incorrect exchange mode (only for generic calls)
+ */
+
+int P_BExchange_init ( ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, bexchange )
+	int ndims, *stride, *blklen, bdim, overlap[2];
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+	BExchange *bexchange;
+{
+	int nproc, iproc, direct, idim, sendproc[2], recvproc[2];
+	int count, strd, sbind[2], rbind[2], send[2], recv[2];
+	MPI_Aint fsize, lb;
+	MPI_Datatype oldtype, btype[2];
+
+
+// Setting degenerate-success cases conditions for consistency
+//	including: overlap[]=0, nproc=0, iproc=MPI_UNDEFINED
+	for (direct = 0; direct < 2; direct++) {
+		bexchange->overlap[direct] = 0;
+	}
+
+//
+// Check input parameters
+//
+	if (ndims < 1) { return 1; }
+	if (bdim < 1 || bdim > ndims) { return 2; }
+	if (overlap[0] == 0 && overlap[1] == 0) { return 0; } /* success */
+	for (idim = 0; idim < ndims; idim++) {
+		if (stride[idim] <= 0) { return 4; }
+	}
+	for (direct = 0; direct < 2; direct++) {
+		if (overlap[direct] < 0) { return 3; }
+		if (overlap[direct] > blklen[bdim - 1]) { return 5; }
+	}
+
+//
+// Define the number of processors in the group and the rank
+//
+	if (comm == MPI_COMM_NULL) { return 0; } // empty communicator
+
+	MPI_Comm_size(comm, &nproc);
+	if (nproc == 0) { return 0; } /* success */
+	MPI_Comm_rank(comm, &iproc);
+	if (iproc == MPI_UNDEFINED) { return 0; } /* the process does not belong to the group */
+	sendproc[0] = (iproc == 0 ? nproc - 1 : iproc - 1);
+	recvproc[0] = (iproc == nproc - 1 ? 0 : iproc + 1);
+	sendproc[1] = recvproc[0];
+	recvproc[1] = sendproc[0];
+	send[0] = iproc > 0 || period;
+	recv[0] = iproc < nproc - 1 || period;
+	send[1] = recv[0];
+	recv[1] = send[0];
+	MPI_Type_get_extent(datatype, &lb, &fsize);
+//
+// Define data types for the boundaries
+//
+	// checking if one data type will suffice ...
+	const int ndsize = (overlap[0] == overlap[1]) ? 1 : 2;
+
+	for (direct = 0; direct < ndsize; direct++) {
+		if (overlap[direct] > 0) {
+
+			if (bdim == 1) {
+				count = overlap[direct];
+			}
+			else {
+				count = blklen[0];
+			}
+			MPI_Type_contiguous(count, datatype, &btype[direct]);
+			oldtype = btype[direct];
+			strd = stride[0];
+
+			for (idim = 1; idim < ndims; idim++) {
+				if (bdim == idim + 1) {
+					count = overlap[direct];
+				}
+				else {
+					count = blklen[idim];
+				}
+				MPI_Type_create_hvector(count, 1, strd * fsize, oldtype,
+					&btype[direct]);
+
+				MPI_Type_free(&oldtype);
+				oldtype = btype[direct];
+				strd = strd * stride[idim];
+			}
+			MPI_Type_commit(&btype[direct]);
+		}
+	}
+	if (ndsize == 1) btype[1] = btype[0];	// using same MPI-datatype
+
+
+//
+// Determine the begining of boundaries
+//
+	strd = 1;
+	for (idim = 0; idim < bdim - 1; idim++) {
+		strd = strd * stride[idim];
+	}
+	sbind[0] = 0;
+	rbind[0] = blklen[bdim - 1] * strd;
+	sbind[1] = (blklen[bdim - 1] - overlap[1])*strd;
+	rbind[1] = -overlap[1] * strd;
+
+	for (direct = 0; direct < 2; direct++) {
+		bexchange->overlap[direct] = overlap[direct];
+		bexchange->send[direct] = send[direct];
+		bexchange->recv[direct] = recv[direct];
+		bexchange->btype[direct] = btype[direct];
+		bexchange->sendproc[direct] = sendproc[direct];
+		bexchange->recvproc[direct] = recvproc[direct];
+		bexchange->sbind[direct] = sbind[direct];
+		bexchange->rbind[direct] = rbind[direct];
+	}
+	bexchange->comm = comm;
+	bexchange->fsize = fsize;
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_BExchange_start ( a, bexchange )
+	void *a;
+	BExchange *bexchange;
+{
+	int direct;
+	char *ach = (char *) a;
+
+	for (direct = 0; direct < 2; direct++) {
+
+		if (bexchange->overlap[direct] > 0) {
+			if (bexchange->send[direct]) {
+				MPI_Isend(ach + bexchange->sbind[direct] * bexchange->fsize, 1, bexchange->btype[direct],
+					bexchange->sendproc[direct], 0, bexchange->comm, 
+					&bexchange->req[direct]);
+			}
+			else
+			{
+				bexchange->req[direct] = MPI_REQUEST_NULL;
+			}
+			if (bexchange->recv[direct]) {
+				MPI_Irecv(ach + bexchange->rbind[direct] * bexchange->fsize, 1, bexchange->btype[direct],
+					bexchange->recvproc[direct], 0, bexchange->comm, 
+					&bexchange->req[2 + direct]);
+			}
+			else
+			{
+				bexchange->req[2 + direct] = MPI_REQUEST_NULL;
+			}
+		}
+		else
+		{
+			bexchange->req[direct] = MPI_REQUEST_NULL;
+			bexchange->req[2 + direct] = MPI_REQUEST_NULL;
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_BExchange_end ( bexchange ) 
+	BExchange *bexchange;
+{
+	MPI_Status status[4];
+
+	MPI_Waitall(4, bexchange->req, status);
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_BExchange_free ( bexchange ) 
+	BExchange *bexchange;
+{
+	int direct;
+
+	// checking if one data type sufficed at init ...
+	const int ndsize = (bexchange->overlap[0] == bexchange->overlap[1]) ? 1 : 2;
+
+	for (direct = 0; direct < ndsize; direct++) {
+		if (bexchange->overlap[direct] > 0) {
+			MPI_Type_free(&bexchange->btype[direct]);
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_BExchange ( a, ndims, stride, blklen, bdim, overlap, datatype, 
+	comm, period )
+	
+	void *a;
+	int ndims, *stride, *blklen, bdim, overlap[2];
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+{
+	BExchange bexchange;
+	int ierr;
+	if (ierr = P_BExchange_init(ndims, stride, blklen, bdim, overlap,
+		datatype, comm, period, &bexchange) != 0) {
+		return ierr;
+	}
+	P_BExchange_start(a, &bexchange);
+	P_BExchange_end(&bexchange);
+	P_BExchange_free(&bexchange);
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+
+// v.1.3 - persistent exchanges //
+// -------------------------------------------------------------------------- //
+int PST_BExchange_init ( a, ndims, stride, blklen, bdim, overlap, datatype, 
+	comm, period, bexchange )
+	
+	void *a;
+	int ndims, *stride, *blklen, bdim, overlap[2];
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+	BExchange *bexchange;
+{
+	int direct;
+	char *ach = (char *)a;
+
+	int ierr = P_BExchange_init(ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, bexchange);
+	if (ierr != 0) return ierr;
+
+	for (direct = 0; direct < 2; direct++) {
+		if (bexchange->overlap[direct] > 0) {
+			if (bexchange->send[direct]) {
+				MPI_Send_init(ach + bexchange->sbind[direct] * bexchange->fsize, 1, bexchange->btype[direct],
+					bexchange->sendproc[direct], 0, bexchange->comm, 
+					&bexchange->req[direct]);
+			}
+			if (bexchange->recv[direct]) {
+				MPI_Recv_init(ach + bexchange->rbind[direct] * bexchange->fsize, 1, bexchange->btype[direct],
+					bexchange->recvproc[direct], 0, bexchange->comm, 
+					&bexchange->req[2 + direct]);
+			}
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_BExchange_start ( bexchange )
+	BExchange *bexchange;
+{
+	int direct;
+	
+	for (direct = 0; direct < 2; direct++) {
+		if (bexchange->overlap[direct] > 0) {
+			
+			if (bexchange->send[direct]) {
+				MPI_Start(&bexchange->req[direct]);
+			}
+			else
+			{
+				bexchange->req[direct] = MPI_REQUEST_NULL;
+			}
+
+			if (bexchange->recv[direct]) {
+				MPI_Start(&bexchange->req[2 + direct]);
+			}
+			else
+			{
+				bexchange->req[2 + direct] = MPI_REQUEST_NULL;
+			}
+		}
+		else
+		{
+			bexchange->req[direct] = MPI_REQUEST_NULL;
+			bexchange->req[2 + direct] = MPI_REQUEST_NULL;
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_BExchange_end(bexchange)
+	BExchange *bexchange;
+{
+	return P_BExchange_end(bexchange);
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_BExchange_free(bexchange)
+	BExchange *bexchange;
+{
+	int direct;
+
+	P_BExchange_free(bexchange);
+
+	for (direct = 0; direct < 2; direct++) {
+		if (bexchange->overlap[direct] > 0) {
+			if (bexchange->send[direct]) {
+				MPI_Request_free(&bexchange->req[direct]);
+			}
+			if (bexchange->recv[direct]) {
+				MPI_Request_free(&bexchange->req[2 + direct]);
+			}
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_BExchange( a, ndims, stride, blklen, bdim, overlap, datatype, 
+	comm, period )
+
+	void *a;
+	int ndims, *stride, *blklen, bdim, overlap[2];
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+{
+	BExchange bexchange;
+	int ierr;
+	if (ierr = PST_BExchange_init(a, ndims, stride, blklen, bdim, overlap,
+		datatype, comm, period, &bexchange) != 0) {
+		return ierr;
+	}
+	PST_BExchange_start(&bexchange);
+	PST_BExchange_end(&bexchange);
+	PST_BExchange_free(&bexchange);
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// v.1.4 - manual packing //
+// -------------------------------------------------------------------------- //
+int P_BExchange_mp_init ( ndims, stride, blklen, bdim, overlap, datatype, 
+	comm, period, bexchange )
+	
+	int ndims, *stride, *blklen, bdim, overlap[2];
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+	BExchange *bexchange;
+{
+	int nproc, iproc, direct, idim, sendproc[2], recvproc[2];
+	int count, strd, sbind[2], rbind[2], send[2], recv[2];
+	void *sbuf[2], *rbuf[2];
+	int buf_id[4];
+	int mdims[2][MAX_PARLIB_MP_DIMS];
+	int msize[2];
+	MPI_Aint lb, fsize;
+	MPI_Datatype btype[2];
+
+// Setting degenerate-success cases conditions for consistency
+//	including: overlap[]=0, nproc=0, iproc=MPI_UNDEFINED
+	for (direct = 0; direct < 2; direct++) {
+		bexchange->overlap[direct] = 0;
+	}
+
+//
+// Check input parameters
+//
+	if (ndims < 1) { return 1; }
+	if (bdim < 1 || bdim > ndims) { return 2; }
+	if (overlap[0] == 0 && overlap[1] == 0) { return 0; } /* success */
+	for (idim = 0; idim < ndims; idim++) {
+		if (stride[idim] <= 0) { return 4; }
+	}
+	for (direct = 0; direct < 2; direct++) {
+		if (overlap[direct] < 0) { return 3; }
+		if (overlap[direct] > blklen[bdim - 1]) { return 5; }
+	}
+	if (ndims > MAX_PARLIB_MP_DIMS) { return 6; }
+//
+// Define the number of processors in the group and the rank
+//
+	if (comm == MPI_COMM_NULL) { return 0; } // empty communicator
+
+	MPI_Comm_size(comm, &nproc);
+	if (nproc == 0) { return 0; } /* success */
+	MPI_Comm_rank(comm, &iproc);
+	if (iproc == MPI_UNDEFINED) { return 0; } /* the process does not belong to the group */
+	sendproc[0] = (iproc == 0 ? nproc - 1 : iproc - 1);
+	recvproc[0] = (iproc == nproc - 1 ? 0 : iproc + 1);
+	sendproc[1] = recvproc[0];
+	recvproc[1] = sendproc[0];
+	send[0] = iproc > 0 || period;
+	recv[0] = iproc < nproc - 1 || period;
+	send[1] = recv[0];
+	recv[1] = send[0];
+	MPI_Type_get_extent(datatype, &lb, &fsize);
+//
+// Define data type, message sizes and buffers for the boundaries
+//
+
+	for (direct = 0; direct < 2; direct++) {
+		if (overlap[direct] > 0) {
+
+			msize[direct] = 1;
+			btype[direct] = datatype;
+
+			for (idim = 0; idim < ndims; idim++) {
+				if (bdim == idim + 1) {
+					count = overlap[direct];
+				}
+				else {
+					count = blklen[idim];
+				}
+				mdims[direct][idim] = count;
+				msize[direct] *= count;
+			}
+
+			if (send[direct]) {
+				sbuf[direct] = get_plbuf(msize[direct] * fsize * sizeof(char),
+					&buf_id[direct]);
+			}
+			else
+			{
+				sbuf[direct] = NULL;
+				buf_id[direct] = -1;
+			}
+
+			if (recv[direct]) {
+				rbuf[direct] = get_plbuf(msize[direct] * fsize * sizeof(char),
+					&buf_id[2 + direct]);
+			}
+			else
+			{
+				rbuf[direct] = NULL;
+				buf_id[2 + direct] = -1;
+			}
+		}
+		else
+		{
+			msize[direct] = 0;
+			for (idim = 0; idim < ndims; idim++) {
+				mdims[direct][idim] = 0;
+			}
+
+			sbuf[direct] = NULL;
+			rbuf[direct] = NULL;
+
+			buf_id[direct] = -1;
+			buf_id[2 + direct] = -1;
+		}
+	}
+//
+// Determine the begining of boundaries
+//
+	strd = 1;
+	for (idim = 0; idim < bdim - 1; idim++) {
+		strd = strd * stride[idim];
+	}
+	sbind[0] = 0;
+	rbind[0] = blklen[bdim - 1] * strd;
+	sbind[1] = (blklen[bdim - 1] - overlap[1])*strd;
+	rbind[1] = -overlap[1] * strd;
+
+	for (direct = 0; direct < 2; direct++) {
+		bexchange->overlap[direct] = overlap[direct];
+		bexchange->send[direct] = send[direct];
+		bexchange->recv[direct] = recv[direct];
+		bexchange->btype[direct] = btype[direct];
+		bexchange->sendproc[direct] = sendproc[direct];
+		bexchange->recvproc[direct] = recvproc[direct];
+		bexchange->sbind[direct] = sbind[direct];
+		bexchange->rbind[direct] = rbind[direct];
+
+		bexchange->sbuf[direct] = sbuf[direct];
+		bexchange->rbuf[direct] = rbuf[direct];
+		bexchange->buf_id[direct] = buf_id[direct];
+		bexchange->buf_id[2 + direct] = buf_id[2 + direct];
+
+		memcpy(bexchange->mdims[direct], mdims[direct], ndims * sizeof(int));
+		bexchange->msize[direct] = msize[direct];
+	}
+	bexchange->comm = comm;
+	bexchange->fsize = fsize;
+
+	bexchange->ndims = ndims;
+	memcpy(bexchange->stride, stride, ndims * sizeof(int));
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_BExchange_mp_start( a, bexchange )
+	void *a;
+	BExchange *bexchange;
+{
+	int direct;
+	char *ach = (char *)a;
+
+	for (direct = 0; direct < 2; direct++) {
+
+		if (bexchange->overlap[direct] > 0) {
+			if (bexchange->recv[direct]) {
+				MPI_Irecv(bexchange->rbuf[direct], bexchange->msize[direct], bexchange->btype[direct],
+					bexchange->recvproc[direct], 0, bexchange->comm,
+					&bexchange->req[2 + direct]);
+			}
+			else
+			{
+				bexchange->req[2 + direct] = MPI_REQUEST_NULL;
+			}
+			if (bexchange->send[direct]) {
+				copy_to_buffer(
+					(char*)bexchange->sbuf[direct],
+					ach + bexchange->sbind[direct] * bexchange->fsize,
+					bexchange->ndims, bexchange->mdims[direct], bexchange->stride, bexchange->fsize);
+
+				MPI_Isend(bexchange->sbuf[direct], bexchange->msize[direct], bexchange->btype[direct],
+					bexchange->sendproc[direct], 0, bexchange->comm, 
+					&bexchange->req[direct]);
+			}
+			else
+			{
+				bexchange->req[direct] = MPI_REQUEST_NULL;
+			}
+		}
+		else
+		{
+			bexchange->req[direct] = MPI_REQUEST_NULL;
+			bexchange->req[2 + direct] = MPI_REQUEST_NULL;
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_BExchange_mp_end ( a, bexchange ) 
+	void *a;
+	BExchange *bexchange;
+{
+	MPI_Status status[4];
+	
+	int direct;
+	char *ach = (char *)a;
+
+	MPI_Waitall(2, &bexchange->req[2], &status[2]);
+	for (direct = 0; direct < 2; direct++) {
+		if (bexchange->overlap[direct] > 0) {
+			if (bexchange->recv[direct]) {
+				copy_from_buffer(ach + bexchange->rbind[direct] * bexchange->fsize,
+					(char*)bexchange->rbuf[direct],
+					bexchange->ndims, bexchange->mdims[direct], bexchange->stride, bexchange->fsize);
+			}
+		}
+	}
+
+	MPI_Waitall(2, bexchange->req, status);
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_BExchange_mp_free ( bexchange ) 
+	BExchange *bexchange;
+{
+	int direct;
+
+	for (direct = 0; direct < 2; direct++) {
+		if (bexchange->overlap[direct] > 0) {
+			if (bexchange->send[direct]) {
+				free_plbuf(bexchange->sbuf[direct],
+					bexchange->buf_id[direct]);
+			}
+			if (bexchange->recv[direct]) {
+				free_plbuf(bexchange->rbuf[direct],
+					bexchange->buf_id[2 + direct]);
+			}
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_BExchange_mp(a, ndims, stride, blklen, bdim, overlap, datatype,
+	comm, period)
+	
+	void *a;
+	int ndims, *stride, *blklen, bdim, overlap[2];
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+{
+	BExchange bexchange;
+	int ierr;
+	if (ierr = P_BExchange_mp_init(ndims, stride, blklen, bdim, overlap,
+		datatype, comm, period, &bexchange) != 0) {
+		return ierr;
+	}
+	P_BExchange_mp_start(a, &bexchange);
+	P_BExchange_mp_end(a, &bexchange);
+	P_BExchange_mp_free(&bexchange);
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+
+// v.1.5 - persistent exchanges for manual packing //
+// -------------------------------------------------------------------------- //
+int PST_BExchange_mp_init(ndims, stride, blklen, bdim, overlap, datatype,
+	comm, period, bexchange)
+	
+	int ndims, *stride, *blklen, bdim, overlap[2];
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+	BExchange *bexchange;
+{
+	int direct;
+	
+	int ierr = P_BExchange_mp_init(ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, bexchange);
+	if (ierr != 0) return ierr;
+
+	for (direct = 0; direct < 2; direct++) {
+		if (bexchange->overlap[direct] > 0) {
+			if (bexchange->send[direct]) {
+				MPI_Send_init(bexchange->sbuf[direct], bexchange->msize[direct], bexchange->btype[direct],
+					bexchange->sendproc[direct], 0, bexchange->comm, 
+					&bexchange->req[direct]);
+			}
+			if (bexchange->recv[direct]) {
+				MPI_Recv_init(bexchange->rbuf[direct], bexchange->msize[direct], bexchange->btype[direct],
+					bexchange->recvproc[direct], 0, bexchange->comm, 
+					&bexchange->req[2 + direct]);
+			}
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_BExchange_mp_start( a, bexchange )
+	void *a;
+	BExchange *bexchange;
+{
+	int direct;
+	char *ach = (char *)a;
+	
+	for (direct = 0; direct < 2; direct++) {
+		if (bexchange->overlap[direct] > 0) {
+			
+			if (bexchange->recv[direct]) {
+				MPI_Start(&bexchange->req[2 + direct]);
+			}
+			else
+				bexchange->req[2 + direct] = MPI_REQUEST_NULL;
+
+			if (bexchange->send[direct]) {
+				copy_to_buffer((char*)bexchange->sbuf[direct],
+					ach + bexchange->sbind[direct] * bexchange->fsize,
+					bexchange->ndims, bexchange->mdims[direct], bexchange->stride, bexchange->fsize);
+
+				MPI_Start(&bexchange->req[direct]);
+			}
+			else
+				bexchange->req[direct] = MPI_REQUEST_NULL;
+		}
+		else
+		{
+			bexchange->req[direct] = MPI_REQUEST_NULL;
+			bexchange->req[2 + direct] = MPI_REQUEST_NULL;
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_BExchange_mp_end( a, bexchange )
+	void* a;
+	BExchange *bexchange;
+{
+	return P_BExchange_mp_end(a, bexchange);
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_BExchange_mp_free( bexchange )
+	BExchange *bexchange;
+{
+	int direct;
+
+	P_BExchange_mp_free(bexchange);
+
+	for (direct = 0; direct < 2; direct++) {
+		if (bexchange->overlap[direct] > 0) {
+			if (bexchange->send[direct]) {
+				MPI_Request_free(&bexchange->req[direct]);
+			}
+			if (bexchange->recv[direct]) {
+				MPI_Request_free(&bexchange->req[2 + direct]);
+			}
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_BExchange_mp( a, ndims, stride, blklen, bdim, overlap, datatype,
+	comm, period )
+	
+	void *a;
+	MPI_Datatype datatype;
+	int ndims, *stride, *blklen, bdim, overlap[2];
+	int period;
+	MPI_Comm comm;
+{
+	BExchange bexchange;
+	int ierr;
+	if (ierr = PST_BExchange_mp_init(ndims, stride, blklen, bdim, overlap,
+		datatype, comm, period, &bexchange) != 0) {
+		return ierr;
+	}
+	PST_BExchange_mp_start(a, &bexchange);
+	PST_BExchange_mp_end(a, &bexchange);
+	PST_BExchange_mp_free(&bexchange);
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// v.1.95 - choice subroutines //
+// -------------------------------------------------------------------------- //
+int P_BExchange_opt_init ( a, ndims, stride, blklen, bdim, overlap, datatype, 
+	comm, period, bexchange, exch_mode )
+	
+	void *a;
+	int ndims, *stride, *blklen, bdim, overlap[2];
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+	BExchange *bexchange;
+	int exch_mode;
+{
+	if (exch_mode == IS_MPI_TYPED) {
+		return P_BExchange_init(ndims, stride, blklen, bdim, overlap, datatype,
+			comm, period, bexchange);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK) {
+		return P_BExchange_mp_init(ndims, stride, blklen, bdim, overlap, datatype,
+			comm, period, bexchange);
+	}
+	if (exch_mode == IS_MPI_TYPED_PERSISTENT) {
+		return PST_BExchange_init(a, ndims, stride, blklen, bdim, overlap, datatype,
+			comm, period, bexchange);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK_PERSISTENT) {
+		return PST_BExchange_mp_init(ndims, stride, blklen, bdim, overlap, datatype,
+			comm, period, bexchange);
+	}
+
+	return 999;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_BExchange_opt_start ( a, bexchange, exch_mode )
+	void *a;
+	BExchange *bexchange;
+	int exch_mode;
+{
+	if (exch_mode == IS_MPI_TYPED) {
+		return P_BExchange_start(a, bexchange);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK) {
+		return P_BExchange_mp_start(a, bexchange);
+	}
+	if (exch_mode == IS_MPI_TYPED_PERSISTENT) {
+		return PST_BExchange_start(bexchange);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK_PERSISTENT) {
+		return PST_BExchange_mp_start(a, bexchange);
+	}
+
+	return 999;
+}
+// -------------------------------------------------------------------------- //
+
+
+// -------------------------------------------------------------------------- //
+int P_BExchange_opt_end ( a, bexchange, exch_mode ) 
+	void *a;
+	BExchange *bexchange;
+	int exch_mode;
+{
+	if (exch_mode == IS_MPI_TYPED) {
+		return P_BExchange_end(bexchange);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK) {
+		return P_BExchange_mp_end(a, bexchange);
+	}
+	if (exch_mode == IS_MPI_TYPED_PERSISTENT) {
+		return PST_BExchange_end(bexchange);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK_PERSISTENT) {
+		return PST_BExchange_mp_end(a, bexchange);
+	}
+
+	return 999;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_BExchange_opt_free ( bexchange, exch_mode ) 
+	BExchange *bexchange;
+	int exch_mode;
+{
+	if (exch_mode == IS_MPI_TYPED) {
+		return P_BExchange_free(bexchange);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK) {
+		return P_BExchange_mp_free(bexchange);
+	}
+	if (exch_mode == IS_MPI_TYPED_PERSISTENT) {
+		return PST_BExchange_free(bexchange);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK_PERSISTENT) {
+		return PST_BExchange_mp_free(bexchange);
+	}
+
+	return 999;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_BExchange_opt ( a, ndims, stride, blklen, bdim, overlap, datatype, 
+	comm, period, exch_mode )
+	
+	void *a;
+	int ndims, *stride, *blklen, bdim, overlap[2];
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+	int exch_mode;
+{
+	BExchange bexchange;
+	int ierr;
+
+	if (exch_mode == IS_MPI_TYPED) {
+		if (ierr = P_BExchange_init(ndims, stride, blklen, bdim, overlap, datatype,
+			comm, period, &bexchange) != 0) {
+			return ierr;
+		}
+
+		P_BExchange_start(a, &bexchange);
+		P_BExchange_end(&bexchange);
+		P_BExchange_free(&bexchange);
+		return 0;
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK) {
+		if (ierr = P_BExchange_mp_init(ndims, stride, blklen, bdim, overlap, datatype,
+			comm, period, &bexchange) != 0) {
+			return ierr;
+		}
+
+		P_BExchange_mp_start(a, &bexchange);
+		P_BExchange_mp_end(a, &bexchange);
+		P_BExchange_mp_free(&bexchange);
+		return 0;
+	}
+	if (exch_mode == IS_MPI_TYPED_PERSISTENT) {
+		if (ierr = PST_BExchange_init(a, ndims, stride, blklen, bdim, overlap, datatype,
+			comm, period, &bexchange) != 0) {
+			return ierr;
+		}
+
+		PST_BExchange_start(&bexchange);
+		PST_BExchange_end(&bexchange);
+		PST_BExchange_free(&bexchange);
+		return 0;
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK_PERSISTENT) {
+		if (ierr = PST_BExchange_mp_init(ndims, stride, blklen, bdim, overlap, datatype,
+			comm, period, &bexchange) != 0) {
+			return ierr;
+		}
+		PST_BExchange_mp_start(a, &bexchange);
+		PST_BExchange_mp_end(a, &bexchange);
+		PST_BExchange_mp_free(&bexchange);
+		return 0;
+	}
+
+	return 999;
+}
+// -------------------------------------------------------------------------- //
\ No newline at end of file
diff --git a/ParLib.src/bexchangef.c b/ParLib.src/bexchangef.c
index 5acbf5e24914b4f0a56e837ce54e09219fa10918..9129931bd53566c30bc68fb2175bec98a690e8c3 100644
--- a/ParLib.src/bexchangef.c
+++ b/ParLib.src/bexchangef.c
@@ -1,6 +1,7 @@
 #include <stdlib.h>
 #include "parlib.h"
 
+// -------------------------------------------------------------------------- //
 #ifdef FORTRANUNDERSCORE
 void p_bexchange_init_ ( ndims, stride, blklen, bdim, overlap, datatype, 
 		comm, period, bexchange, ierr )
@@ -12,14 +13,14 @@ void p_bexchange_init ( ndims, stride, blklen, bdim, overlap, datatype,
 		comm, period, bexchange, ierr )
 #endif
 
-   MPI_Fint *datatype, *ndims, *stride, *blklen, *bdim, *overlap, 
-		*period, *ierr, *comm; 
+	MPI_Fint *datatype, *ndims, *stride, *blklen, *bdim, *overlap,
+		*period, *ierr, *comm;
 	BExchange **bexchange;
 {
-   *bexchange = (BExchange *) malloc ( sizeof (BExchange) );
-   *ierr = P_BExchange_init ( (int)*ndims, (int *)stride, (int *)blklen,
-	(int)*bdim, (int *)overlap, MPI_Type_f2c(*datatype), 
-	MPI_Comm_f2c(*comm), (int)*period, *bexchange );
+	*bexchange = (BExchange *) malloc ( sizeof (BExchange) );
+	*ierr = P_BExchange_init ( (int)*ndims, (int *)stride, (int *)blklen,
+		(int)*bdim, (int *)overlap, MPI_Type_f2c(*datatype), 
+		MPI_Comm_f2c(*comm), (int)*period, *bexchange );
 }
 
 #ifdef FORTRANUNDERSCORE
@@ -30,11 +31,11 @@ void p_bexchange_start__ ( a, bexchange, ierr )
 void p_bexchange_start ( a, bexchange, ierr )
 #endif
 
-   void *a;
-   BExchange **bexchange;
+	void *a;
+	BExchange **bexchange;
 	MPI_Fint *ierr;
 {
-   *ierr = P_BExchange_start ( a, *bexchange );
+	*ierr = P_BExchange_start(a, *bexchange);
 }
 
 #ifdef FORTRANUNDERSCORE
@@ -45,10 +46,10 @@ void p_bexchange_end__ ( bexchange, ierr )
 void p_bexchange_end ( bexchange, ierr )
 #endif
 
-   BExchange **bexchange;
-   MPI_Fint *ierr;
+	BExchange **bexchange;
+	MPI_Fint *ierr;
 {
-   *ierr = P_BExchange_end ( *bexchange );
+	*ierr = P_BExchange_end(*bexchange);
 }
 
 #ifdef FORTRANUNDERSCORE
@@ -58,11 +59,12 @@ void p_bexchange_free__ ( bexchange, ierr )
 #else
 void p_bexchange_free ( bexchange, ierr )
 #endif
-   BExchange **bexchange;
-   MPI_Fint *ierr;
+
+	BExchange **bexchange;
+	MPI_Fint *ierr;
 {
-   P_BExchange_free ( *bexchange );
-   free ( *bexchange );
+	P_BExchange_free(*bexchange);
+	free(*bexchange);
 }
 
 #ifdef FORTRANUNDERSCORE
@@ -75,12 +77,487 @@ void p_bexchange__ ( a, ndims, stride, blklen, bdim, overlap, datatype,
 void p_bexchange ( a, ndims, stride, blklen, bdim, overlap, datatype,
 		comm, period, ierr )
 #endif
+
+	void *a;
+	MPI_Fint *ndims, *stride, *blklen, *bdim, *overlap, *period, *ierr;
+	MPI_Fint *datatype;
+	MPI_Fint *comm;
+{
+	*ierr = P_BExchange(a, (int)*ndims, (int *)stride, (int *)blklen,
+		(int)*bdim, (int *)overlap, MPI_Type_f2c(*datatype),
+		MPI_Comm_f2c(*comm), (int)*period);
+}
+// -------------------------------------------------------------------------- //
+
+// v.1.3 - persistent exchanges //
+// -------------------------------------------------------------------------- //
+#ifdef FORTRANUNDERSCORE
+void pst_bexchange_init_ ( a, ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, bexchange, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_bexchange_init__ ( a, ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, bexchange, ierr )
+#else
+void pst_bexchange_init(a, ndims, stride, blklen, bdim, overlap, datatype,
+	    comm, period, bexchange, ierr)
+#endif
+
+	void *a;
+	MPI_Fint *datatype, *ndims, *stride, *blklen, *bdim, *overlap, 
+		*period, *ierr, *comm; 
+	BExchange **bexchange;
+{
+	*bexchange = (BExchange *) malloc ( sizeof (BExchange) );
+	*ierr = PST_BExchange_init ( a, (int)*ndims, (int *)stride, (int *)blklen,
+		(int)*bdim, (int *)overlap, MPI_Type_f2c(*datatype), 
+		MPI_Comm_f2c(*comm), (int)*period, *bexchange );
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_bexchange_start_ ( bexchange, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_bexchange_start__ ( bexchange, ierr )
+#else
+void pst_bexchange_start ( bexchange, ierr )
+#endif
+
+	BExchange **bexchange;
+	MPI_Fint *ierr;
+{
+	*ierr = PST_BExchange_start(*bexchange);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_bexchange_end_ ( bexchange, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_bexchange_end__ ( bexchange, ierr )
+#else
+void pst_bexchange_end ( bexchange, ierr )
+#endif
+
+	BExchange **bexchange;
+	MPI_Fint *ierr;
+{
+	*ierr = PST_BExchange_end(*bexchange);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_bexchange_free_ ( bexchange, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_bexchange_free__ ( bexchange, ierr )
+#else
+void pst_bexchange_free ( bexchange, ierr )
+#endif
+
+	BExchange **bexchange;
+	MPI_Fint *ierr;
+{
+	PST_BExchange_free(*bexchange);
+	free(*bexchange);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_bexchange_ ( a, ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_bexchange__ ( a, ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, ierr )
+#else
+void pst_bexchange ( a, ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, ierr )
+#endif
+	
+	void *a;
+	MPI_Fint *ndims, *stride, *blklen, *bdim, *overlap, *period, *ierr;
+	MPI_Fint *datatype;
+	MPI_Fint *comm;
+{
+	*ierr = PST_BExchange(a, (int)*ndims, (int *)stride, (int *)blklen,
+		(int)*bdim, (int *)overlap, MPI_Type_f2c(*datatype),
+		MPI_Comm_f2c(*comm), (int)*period);
+}
+// -------------------------------------------------------------------------- //
+
+// v.1.4 - manual packing //
+// -------------------------------------------------------------------------- //
+#ifdef FORTRANUNDERSCORE
+void p_bexchange_mp_init_ ( ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, bexchange, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_bexchange_mp_init__ ( ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, bexchange, ierr )
+#else
+void p_bexchange_mp_init ( ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, bexchange, ierr )
+#endif
+
+	MPI_Fint *datatype, *ndims, *stride, *blklen, *bdim, *overlap,
+		*period, *ierr, *comm;
+	BExchange **bexchange;
+{
+	*bexchange = (BExchange *)malloc(sizeof (BExchange));
+	*ierr = P_BExchange_mp_init((int)*ndims, (int *)stride, (int *)blklen,
+		(int)*bdim, (int *)overlap, MPI_Type_f2c(*datatype),
+		MPI_Comm_f2c(*comm), (int)*period, *bexchange);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_bexchange_mp_start_ ( a, bexchange, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_bexchange_mp_start__ ( a, bexchange, ierr )
+#else
+void p_bexchange_mp_start ( a, bexchange, ierr )
+#endif
+
+	void *a;
+	BExchange **bexchange;
+	MPI_Fint *ierr;
+{
+	*ierr = P_BExchange_mp_start(a, *bexchange);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_bexchange_mp_end_ ( a, bexchange, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_bexchange_mp_end__ ( a, bexchange, ierr )
+#else
+void p_bexchange_mp_end(a, bexchange, ierr)
+#endif
+
+	void *a;
+	BExchange **bexchange;
+	MPI_Fint *ierr;
+{
+	*ierr = P_BExchange_mp_end(a, *bexchange);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_bexchange_mp_free_ ( bexchange, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_bexchange_mp_free__ ( bexchange, ierr )
+#else
+void p_bexchange_mp_free ( bexchange, ierr )
+#endif
+
+	BExchange **bexchange;
+	MPI_Fint *ierr;
+{
+	P_BExchange_mp_free(*bexchange);
+	free(*bexchange);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_bexchange_mp_ ( a, ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_bexchange_mp__ ( a, ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, ierr )
+#else
+void p_bexchange_mp ( a, ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, ierr )
+#endif
+
+	void *a;
+	MPI_Fint *ndims, *stride, *blklen, *bdim, *overlap, *period, *ierr;
+	MPI_Fint *datatype;
+	MPI_Fint *comm;
+{
+	*ierr = P_BExchange_mp(a, (int)*ndims, (int *)stride, (int *)blklen,
+		(int)*bdim, (int *)overlap, MPI_Type_f2c(*datatype),
+		MPI_Comm_f2c(*comm), (int)*period);
+}
+// -------------------------------------------------------------------------- //
+
+// v.1.4 - persistent exchanges for manual packing //
+// -------------------------------------------------------------------------- //
+#ifdef FORTRANUNDERSCORE
+void pst_bexchange_mp_init_ ( ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, bexchange, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_bexchange_mp_init__ ( ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, bexchange, ierr )
+#else
+void pst_bexchange_mp_init ( ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, bexchange, ierr )
+#endif
+
+	MPI_Fint *datatype, *ndims, *stride, *blklen, *bdim, *overlap,
+		*period, *ierr, *comm;
+	BExchange **bexchange;
+{
+	*bexchange = (BExchange *)malloc(sizeof (BExchange));
+	*ierr = PST_BExchange_mp_init((int)*ndims, (int *)stride, (int *)blklen,
+		(int)*bdim, (int *)overlap, MPI_Type_f2c(*datatype),
+		MPI_Comm_f2c(*comm), (int)*period, *bexchange);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_bexchange_mp_start_ ( a, bexchange, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_bexchange_mp_start__ ( a, bexchange, ierr )
+#else
+void pst_bexchange_mp_start ( a, bexchange, ierr )
+#endif
+
+	void *a;
+	BExchange **bexchange;
+	MPI_Fint *ierr;
+{
+	*ierr = PST_BExchange_mp_start(a, *bexchange);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_bexchange_mp_end_ ( a, bexchange, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_bexchange_mp_end__ ( a, bexchange, ierr )
+#else
+void pst_bexchange_mp_end(a, bexchange, ierr)
+#endif
+
+	void *a;
+	BExchange **bexchange;
+	MPI_Fint *ierr;
+{
+	*ierr = PST_BExchange_mp_end(a, *bexchange);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_bexchange_mp_free_ ( bexchange, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_bexchange_mp_free__ ( bexchange, ierr )
+#else
+void pst_bexchange_mp_free ( bexchange, ierr )
+#endif
+
+	BExchange **bexchange;
+	MPI_Fint *ierr;
+{
+	PST_BExchange_mp_free(*bexchange);
+	free(*bexchange);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_bexchange_mp_ ( a, ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_bexchange_mp__ ( a, ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, ierr )
+#else
+void pst_bexchange_mp ( a, ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, ierr )
+#endif
+
+	void *a;
+	MPI_Fint *ndims, *stride, *blklen, *bdim, *overlap, *period, *ierr;
+	MPI_Fint *datatype;
+	MPI_Fint *comm;
+{
+	*ierr = PST_BExchange_mp(a, (int)*ndims, (int *)stride, (int *)blklen,
+		(int)*bdim, (int *)overlap, MPI_Type_f2c(*datatype),
+		MPI_Comm_f2c(*comm), (int)*period);
+}
+// -------------------------------------------------------------------------- //
+
+// v.1.95 - choice subroutines //
+// -------------------------------------------------------------------------- //
+#ifdef FORTRANUNDERSCORE
+void p_bexchange_opt_init_ ( a, ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, bexchange, exch_mode, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_bexchange_opt_init__ ( a, ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, bexchange, exch_mode, ierr )
+#else
+void p_bexchange_opt_init ( a, ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, bexchange, exch_mode, ierr )
+#endif
+
+	void *a;
+	MPI_Fint *datatype, *ndims, *stride, *blklen, *bdim, *overlap,
+		*period, *ierr, *comm, *exch_mode;
+	BExchange **bexchange;
+{
+	*bexchange = (BExchange *) malloc ( sizeof (BExchange) );
+	*ierr = P_BExchange_opt_init ( a, (int)*ndims, (int *)stride, (int *)blklen,
+		(int)*bdim, (int *)overlap, MPI_Type_f2c(*datatype), 
+		MPI_Comm_f2c(*comm), (int)*period, *bexchange, (int)*exch_mode );
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_bexchange_opt_start_ ( a, bexchange, exch_mode, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_bexchange_opt_start__ ( a, bexchange, exch_mode, ierr )
+#else
+void p_bexchange_opt_start ( a, bexchange, exch_mode, ierr )
+#endif
+
+	void *a;
+	BExchange **bexchange;
+	MPI_Fint *ierr, *exch_mode;
+{
+	*ierr = P_BExchange_opt_start(a, *bexchange, (int)*exch_mode);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_bexchange_opt_end_ ( a, bexchange, exch_mode, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_bexchange_opt_end__ ( a, bexchange, exch_mode, ierr )
+#else
+void p_bexchange_opt_end ( a, bexchange, exch_mode, ierr )
+#endif
+
+	void *a;
+	BExchange **bexchange;
+	MPI_Fint *ierr, *exch_mode;
+{
+	*ierr = P_BExchange_opt_end(a, *bexchange, (int)*exch_mode);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_bexchange_opt_free_ ( bexchange, exch_mode, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_bexchange_opt_free__ ( bexchange, exch_mode, ierr )
+#else
+void p_bexchange_opt_free ( bexchange, exch_mode, ierr )
+#endif
+
+	BExchange **bexchange;
+	MPI_Fint *ierr, *exch_mode;
+{
+	P_BExchange_opt_free(*bexchange, (int)*exch_mode);
+	free(*bexchange);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_bexchange_opt_ ( a, ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, exch_mode, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_bexchange_opt__ ( a, ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, exch_mode, ierr )
+#else
+void p_bexchange_opt ( a, ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, exch_mode, ierr )
+#endif
+
 	void *a;
-   int *ndims, *stride, *blklen, *bdim, *overlap, *period, *ierr;
-   MPI_Datatype *datatype;
-   MPI_Comm *comm;
+	MPI_Fint *ndims, *stride, *blklen, *bdim, *overlap, *period, *ierr;
+	MPI_Fint *datatype;
+	MPI_Fint *comm;
+	MPI_Fint *exch_mode;
 {
-   *ierr = P_BExchange ( a, (int)*ndims, (int *)stride, (int *)blklen, 
+	*ierr = P_BExchange_opt(a, (int)*ndims, (int *)stride, (int *)blklen,
+		(int)*bdim, (int *)overlap, MPI_Type_f2c(*datatype),
+		MPI_Comm_f2c(*comm), (int)*period, (int)*exch_mode);
+}
+// -------------------------------------------------------------------------- //
+
+
+// v.2.0 - regular communications [removed only on correct program exit] //
+// -------------------------------------------------------------------------- //
+#ifdef FORTRANUNDERSCORE
+void reg_bexchange_ ( a, ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, exch_id, exch_mode, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void reg_bexchange__ ( a, ndims, stride, blklen, bdim, overlap, datatype,
+		comm, period, exch_id, exch_mode, ierr )
+#else
+void reg_bexchange ( a, ndims, stride, blklen, bdim, overlap, datatype, 
+		comm, period, exch_id, exch_mode, ierr )
+#endif
+
+	void *a;
+	MPI_Fint *datatype, *ndims, *stride, *blklen, *bdim, *overlap,
+		*period, *ierr, *comm, *exch_id, *exch_mode;
+{
+	BExchange *bexchange;
+	bexchange = (BExchange *) malloc ( sizeof (BExchange) );
+	
+	*ierr = P_BExchange_opt_init ( a, (int)*ndims, (int *)stride, (int *)blklen,
 		(int)*bdim, (int *)overlap, MPI_Type_f2c(*datatype), 
-		MPI_Comm_f2c(*comm), (int)*period );
+		MPI_Comm_f2c(*comm), (int)*period, bexchange, (int)*exch_mode );
+	if ((int)*ierr != 0) return;
+
+	*exch_id = save_bexch_handle(bexchange, (int)*exch_mode);
+}
+
+#ifdef FORTRANUNDERSCORE
+void start_bexchange_ ( a, exch_id, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void start_bexchange__ ( a, exch_id, ierr )
+#else
+void start_bexchange ( a, exch_id, ierr )
+#endif
+
+	void *a;
+	MPI_Fint *ierr, *exch_id;
+{
+	int exch_mode;
+	BExchange *bexchange;
+
+	get_bexch_handle(&bexchange, &exch_mode, (int)*exch_id);
+
+	*ierr = P_BExchange_opt_start(a, bexchange, exch_mode);
+}
+
+#ifdef FORTRANUNDERSCORE
+void end_bexchange_ ( a, exch_id, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void end_bexchange__ ( a, exch_id, ierr )
+#else
+void end_bexchange ( a, exch_id, ierr )
+#endif
+
+	void *a;
+	MPI_Fint *ierr, *exch_id;
+{
+	int exch_mode;
+	BExchange *bexchange;
+	
+	get_bexch_handle(&bexchange, &exch_mode, (int)*exch_id);
+
+	*ierr = P_BExchange_opt_end(a, bexchange, exch_mode);
+}
+
+#ifdef FORTRANUNDERSCORE
+void run_bexchange_ ( a, exch_id, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void run_bexchange__ ( a, exch_id, ierr )
+#else
+void run_bexchange ( a, exch_id, ierr )
+#endif
+
+	void *a;
+	MPI_Fint *ierr, *exch_id;
+{
+	int exch_mode;
+	BExchange *bexchange;
+	
+	get_bexch_handle(&bexchange, &exch_mode, (int)*exch_id);
+
+	*ierr = P_BExchange_opt_start(a, bexchange, exch_mode);
+	if ((int)*ierr != 0) return;
+
+	*ierr = P_BExchange_opt_end(a, bexchange, exch_mode);
+}
+
+#ifdef FORTRANUNDERSCORE
+void unreg_bexchange_ ( exch_id, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void unreg_bexchange__ ( exch_id, ierr )
+#else
+void unreg_bexchange ( exch_id, ierr )
+#endif
+
+	MPI_Fint *ierr, *exch_id;
+{
+	int exch_mode;
+	BExchange *bexchange;
+
+	get_bexch_handle(&bexchange, &exch_mode, (int)*exch_id);
+
+	*ierr = P_BExchange_opt_free(bexchange, exch_mode);
+	if ((int)*ierr != 0) return;
+	free(bexchange);
+
+	remove_bexch_handle((int)*exch_id);
 }
+// -------------------------------------------------------------------------- //
diff --git a/ParLib.src/parlib.c b/ParLib.src/parlib.c
new file mode 100644
index 0000000000000000000000000000000000000000..bf06244ba0b241f258c915c01e23b56378703a0e
--- /dev/null
+++ b/ParLib.src/parlib.c
@@ -0,0 +1,116 @@
+#include "parlib.h"
+#include "plutils.h"
+#include <stdlib.h>
+
+
+// BExchange list [definition]
+// -------------------------------------------------------------------------- //
+BExchange* bexch_hlist[MAX_BEXCH_HANDLES];	// list of BExchange handles
+int bexch_hmode[MAX_BEXCH_HANDLES];			// Bexchange mode for each handle
+int bexch_hptr;								// pointer to available BExchange handle
+// -------------------------------------------------------------------------- //
+
+// Transpose list [definition]
+// -------------------------------------------------------------------------- //
+Transposition* transp_hlist[MAX_TRANSP_HANDLES];	// list of Transpose handles
+int transp_hmode[MAX_TRANSP_HANDLES];				// Transpose mode for each handle
+int transp_hptr;									// pointer to available Transpose handle
+// -------------------------------------------------------------------------- //
+
+
+// ParLib init-deinit
+// -------------------------------------------------------------------------- //
+void ParLib_init()
+{
+	init_plbuf();	// pl-buffers
+
+	bexch_hptr = 0;		// BExchange list
+	transp_hptr = 0;	// Transpose list
+}
+
+void ParLib_deinit()
+{
+	int k, exch_mode;
+
+	// BExchange list
+	BExchange *bexchange;
+	for (k = 0; k < bexch_hptr; k++) {
+		get_bexch_handle(&bexchange, &exch_mode, k);
+
+		P_BExchange_opt_free(bexchange, exch_mode);
+		free(bexchange);
+	}
+	bexch_hptr = 0;
+
+	// Transp list
+	Transposition *transp;
+	for (k = 0; k < transp_hptr; k++) {
+		get_transp_handle(&transp, &exch_mode, k);
+
+		P_Transpose_opt_free(transp, exch_mode);
+		free(transp);
+	}
+	transp_hptr = 0;
+
+	deinit_plbuf();	// pl-buffers
+}
+// -------------------------------------------------------------------------- //
+
+
+// BExchange handle list interface
+// -------------------------------------------------------------------------- //
+int save_bexch_handle(BExchange *bexchange, int exch_mode)
+{
+	// saving handle
+	bexch_hlist[bexch_hptr] = bexchange;
+	bexch_hmode[bexch_hptr] = exch_mode;
+	bexch_hptr++;
+
+	return bexch_hptr - 1;
+}
+
+void get_bexch_handle(BExchange** bexchange, int* exch_mode, int exch_id)
+{
+	*bexchange = bexch_hlist[exch_id];
+	*exch_mode = bexch_hmode[exch_id];
+}
+
+void remove_bexch_handle(int exch_id)
+{
+	int k;
+	for (k = exch_id; k < bexch_hptr - 1; k++) {
+		bexch_hlist[k] = bexch_hlist[k + 1];
+		bexch_hmode[k] = bexch_hmode[k + 1];
+	}
+	if (bexch_hptr > 0) bexch_hptr--;
+}
+// -------------------------------------------------------------------------- //
+
+// Transposition handle list interface
+// -------------------------------------------------------------------------- //
+int save_transp_handle(Transposition *transp, int exch_mode)
+{
+	// saving handle
+	transp_hlist[transp_hptr] = transp;
+	transp_hmode[transp_hptr] = exch_mode;
+	transp_hptr++;
+
+	return transp_hptr - 1;
+}
+
+void get_transp_handle(Transposition** transp, int* exch_mode, int exch_id)
+{
+	*transp = transp_hlist[exch_id];
+	*exch_mode = transp_hmode[exch_id];
+}
+
+void remove_transp_handle(int exch_id)
+{
+	int k;
+	for (k = exch_id; k < transp_hptr - 1; k++) {
+		transp_hlist[k] = transp_hlist[k + 1];
+		transp_hmode[k] = transp_hmode[k + 1];
+	}
+	if (transp_hptr > 0) transp_hptr--;
+}
+// -------------------------------------------------------------------------- //
diff --git a/ParLib.src/parlib.h b/ParLib.src/parlib.h
index 52f27c30eb0a03ccf373a62aa62e60081cefd98a..f02678a5956a2149c40668482a82fd3ccca593e9 100644
--- a/ParLib.src/parlib.h
+++ b/ParLib.src/parlib.h
@@ -1,36 +1,217 @@
-#ifndef _MPI_INCLUDE
-#include <mpi.h>
-#endif
-
-typedef struct BExchange {
-	int				overlap[2], send[2], recv[2], btype[2]; 
-	int				sendproc[2], recvproc[2], sbind[2], rbind[2];
-	MPI_Comm			comm;
-   MPI_Request		sreq[2], rreq[2]; 
-   MPI_Aint			fsize;
-} BExchange;
-
-typedef struct Transposition {
-	MPI_Datatype *stype, *rtype;
-	int *sbeg, *rbeg;
-	MPI_Comm comm;
-	int nproc, iproc;
-	MPI_Request *sreq, *rreq;
-	MPI_Aint fsize;
-} Transposition;
-
-int P_BExchange_init ( int, int*, int*, int, int*, MPI_Datatype, 
-	MPI_Comm, int, BExchange* );
-int P_BExchange_start ( void*, BExchange* );
-int P_BExchange_end ( BExchange* );
-int P_BExchange_free ( BExchange* );
-int P_BExchange ( void*, int, int*, int*, int, int*, MPI_Datatype,
-	MPI_Comm, int );
-
-int P_Transpose_init ( int , int, int*, int, int*, int*, int*, int*, 
-	MPI_Datatype, MPI_Comm, int, Transposition* );
-int P_Transpose_start ( void*, void*, Transposition* );
-int P_Transpose_end ( Transposition* );
-int P_Transpose_free ( Transposition* );
-int P_Transpose ( int, void*, int, int*, void*, int, int*, int*, int*, 
-	int*, MPI_Datatype, MPI_Comm, int );
+#ifndef _MPI_INCLUDE
+#include <mpi.h>
+#endif
+
+#include "plutils.h"
+
+
+#define IS_MPI_TYPED					0
+#define IS_MPI_MANUAL_PACK				1
+#define IS_MPI_TYPED_PERSISTENT			2
+#define IS_MPI_MANUAL_PACK_PERSISTENT	3
+
+// ParLib v1.8 initialization
+// -------------------------------------------------------------------------- //
+void ParLib_init();
+void ParLib_deinit();
+// -------------------------- //
+
+typedef struct BExchange {
+	int overlap[2], send[2], recv[2];
+	int sendproc[2], recvproc[2], sbind[2], rbind[2];
+	MPI_Datatype btype[2];
+
+	MPI_Comm comm;
+	MPI_Request req[4];
+	MPI_Aint fsize;
+
+	// manual packing data
+	// --------------------------------- //
+	int ndims;
+	int stride[MAX_PARLIB_MP_DIMS];
+
+	int msize[2];
+	int mdims[2][MAX_PARLIB_MP_DIMS];
+
+	void *sbuf[2], *rbuf[2];
+	// --------------------------------- //
+
+	// memory management
+	// --------------------------------- //
+	int buf_id[4];
+	// --------------------------------- //
+} BExchange;
+
+// BExchange list [declaration]
+// -------------------------------------------------------------------------- //
+#define MAX_BEXCH_HANDLES			1024
+
+extern BExchange* bexch_hlist[MAX_BEXCH_HANDLES];	// list of BExchange handles
+extern int bexch_hmode[MAX_BEXCH_HANDLES];			// Bexchange mode for each handle
+extern int bexch_hptr;								// pointer to available BExchange handle
+// -------------------------------------------------------------------------- //
+// BExchange handle list interface
+// -------------------------------------------------------------------------- //
+int save_bexch_handle(BExchange *bexchange, int exch_mode);
+void get_bexch_handle(BExchange** bexchange, int* exch_mode, int exch_id);
+void remove_bexch_handle(int exch_id);
+// -------------------------------------------------------------------------- //
+
+
+// -------------------------------------------------------------------------- //
+int P_BExchange_init ( int, int*, int*, int, int*, MPI_Datatype, 
+	MPI_Comm, int, BExchange* );
+int P_BExchange_start ( void*, BExchange* );
+int P_BExchange_end ( BExchange* );
+int P_BExchange_free ( BExchange* );
+int P_BExchange ( void*, int, int*, int*, int, int*, MPI_Datatype,
+	MPI_Comm, int );
+// -------------------------------------------------------------------------- //
+
+// v.1.3 - persistent exchanges //
+// -------------------------------------------------------------------------- //
+int PST_BExchange_init(void*, int, int*, int*, int, int*, MPI_Datatype,
+	MPI_Comm, int, BExchange*);
+int PST_BExchange_start(BExchange*);
+int PST_BExchange_end(BExchange*);
+int PST_BExchange_free(BExchange*);
+int PST_BExchange(void*, int, int*, int*, int, int*, MPI_Datatype,
+	MPI_Comm, int);
+// -------------------------------------------------------------------------- //
+
+// v.1.4 - manual packing //
+// -------------------------------------------------------------------------- //
+int P_BExchange_mp_init(int, int*, int*, int, int*, MPI_Datatype,
+	MPI_Comm, int, BExchange*);
+int P_BExchange_mp_start(void*, BExchange*);
+int P_BExchange_mp_end(void*, BExchange*);
+int P_BExchange_mp_free(BExchange*);
+int P_BExchange_mp(void*, int, int*, int*, int, int*, MPI_Datatype,
+	MPI_Comm, int);
+// -------------------------------------------------------------------------- //
+
+// v.1.4 - persistent exchanges for manual packing //
+// -------------------------------------------------------------------------- //
+int PST_BExchange_mp_init(int, int*, int*, int, int*, MPI_Datatype,
+	MPI_Comm, int, BExchange*);
+int PST_BExchange_mp_start(void*, BExchange*);
+int PST_BExchange_mp_end(void*, BExchange*);
+int PST_BExchange_mp_free(BExchange*);
+int PST_BExchange_mp(void*, int, int*, int*, int, int*, MPI_Datatype,
+	MPI_Comm, int);
+// -------------------------------------------------------------------------- //
+
+
+// v.1.95 - choice subroutines //
+// -------------------------------------------------------------------------- //
+int P_BExchange_opt_init(void*, int, int*, int*, int, int*, MPI_Datatype,
+	MPI_Comm, int, BExchange*, int);
+int P_BExchange_opt_start(void*, BExchange*, int);
+int P_BExchange_opt_end(void*, BExchange*, int);
+int P_BExchange_opt_free(BExchange*, int);
+int P_BExchange_opt(void*, int, int*, int*, int, int*, MPI_Datatype,
+	MPI_Comm, int, int);
+// -------------------------------------------------------------------------- //
+
+
+typedef struct Transposition {
+	void *psrc, *pdest;		// used only in persistent-type communications
+
+	MPI_Datatype *stype, *rtype;
+	int *sbeg, *rbeg;
+	MPI_Comm comm;
+
+	int nproc, iproc;
+	MPI_Request *req;
+	MPI_Aint fsize;
+
+	// manual packing data
+	// --------------------------------- //
+	int ndims;
+	int sstride[MAX_PARLIB_MP_DIMS], rstride[MAX_PARLIB_MP_DIMS];
+
+	int **sdims, **rdims;
+	int *ssize, *rsize;
+
+	void **sbuf, **rbuf;		// [nproc] buffers
+	// --------------------------------- //
+
+	// memory management
+	// --------------------------------- //
+	int *mem_dims;					// [nproc * ndims] pool
+	void *mem_sbuf, *mem_rbuf;		// [nproc * sum(msize)] pools
+
+	int buf_id[9];
+	// --------------------------------- //
+} Transposition;
+
+// Transpose list [declaration]
+// -------------------------------------------------------------------------- //
+#define MAX_TRANSP_HANDLES			128
+
+extern Transposition* transp_hlist[MAX_TRANSP_HANDLES];	// list of Transpose handles
+extern int transp_hmode[MAX_TRANSP_HANDLES];			// Transpose mode for each handle
+extern int transp_hptr;									// pointer to available Transpose handle
+// -------------------------------------------------------------------------- //
+// Transposition handle list interface
+// -------------------------------------------------------------------------- //
+int save_transp_handle(Transposition *transp, int exch_mode);
+void get_transp_handle(Transposition** transp, int* exch_mode, int exch_id);
+void remove_transp_handle(int exch_id);
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_Transpose_init ( int , int, int*, int, int*, int*, int*, int*, 
+	MPI_Datatype, MPI_Comm, int, Transposition* );
+int P_Transpose_start ( void*, void*, Transposition* );
+int P_Transpose_end ( Transposition* );
+int P_Transpose_free ( Transposition* );
+int P_Transpose ( int, void*, int, int*, void*, int, int*, int*, int*, 
+	int*, MPI_Datatype, MPI_Comm, int );
+// -------------------------------------------------------------------------- //
+
+// v.1.95 - persistent exchanges //
+// -------------------------------------------------------------------------- //
+int PST_Transpose_init(int, void*, int, int*, void*, int, int*, int*, int*, int*,
+	MPI_Datatype, MPI_Comm, int, Transposition*);
+int PST_Transpose_start(Transposition*);
+int PST_Transpose_end(Transposition*);
+int PST_Transpose_free(Transposition*);
+int PST_Transpose(int, void*, int, int*, void*, int, int*, int*, int*,
+	int*, MPI_Datatype, MPI_Comm, int);
+// -------------------------------------------------------------------------- //
+
+// v.1.7 - manual packing //
+// -------------------------------------------------------------------------- //
+int P_Transpose_mp_init(int, int, int*, int, int*, int*, int*, int*,
+	MPI_Datatype, MPI_Comm, int, Transposition*);
+int P_Transpose_mp_start(void*, void*, Transposition*);
+int P_Transpose_mp_end(void*, void*, Transposition*);
+int P_Transpose_mp_free(Transposition*);
+int P_Transpose_mp(int, void*, int, int*, void*, int, int*, int*, int*,
+	int*, MPI_Datatype, MPI_Comm, int);
+// -------------------------------------------------------------------------- //
+
+// v.1.95 - persistent exchanges for manual packing //
+// -------------------------------------------------------------------------- //
+int PST_Transpose_mp_init(int, void*, int, int*, void*, int, int*, int*, int*, int*,
+	MPI_Datatype, MPI_Comm, int, Transposition*);
+int PST_Transpose_mp_start(void*, void*, Transposition*);
+int PST_Transpose_mp_end(void*, void*, Transposition*);
+int PST_Transpose_mp_free(Transposition*);
+int PST_Transpose_mp(int, void*, int, int*, void*, int, int*, int*, int*,
+	int*, MPI_Datatype, MPI_Comm, int);
+// -------------------------------------------------------------------------- //
+
+
+// v.1.95 - choice subroutines //
+// -------------------------------------------------------------------------- //
+int P_Transpose_opt_init(int, void*, int, int*, void*, int, int*, int*, int*, int*,
+	MPI_Datatype, MPI_Comm, int, Transposition*, int);
+int P_Transpose_opt_start(void*, void*, Transposition*, int);
+int P_Transpose_opt_end(void*, void*, Transposition*, int);
+int P_Transpose_opt_free(Transposition*, int);
+int P_Transpose_opt(int, void*, int, int*, void*, int, int*, int*, int*,
+	int*, MPI_Datatype, MPI_Comm, int, int);
+// -------------------------------------------------------------------------- //
+
diff --git a/ParLib.src/parlibf.c b/ParLib.src/parlibf.c
index d6b42ea0d92ccf16f19fdb3eadcb3ed2201068a2..620ad39b539c8672d0ac5b5e59bd07c4020a3629 100644
--- a/ParLib.src/parlibf.c
+++ b/ParLib.src/parlibf.c
@@ -1,6 +1,7 @@
+#include <stdlib.h>
 #include "parlib.h"
 
-// Additionall calls to make this version 1.1 compliant with version 2.1
+// ParLib v1.8 initialization
 // -------------------------------------------------------------------------- //
 #ifdef FORTRANUNDERSCORE
 void parlib_init_()
@@ -10,6 +11,7 @@ void parlib_init__()
 void parlib_init()
 #endif
 {
+	ParLib_init();
 }
 
 #ifdef FORTRANUNDERSCORE
@@ -20,5 +22,6 @@ void parlib_deinit__()
 void parlib_deinit()
 #endif
 {
+	ParLib_deinit();
 }
 // -------------------------------------------------------------------------- //
diff --git a/ParLib.src/parlibf.h b/ParLib.src/parlibf.h
index 5305b1ab4f5bd2592f8f9674194713e46e14fb8b..f1be3578118265310db6692cbd9f5ec642a2f41d 100644
--- a/ParLib.src/parlibf.h
+++ b/ParLib.src/parlibf.h
@@ -1,2 +1,17 @@
       INTEGER HANDLE_SIZE 
       PARAMETER (HANDLE_SIZE = 2) 
+
+      INTEGER MAX_PARLIB_MP_DIMS
+	  PARAMETER (MAX_PARLIB_MP_DIMS = 6)
+
+      INTEGER IS_MPI_TYPED
+      PARAMETER (IS_MPI_TYPED = 0)
+
+      INTEGER IS_MPI_MANUAL_PACK
+      PARAMETER (IS_MPI_MANUAL_PACK = 1)
+
+      INTEGER IS_MPI_TYPED_PERSISTENT
+      PARAMETER (IS_MPI_TYPED_PERSISTENT = 2)
+
+      INTEGER IS_MPI_MANUAL_PACK_PERSISTENT
+      PARAMETER (IS_MPI_MANUAL_PACK_PERSISTENT = 3)
diff --git a/ParLib.src/plutils.c b/ParLib.src/plutils.c
new file mode 100644
index 0000000000000000000000000000000000000000..453b4a329266bf85884ccfbe5f0166548777032d
--- /dev/null
+++ b/ParLib.src/plutils.c
@@ -0,0 +1,719 @@
+#include "plutils.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+// parlib buffers [definition]
+// -------------------------------------------------------------------------- //
+void *plbuf[MAX_PL_BUFS];
+int plbuf_size[MAX_PL_BUFS];
+int plbuf_status[MAX_PL_BUFS];
+
+int plbuf_ptr;
+// -------------------------------------------------------------------------- //
+
+// using loop_count pragma expectations
+// ----------------------------------------------------------------------------------- //
+#define CP_EXPECT_5D_J1		21			// number of vertical levels[1]
+#define CP_EXPECT_5D_J2		73			// number of vertical levels[2]
+#define CP_EXPECT_5D_K1		1			// number of variables[1]
+#define CP_EXPECT_5D_K2		2			// number of variables[2]
+#define CP_EXPECT_5D_K3		5			// number of variables[3]
+#define CP_EXPECT_5D_K4		10			// number of variables[4]
+#define CP_EXPECT_5D_Q1		1			// number of time scices[1]
+#define CP_EXPECT_5D_Q2		2			// number of time scices[2]
+// ----------------------------------------------------------------------------------- //
+
+
+// buffer memory interface
+// ----------------------------------------------------------------------------------- //
+void init_plbuf()
+{
+	int k;
+	for (k = 0; k < MAX_PL_BUFS; k++) {
+		plbuf_size[k] = 0;
+		plbuf_status[k] = 0;
+	}
+	plbuf_ptr = 0;
+}
+
+void deinit_plbuf()
+{
+	int k;
+	for (k = 0; k < MAX_PL_BUFS; k++) {
+		if (plbuf_size[k] > 0) {
+			free(plbuf[k]);
+			plbuf_size[k] = 0;
+		}
+		plbuf_status[k] = 0;
+	}
+	plbuf_ptr = 0;
+}
+
+
+void* get_plbuf(int msize, int* id)
+{
+	int k, kbeg = plbuf_ptr;
+
+	for (k = kbeg; k < MAX_PL_BUFS; k++) {
+		if (!plbuf_status[k]) {
+			if (msize > plbuf_size[k]) {
+				if (plbuf_size[k] > 0) free(plbuf[k]);
+				plbuf_size[k] = msize;
+				plbuf[k] = (void*)malloc(plbuf_size[k]);
+			}
+
+			plbuf_status[k] = 1;
+			plbuf_ptr = k + 1;
+
+			(*id) = k;
+			return plbuf[k];
+		}
+	}
+
+	// no free buffer found:
+	(*id) = MAX_PL_BUFS;
+	return (void*)malloc(msize);
+}
+
+void free_plbuf(void* ptr, int id)
+{
+	if (id < 0) return;
+	if (id >= MAX_PL_BUFS) {
+		free(ptr);
+		return;
+	}
+
+	plbuf_status[id] = 0;
+	if (id < plbuf_ptr) plbuf_ptr = id;
+}
+// ----------------------------------------------------------------------------------- //
+
+// 1D copy
+// ----------------------------------------------------------------------------------- //
+inline void copy_to_buffer_1d(char* _RESTRICT buf, const char* _RESTRICT const a,
+	const int nx)
+{
+	int i;
+
+	if (nx < MIN_MEMCPY_BLOCK)
+	for (i = 0; i < nx; i++)
+		buf[i] = a[i];
+	else
+		memcpy(buf, a, nx * sizeof(char));
+}
+
+inline void copy_from_buffer_1d(char* _RESTRICT a, const char* _RESTRICT const buf,
+	const int nx)
+{
+	int i;
+
+	if (nx < MIN_MEMCPY_BLOCK)
+	for (i = 0; i < nx; i++)
+		a[i] = buf[i];
+	else
+		memcpy(a, buf, nx * sizeof(char));
+}
+// ----------------------------------------------------------------------------------- //
+
+// 2D copy
+// ----------------------------------------------------------------------------------- //
+inline void copy_to_buffer_2d(char* _RESTRICT buf, const char* _RESTRICT const a,
+	const int nx, const int ny,
+	const int shx)
+{
+	int i, j, idx = 0, bidx = 0;
+
+	if (nx < MIN_MEMCPY_BLOCK)
+	for (j = 0; j < ny; j++, idx += shx, bidx += nx)
+	for (i = 0; i < nx; i++) {
+		buf[bidx + i] = a[idx + i];
+	}
+	else
+	{
+		const int nbx = nx * sizeof(char);
+		for (j = 0; j < ny; j++, bidx += nx) {
+			idx = j * shx;
+			memcpy(&buf[bidx], &a[idx], nbx);
+		}
+	}
+}
+
+inline void copy_from_buffer_2d(char* _RESTRICT a, const char* _RESTRICT const buf,
+	const int nx, const int ny,
+	const int shx)
+{
+	int i, j, idx = 0, bidx = 0;
+
+	if (nx < MIN_MEMCPY_BLOCK)
+	for (j = 0; j < ny; j++, idx += shx, bidx += nx)
+	for (i = 0; i < nx; i++) {
+		a[idx + i] = buf[bidx + i];
+	}
+	else
+	{
+		const int nbx = nx * sizeof(char);
+		for (j = 0; j < ny; j++, bidx += nx) {
+			idx = j * shx;
+			memcpy(&a[idx], &buf[bidx], nbx);
+		}
+	}
+}
+// ----------------------------------------------------------------------------------- //
+
+// 3D copy
+// ----------------------------------------------------------------------------------- //
+inline void copy_to_buffer_3d(char* _RESTRICT buf, const char* _RESTRICT const a,
+	const int nx, const int ny, const int nz,
+	const int shx, const int shxy)
+{
+	int i, j, k, idx, bidx = 0;
+
+	if (nx < MIN_MEMCPY_BLOCK)
+	for (k = 0; k < nz; k++)
+	{
+		idx = k * shxy;
+		for (j = 0; j < ny; j++, idx += shx, bidx += nx)
+		for (i = 0; i < nx; i++) {
+			buf[bidx + i] = a[idx + i];
+		}
+	}
+	else
+	{
+		const int nbx = nx * sizeof(char);
+
+		for (k = 0; k < nz; k++)
+		{
+			idx = k * shxy;
+			for (j = 0; j < ny; j++, idx += shx, bidx += nx) {
+				memcpy(&buf[bidx], &a[idx], nbx);
+			}
+		}
+	}
+}
+
+inline void copy_from_buffer_3d(char* _RESTRICT a, const char* _RESTRICT const buf,
+	const int nx, const int ny, const int nz,
+	const int shx, const int shxy)
+{
+	int i, j, k, idx, bidx = 0;
+
+	if (nx < MIN_MEMCPY_BLOCK)
+	for (k = 0; k < nz; k++)
+	{
+		idx = k * shxy;
+		for (j = 0; j < ny; j++, idx += shx, bidx += nx)
+		for (i = 0; i < nx; i++) {
+			a[idx + i] = buf[bidx + i];
+		}
+	}
+	else
+	{
+		const int nbx = nx * sizeof(char);
+
+		for (k = 0; k < nz; k++)
+		{
+			idx = k * shxy;
+			for (j = 0; j < ny; j++, idx += shx, bidx += nx) {
+				memcpy(&a[idx], &buf[bidx], nbx);
+			}
+		}
+	}
+}
+// ----------------------------------------------------------------------------------- //
+
+// 4D copy
+// ----------------------------------------------------------------------------------- //
+inline void copy_to_buffer_4d(char* _RESTRICT buf, const char* _RESTRICT const a,
+	const int nx, const int ny, const int nz, const int np,
+	const int shx, const int shxy, const int shxyz)
+{
+	int i, j, k, p, idx, shidx, bidx = 0;
+
+	if (nx < MIN_MEMCPY_BLOCK)
+	for (p = 0; p < np; p++)
+	{
+		shidx = p * shxyz;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
+#endif
+		for (k = 0; k < nz; k++, shidx += shxy)
+		{
+			idx = shidx;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
+#endif
+			for (j = 0; j < ny; j++, idx += shx, bidx += nx)
+			for (i = 0; i < nx; i++) {
+				buf[bidx + i] = a[idx + i];
+			}
+		}
+	}
+	else
+	for (p = 0; p < np; p++)
+	{
+		shidx = p * shxyz;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
+#endif
+		for (k = 0; k < nz; k++, shidx += shxy)
+		{
+			idx = shidx;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
+#endif
+			for (j = 0; j < ny; j++, idx += shx, bidx += nx) {
+				memcpy(&buf[bidx], &a[idx], nx * sizeof(char));
+			}
+		}
+	}
+}
+
+inline void copy_from_buffer_4d(char* _RESTRICT a, const char* _RESTRICT const buf,
+	const int nx, const int ny, const int nz, const int np,
+	const int shx, const int shxy, const int shxyz)
+{
+	int i, j, k, p, idx, shidx, bidx = 0;
+
+	if (nx < MIN_MEMCPY_BLOCK)
+	for (p = 0; p < np; p++)
+	{
+		shidx = p * shxyz;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
+#endif
+		for (k = 0; k < nz; k++, shidx += shxy)
+		{
+			idx = shidx;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
+#endif
+			for (j = 0; j < ny; j++, idx += shx, bidx += nx)
+			for (i = 0; i < nx; i++) {
+				a[idx + i] = buf[bidx + i];
+			}
+		}
+	}
+	else
+	for (p = 0; p < np; p++)
+	{
+		shidx = p * shxyz;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
+#endif
+		for (k = 0; k < nz; k++, shidx += shxy)
+		{
+			idx = shidx;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
+#endif
+			for (j = 0; j < ny; j++, idx += shx, bidx += nx) {
+				memcpy(&a[idx], &buf[bidx], nx * sizeof(char));
+			}
+		}
+	}
+}
+// ----------------------------------------------------------------------------------- //
+
+// 5D copy
+// ----------------------------------------------------------------------------------- //
+inline void copy_to_buffer_5d(char* _RESTRICT buf, const char* _RESTRICT const a,
+	const int nx, const int ny, const int nz, const int np, const int nq,
+	const int shx, const int shxy, const int shxyz, const int shxyzp)
+{
+	int i, j, k, p, q, shidx_q, shidx_p, idx, bidx = 0;
+
+	if (nx < MIN_MEMCPY_BLOCK)
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_Q1, CP_EXPECT_5D_Q2)
+#endif
+	for (q = 0; q < nq; q++)
+	{
+		shidx_q = q * shxyzp;
+		for (p = 0; p < np; p++, shidx_q += shxyz)
+		{
+			shidx_p = shidx_q;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
+#endif
+			for (k = 0; k < nz; k++, shidx_p += shxy)
+			{
+				idx = shidx_p;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
+#endif
+				for (j = 0; j < ny; j++, idx += shx, bidx += nx)
+				for (i = 0; i < nx; i++) {
+					buf[bidx + i] = a[idx + i];
+				}
+			}
+		}
+	}
+	else
+	{
+		const int nbx = nx * sizeof(char);
+
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_Q1, CP_EXPECT_5D_Q2)
+#endif
+		for (q = 0; q < nq; q++)
+		{
+			shidx_q = q * shxyzp;
+			for (p = 0; p < np; p++, shidx_q += shxyz)
+			{
+				shidx_p = shidx_q;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
+#endif
+				for (k = 0; k < nz; k++, shidx_p += shxy)
+				{
+					idx = shidx_p;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
+#endif
+					for (j = 0; j < ny; j++, idx += shx, bidx += nx) {
+						memcpy(&buf[bidx], &a[idx], nbx);
+					}
+				}
+			}
+		}
+	}
+}
+
+inline void copy_from_buffer_5d(char* _RESTRICT a, const char* _RESTRICT const buf,
+	const int nx, const int ny, const int nz, const int np, const int nq,
+	const int shx, const int shxy, const int shxyz, const int shxyzp)
+{
+	int i, j, k, p, q, shidx_q, shidx_p, idx, bidx = 0;
+
+	if (nx < MIN_MEMCPY_BLOCK)
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_Q1, CP_EXPECT_5D_Q2)
+#endif
+	for (q = 0; q < nq; q++)
+	{
+		shidx_q = q * shxyzp;
+		for (p = 0; p < np; p++, shidx_q += shxyz)
+		{
+			shidx_p = shidx_q;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
+#endif
+			for (k = 0; k < nz; k++, shidx_p += shxy)
+			{
+				idx = shidx_p;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
+#endif
+				for (j = 0; j < ny; j++, idx += shx, bidx += nx)
+				for (i = 0; i < nx; i++) {
+					a[idx + i] = buf[bidx + i];
+				}
+			}
+		}
+	}
+	else
+	{
+		const int nbx = nx * sizeof(char);
+
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_Q1, CP_EXPECT_5D_Q2)
+#endif
+		for (q = 0; q < nq; q++)
+		{
+			shidx_q = q * shxyzp;
+			for (p = 0; p < np; p++, shidx_q += shxyz)
+			{
+				shidx_p = shidx_q;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_K1, CP_EXPECT_5D_K2, CP_EXPECT_5D_K3, CP_EXPECT_5D_K4)
+#endif
+				for (k = 0; k < nz; k++, shidx_p += shxy)
+				{
+					idx = shidx_p;
+#if defined(__INTEL_COMPILER)
+#pragma loop_count (CP_EXPECT_5D_J1, CP_EXPECT_5D_J2)
+#endif
+					for (j = 0; j < ny; j++, idx += shx, bidx += nx) {
+						memcpy(&a[idx], &buf[bidx], nbx);
+					}
+				}
+			}
+		}
+	}
+}
+// ----------------------------------------------------------------------------------- //
+
+
+// 6D copy
+// ----------------------------------------------------------------------------------- //
+inline void copy_to_buffer_6d(char* _RESTRICT buf, const char* _RESTRICT const a,
+	const int nx, const int ny, const int nz, const int np, const int nq, const int ns,
+	const int shx, const int shxy, const int shxyz, const int shxyzp, const int shxyzpq)
+{
+	int i, j, k, p, q, s, idx, shidx_q, shidx_p, bidx = 0;
+
+	if (nx < MIN_MEMCPY_BLOCK)
+	for (s = 0; s < ns; s++)
+	for (q = 0; q < nq; q++)
+	{
+		shidx_q = s * shxyzpq + q * shxyzp;
+		for (p = 0; p < np; p++, shidx_q += shxyz)
+		{
+			shidx_p = shidx_q;
+			for (k = 0; k < nz; k++, shidx_p += shxy)
+			{
+				idx = shidx_p;
+				for (j = 0; j < ny; j++, idx += shx, bidx += nx)
+				for (i = 0; i < nx; i++) {
+					buf[bidx + i] = a[idx + i];
+				}
+			}
+		}
+	}
+	else
+	{
+		const int nbx = nx * sizeof(char);
+
+		for (s = 0; s < ns; s++)
+		for (q = 0; q < nq; q++)
+		{
+			shidx_q = s * shxyzpq + q * shxyzp;
+			for (p = 0; p < np; p++, shidx_q += shxyz)
+			{
+				shidx_p = shidx_q;
+				for (k = 0; k < nz; k++, shidx_p += shxy)
+				{
+					idx = shidx_p;
+					for (j = 0; j < ny; j++, idx += shx, bidx += nx) {
+						memcpy(&buf[bidx], &a[idx], nbx);
+					}
+				}
+			}
+		}
+	}
+}
+
+inline void copy_from_buffer_6d(char* _RESTRICT a, const char* _RESTRICT const buf,
+	const int nx, const int ny, const int nz, const int np, const int nq, const int ns,
+	const int shx, const int shxy, const int shxyz, const int shxyzp, const int shxyzpq)
+{
+	int i, j, k, p, q, s, idx, shidx_q, shidx_p, bidx = 0;
+
+	if (nx < MIN_MEMCPY_BLOCK)
+	for (s = 0; s < ns; s++)
+	for (q = 0; q < nq; q++)
+	{
+		shidx_q = s * shxyzpq + q * shxyzp;
+		for (p = 0; p < np; p++, shidx_q += shxyz)
+		{
+			shidx_p = shidx_q;
+			for (k = 0; k < nz; k++, shidx_p += shxy)
+			{
+				idx = shidx_p;
+				for (j = 0; j < ny; j++, idx += shx, bidx += nx)
+				for (i = 0; i < nx; i++) {
+					a[idx + i] = buf[bidx + i];
+				}
+			}
+		}
+	}
+	else
+	{
+		const int nbx = nx * sizeof(char);
+
+		for (s = 0; s < ns; s++)
+		for (q = 0; q < nq; q++)
+		{
+			shidx_q = s * shxyzpq + q * shxyzp;
+			for (p = 0; p < np; p++, shidx_q += shxyz)
+			{
+				shidx_p = shidx_q;
+				for (k = 0; k < nz; k++, shidx_p += shxy)
+				{
+					idx = shidx_p;
+					for (j = 0; j < ny; j++, idx += shx, bidx += nx) {
+						memcpy(&a[idx], &buf[bidx], nbx);
+					}
+				}
+			}
+		}
+	}
+}
+// ----------------------------------------------------------------------------------- //
+
+
+// COPY-TO
+// ----------------------------------------------------------------------------------- //
+void copy_to_buffer(char* _RESTRICT buf, const char* _RESTRICT const a,
+	const int ndims,
+	const int* _RESTRICT const msgdim,
+	const int* _RESTRICT const stride,
+	const int fsize)
+{
+	if ((ndims < 1) || (ndims > MAX_PARLIB_MP_DIMS)) return;
+
+	if (ndims == 1) {
+		const int nx = msgdim[0] * fsize;
+
+		copy_to_buffer_1d(buf, a, nx);
+		return;
+	}
+
+	if (ndims == 2) {
+		const int nx = msgdim[0] * fsize,
+			ny = msgdim[1];
+
+		const int shx = stride[0] * fsize;
+
+		copy_to_buffer_2d(buf, a, nx, ny, 
+			shx);
+		return;
+	}
+
+	if (ndims == 3) {
+		const int nx = msgdim[0] * fsize,
+			ny = msgdim[1], nz = msgdim[2];
+
+		const int shx = stride[0] * fsize;
+		const int shxy = stride[1] * shx;
+
+		copy_to_buffer_3d(buf, a, nx, ny, nz,
+			shx, shxy);
+		return;
+	}
+
+	if (ndims == 4) {
+		const int nx = msgdim[0] * fsize,
+			ny = msgdim[1], nz = msgdim[2], 
+			np = msgdim[3];
+
+		const int shx = stride[0] * fsize;
+		const int shxy = stride[1] * shx;
+		const int shxyz = stride[2] * shxy;
+
+		copy_to_buffer_4d(buf, a, nx, ny, nz, np,
+			shx, shxy, shxyz);
+		return;
+	}
+
+	if (ndims == 5) {
+		const int nx = msgdim[0] * fsize,
+			ny = msgdim[1], nz = msgdim[2], 
+			np = msgdim[3], nq = msgdim[4];
+
+		const int shx = stride[0] * fsize;
+		const int shxy = stride[1] * shx;
+		const int shxyz = stride[2] * shxy;
+		const int shxyzp = stride[3] * shxyz;
+
+		copy_to_buffer_5d(buf, a, nx, ny, nz, np, nq,
+			shx, shxy, shxyz, shxyzp);
+		return;
+	}
+
+	if (ndims == 6) {
+		const int nx = msgdim[0] * fsize, 
+			ny = msgdim[1], nz = msgdim[2],
+			np = msgdim[3], nq = msgdim[4],
+			ns = msgdim[5];
+
+		const int shx = stride[0] * fsize;
+		const int shxy = stride[1] * shx;
+		const int shxyz = stride[2] * shxy;
+		const int shxyzp = stride[3] * shxyz;
+		const int shxyzpq = stride[4] * shxyzp;
+
+		copy_to_buffer_6d(buf, a, nx, ny, nz, np, nq, ns,
+			shx, shxy, shxyz, shxyzp, shxyzpq);
+		return;
+	}
+}
+// ----------------------------------------------------------------------------------- //
+
+// COPY-FROM
+// ----------------------------------------------------------------------------------- //
+void copy_from_buffer(char* _RESTRICT a, const char* _RESTRICT const buf,
+	const int ndims,
+	const int* _RESTRICT const msgdim,
+	const int* _RESTRICT const stride,
+	const int fsize)
+{
+	if ((ndims < 1) || (ndims > MAX_PARLIB_MP_DIMS)) return;
+
+	if (ndims == 1) {
+		const int nx = msgdim[0] * fsize;
+
+		copy_from_buffer_1d(a, buf, nx);
+		return;
+	}
+
+	if (ndims == 2) {
+		const int nx = msgdim[0] * fsize,
+			ny = msgdim[1];
+
+		const int shx = stride[0] * fsize;
+
+		copy_from_buffer_2d(a, buf, nx, ny,
+			shx);
+		return;
+	}
+	
+	if (ndims == 3) {
+		const int nx = msgdim[0] * fsize,
+			ny = msgdim[1], nz = msgdim[2];
+
+		const int shx = stride[0] * fsize;
+		const int shxy = stride[1] * shx;
+
+		copy_from_buffer_3d(a, buf, nx, ny, nz,
+			shx, shxy);
+		return;
+	}
+	
+	if (ndims == 4) {
+		const int nx = msgdim[0] * fsize,
+			ny = msgdim[1], nz = msgdim[2],
+			np = msgdim[3];
+
+		const int shx = stride[0] * fsize;
+		const int shxy = stride[1] * shx;
+		const int shxyz = stride[2] * shxy;
+
+		copy_from_buffer_4d(a, buf, nx, ny, nz, np,
+			shx, shxy, shxyz);
+		return;
+	}
+
+	if (ndims == 5) {
+		const int nx = msgdim[0] * fsize,
+			ny = msgdim[1], nz = msgdim[2],
+			np = msgdim[3], nq = msgdim[4];
+
+		const int shx = stride[0] * fsize;
+		const int shxy = stride[1] * shx;
+		const int shxyz = stride[2] * shxy;
+		const int shxyzp = stride[3] * shxyz;
+
+		copy_from_buffer_5d(a, buf, nx, ny, nz, np, nq,
+			shx, shxy, shxyz, shxyzp);
+		return;
+	}
+
+	if (ndims == 6) {
+		const int nx = msgdim[0] * fsize,
+			ny = msgdim[1], nz = msgdim[2],
+			np = msgdim[3], nq = msgdim[4], ns = msgdim[5];
+
+		const int shx = stride[0] * fsize;
+		const int shxy = stride[1] * shx;
+		const int shxyz = stride[2] * shxy;
+		const int shxyzp = stride[3] * shxyz;
+		const int shxyzpq = stride[4] * shxyzp;
+
+		copy_from_buffer_6d(a, buf, nx, ny, nz, np, nq, ns,
+			shx, shxy, shxyz, shxyzp, shxyzpq);
+		return;
+	}
+}
+// ----------------------------------------------------------------------------------- //
diff --git a/ParLib.src/plutils.h b/ParLib.src/plutils.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a493fb931affbb39d735995793711518f1b8da6
--- /dev/null
+++ b/ParLib.src/plutils.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#define MAX_PARLIB_MP_DIMS	6		// maximum number of dims for manual packing
+
+#define MIN_MEMCPY_BLOCK	256		// minimum block (in bytes) for memcpy copy (magic number)
+
+#define MAX_PL_BUFS		4096		// maximum number of parlib internal buffers
+
+// _RESTRICT definition 
+// ------------------------------------------------------------------- //
+#if defined(__INTEL_COMPILER)
+#define _RESTRICT restrict
+#elif defined(__GNUC__) && !defined(_WIN32) && !defined(_CYGWIN32__)
+#define _RESTRICT __restrict__
+#elif defined(_MSC_VER)
+#define _RESTRICT __restrict
+#else
+#define _RESTRICT
+#endif
+// ------------------------------------------------------------------- //
+
+
+// parlib buffers [declaration]
+// -------------------------------------------------------------------------- //
+extern void *plbuf[MAX_PL_BUFS];
+extern int plbuf_size[MAX_PL_BUFS];
+extern int plbuf_status[MAX_PL_BUFS];
+
+extern int plbuf_ptr;
+// -------------------------------------------------------------------------- //
+
+
+void init_plbuf();
+void deinit_plbuf();
+
+void* get_plbuf(int msize, int* id);
+void free_plbuf(void* ptr, int id);
+// -------------------------------------------------------------------------- //
+
+void copy_to_buffer(char* _RESTRICT buf, const char* _RESTRICT const a,
+	const int ndims,
+	const int* _RESTRICT const msgdim,
+	const int* _RESTRICT const stride,
+	const int fsize);
+
+void copy_from_buffer(char* _RESTRICT a, const char* _RESTRICT const buf,
+	const int ndims,
+	const int* _RESTRICT const msgdim,
+	const int* _RESTRICT const stride,
+	const int fsize);
+// -------------------------------------------------------------------------- //
diff --git a/ParLib.src/transpose.c b/ParLib.src/transpose.c
index 7f6a9619ba5e5ba29af91dd2a216c15acbec8497..1d536aacddc450860654b79d8578722b5d379435 100644
--- a/ParLib.src/transpose.c
+++ b/ParLib.src/transpose.c
@@ -1,275 +1,1188 @@
-#include <stdlib.h>
-#include "parlib.h"
-
-int P_Transpose_init ( ndims, dim_source, lblks_source, dim_dest, 
-		lblks_dest, stride, blklen, overlap, datatype, comm, period,
-		transp )
-	int ndims, dim_source, *lblks_source, dim_dest, *lblks_dest, *stride;
-   int *blklen, *overlap;
-	MPI_Datatype datatype;
-   MPI_Comm comm;
-	int period;
-   Transposition *transp;
-{
-	int idim, nproc, iproc, ip, strd, count;
-	int wblka, wblkb, begb;
-	int ifsta, ifstb, idir, suma, sumb;
-	MPI_Aint fsize;
-	MPI_Datatype oldtype, *stype, *rtype;
-	int *sbeg, *rbeg;
-/*
- * Check input parameters
- */
-	if ( ndims < 2) { return 1; }
-	if ( dim_source < 1 || dim_source > ndims ) { return 2; }
-	if ( dim_dest < 1 || dim_dest > ndims ) { return 3; }
-   if ( dim_source == dim_dest ) { return 4; }
-   for ( idim = 0; idim < ndims; idim++ ) {
-		if ( stride[idim] <= 0) { return 5; }
-	}
-   for (idir = 0; idir < 2; idir++ ) {
-		if ( overlap[idir] < 0 ) { return 6; }
-	}
-/*
- * Define the number of processors in the group and the rank
- */
-	MPI_Comm_size ( comm, &nproc );
-	if ( nproc == 0 ) { return 0; } 
-	MPI_Comm_rank ( comm, &iproc );
-	if ( iproc == MPI_UNDEFINED ) { return 0; }
-
-	suma = sumb = 0;
-	for ( ip = 0; ip < nproc; ip++ ) {
-		suma += lblks_source[ip];
-		sumb += lblks_dest[ip];
-		if ( lblks_source[ip] <= 0 ) { return 14; }
-		if ( lblks_dest[ip] <= 0 ) { return 15; }
-	}
-
-	if ( lblks_source[iproc] > blklen[dim_source-1] ) { return 8; }
-   if ( lblks_dest[iproc] > blklen[dim_dest-1] ) { return 9; }
-	if ( suma > stride[dim_source-1] ) { return 10; }
-   if ( sumb > stride[dim_dest-1] ) { return 11; }
-	for ( idim = 0; idim < ndims; idim++ ) {
-		if ( idim != dim_source-1 && idim != dim_dest-1 ) {
-			if ( blklen[idim] > stride[idim] ) { return 7; }
-		}
-	}
-	if ( overlap[0] > lblks_dest[0] ) { return 12; }
-	if ( overlap[1] > lblks_dest[nproc-1] ) { return 13; }
-
-	MPI_Type_extent ( datatype, &fsize );
-/*
- * Allocate memory
- */
-	stype = transp->stype =
-		(MPI_Datatype *) malloc ( sizeof(MPI_Datatype)*nproc ); 
-	rtype = transp->rtype =
-		(MPI_Datatype *) malloc ( sizeof(MPI_Datatype)*nproc ); 
-	sbeg = transp->sbeg = (int *) malloc ( sizeof(int)*nproc ); 
-	rbeg = transp->rbeg = (int *) malloc ( sizeof(int)*nproc ); 
-/*
- * Define data types for the blocks and the beginings of the blocks
- */
-   ifsta = ifstb = 1;
-   for ( ip = 0; ip < nproc; ip++ ) {
-      wblka = lblks_source[iproc];
-      wblkb = lblks_dest[ip];
-      if ( ip > 0 || period ) wblkb += overlap[0];
-      if ( ip < nproc-1 || period ) wblkb += overlap[1];
-
-      oldtype = datatype;
-      strd = 1;
-      for ( idim = 0; idim < ndims; idim++ ) {
-         if ( idim == dim_source-1 ) {
-            count = wblka;
-         } else if ( idim == dim_dest-1 ) {
-            count = wblkb;
-         } else {
-            count = blklen[idim];
-         }
-         MPI_Type_hvector ( count, 1, strd*fsize, oldtype, stype+ip );
-			if ( idim > 0 ) { MPI_Type_free ( &oldtype ); }
-         oldtype = stype[ip];
-         if ( idim == dim_source-1 ) {
-            strd *= blklen[idim];
-         } else {
-            strd *= stride[idim];
-         }
-      }
-      MPI_Type_commit ( stype+ip );
-
-      wblka = lblks_source[ip];
-      wblkb = lblks_dest[iproc];
-      if ( iproc > 0 || period ) wblkb += overlap[0];
-      if ( iproc < nproc-1 || period ) wblkb += overlap[1];
-      oldtype = datatype;
-      strd = 1;
-      for ( idim = 0; idim < ndims; idim++ ) {
-         if ( idim == dim_source-1 ) {
-            count = wblka;
-         } else if ( idim == dim_dest-1 ) {
-            count = wblkb;
-         } else {
-            count = blklen[idim];
-         }
-         MPI_Type_hvector ( count, 1, strd*fsize, oldtype, rtype+ip );
-			if ( idim > 0 ) { MPI_Type_free ( &oldtype ); }
-         oldtype = rtype[ip];
-         if ( idim == dim_dest-1 ) {
-            strd *= blklen[idim];
-         } else {
-            strd *= stride[idim];
-         }
-      }
-      MPI_Type_commit ( rtype+ip );
-
-      begb = ifstb;
-      if ( ip > 0 || period ) begb -= overlap[0];
-      strd = 1;
-      for ( idim = 0; idim < dim_dest-1; idim++ ) {
-         if ( idim == dim_source-1 ) {
-            strd *= blklen[idim];
-         } else {
-            strd *= stride[idim];
-         }
-      }
-      sbeg[ip] = strd*(begb-1);
-
-      rbeg[ip] = 0;
-      if ( iproc > 0 || period ) rbeg[ip] -= overlap[0]*strd;
-
-      strd = 1;
-      for ( idim = 0; idim < dim_source-1; idim++ ) {
-         if ( idim == dim_dest-1 ) {
-            strd *= blklen[idim];
-         } else {
-            strd *= stride[idim];
-         }
-      }
-      rbeg[ip] += strd*(ifsta-1);
-      ifsta += lblks_source[ip];
-      ifstb += lblks_dest[ip];
-	}
-	transp->nproc = nproc;
-	transp->iproc = iproc;
-	transp->comm = comm;
-	transp->fsize = fsize;
-	return 0;
-}
-
-int P_Transpose_start ( arr_source, arr_dest, transp )
-	void *arr_source, *arr_dest;
-	Transposition *transp;
-{
-	char *arr_source_ch = (char *) arr_source;
-	char *arr_dest_ch = (char *) arr_dest;
-	int nproc = transp->nproc;
-	int iproc = transp->iproc;
-	MPI_Aint fsize = transp->fsize;
-	int *sbeg = transp->sbeg;
-	int *rbeg = transp->rbeg;
-	MPI_Datatype *stype = transp->stype;
-	MPI_Datatype *rtype = transp->rtype;
-	MPI_Comm comm = transp->comm;
-	MPI_Request *sreq, *rreq;
-	int ip;
-	char *src, *dest;
-
-   if ( nproc == 0 ) { return 0; }
-   if ( iproc == MPI_UNDEFINED ) { return 0; }
-/*
- * Allocate memory
- */
-	sreq = transp->sreq = 
-		(MPI_Request *) malloc ( sizeof(MPI_Request)*nproc );
-	rreq = transp->rreq = 
-		(MPI_Request *) malloc ( sizeof(MPI_Request)*nproc );
-/*
- * Start the communication
- */
-   for ( ip = 0; ip < nproc; ip++ ) {
-	   dest = arr_dest_ch+rbeg[ip]*fsize;
-		src = arr_source_ch+sbeg[ip]*fsize;
-		if ( dest == src && iproc == ip ) {
-			rreq[ip] = sreq[ip] = MPI_REQUEST_NULL;
-      } else {
-         MPI_Irecv ( dest, 1, rtype[ip], ip, 0, comm, rreq+ip );
-         MPI_Isend ( src, 1, stype[ip], ip, 0, comm, sreq+ip );
-		}
-	}
-	return 0;
-}
-
-int P_Transpose_end ( transp )
-	Transposition *transp;
-{
-	int nproc = transp->nproc;
-	int iproc = transp->iproc;
-	MPI_Request *sreq = transp->sreq; 
-	MPI_Request *rreq = transp->rreq; 
-	int ip;
-	MPI_Status status;
-
-   if ( nproc == 0 ) { return 0; }
-   if ( iproc == MPI_UNDEFINED ) { return 0; }
-
-   for ( ip = 0; ip < nproc; ip++ ) {
-      MPI_Wait ( rreq+ip, &status );
-      MPI_Wait ( sreq+ip, &status );
-   }
-	return 0;
-}
-
-int P_Transpose_free ( transp )
-	Transposition *transp;
-{
-	int nproc = transp->nproc;
-	int iproc = transp->iproc;
-	int *sbeg = transp->sbeg;
-	int *rbeg = transp->rbeg;
-	MPI_Datatype *stype = transp->stype;
-	MPI_Datatype *rtype = transp->rtype;
-	MPI_Request *sreq = transp->sreq; 
-	MPI_Request *rreq = transp->rreq; 
-	int ip;
-
-   if ( nproc == 0 ) { return 0; }
-   if ( iproc == MPI_UNDEFINED ) { return 0; }
-
-   for ( ip = 0; ip < nproc; ip++ ) {
-      MPI_Type_free ( rtype+ip );
-      MPI_Type_free ( stype+ip );
-   }
-	free ( stype );
-	free ( rtype );
-	free ( sbeg );
-   free ( rbeg );
-	free ( sreq );
-   free ( rreq );
-	return 0;
-}
-
-int P_Transpose ( ndims, arr_source, dim_source, lblks_source, arr_dest, 
-		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
-		period )
-	void *arr_source, *arr_dest;
-	int ndims, dim_source, *lblks_source, dim_dest, *lblks_dest, *stride; 
-	int *blklen, *overlap;
-	MPI_Datatype datatype;
-   MPI_Comm comm;
-	int period;
-{
-   Transposition transp;
-	int ierr;
-	if ( ierr = P_Transpose_init ( ndims, dim_source, lblks_source, 
-		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
-		period, &transp ) != 0 ) 
-	{
-		return ierr;
-	}
-	P_Transpose_start ( arr_source, arr_dest, &transp );
-	P_Transpose_end ( &transp );
-	P_Transpose_free ( &transp );
-	return 0;
-}
+#include "parlib.h"
+#include "plutils.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+
+/*
+*  Error codes:
+*     0 - success
+*     1 - number of dimensions < 2
+*     2 - wrong communicated dimension [source]
+*     3 - wrong communicated dimension [dest]
+*     4 - wrong communicated dimensions [source == dest]
+*     5 - nonpositive dimension
+*     6 - negative boundary width
+*     7 - number of dimensions exceeds maximum value (only for MP - manual packing)
+*/
+
+// -------------------------------------------------------------------------- //
+int P_Transpose_init ( ndims, dim_source, lblks_source, dim_dest, 
+	lblks_dest, stride, blklen, overlap, datatype, comm, period,
+	transp )
+	
+	int ndims, dim_source, *lblks_source, dim_dest, *lblks_dest, *stride;
+	int *blklen, *overlap;
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+	Transposition *transp;
+{
+	int idim, nproc, iproc, ip, strd, count;
+	int wblka, wblkb, begb;
+	int ifsta, ifstb, idir, suma, sumb;
+	MPI_Aint lb, fsize;
+	MPI_Datatype oldtype, *stype, *rtype;
+	int *sbeg, *rbeg;
+
+// Setting degenerate-success cases conditions for consistency
+//	including: nproc=0, iproc=MPI_UNDEFINED
+	transp->nproc = 0;
+	transp->iproc = MPI_UNDEFINED;
+
+//
+// Check input parameters
+//
+	if (ndims < 2) { return 1; }
+	if (dim_source < 1 || dim_source > ndims) { return 2; }
+	if (dim_dest < 1 || dim_dest > ndims) { return 3; }
+	if (dim_source == dim_dest) { return 4; }
+	for (idim = 0; idim < ndims; idim++) {
+		if (stride[idim] <= 0) { return 5; }
+	}
+	for (idir = 0; idir < 2; idir++) {
+		if (overlap[idir] < 0) { return 6; }
+	}
+//
+// Define the number of processors in the group and the rank
+//
+	if (comm == MPI_COMM_NULL) { return 0; } // empty communicator
+
+	MPI_Comm_size(comm, &nproc);
+	if (nproc == 0) { return 0; }
+	MPI_Comm_rank(comm, &iproc);
+	if (iproc == MPI_UNDEFINED) { return 0; }
+
+	suma = sumb = 0;
+	for (ip = 0; ip < nproc; ip++) {
+		suma += lblks_source[ip];
+		sumb += lblks_dest[ip];
+		if (lblks_source[ip] <= 0) { return 14; }
+		if (lblks_dest[ip] <= 0) { return 15; }
+	}
+
+	if (lblks_source[iproc] > blklen[dim_source - 1]) { return 8; }
+	if (lblks_dest[iproc] > blklen[dim_dest - 1]) { return 9; }
+	if (suma > stride[dim_source - 1]) { return 10; }
+	if (sumb > stride[dim_dest - 1]) { return 11; }
+	for (idim = 0; idim < ndims; idim++) {
+		if (idim != dim_source - 1 && idim != dim_dest - 1) {
+			if (blklen[idim] > stride[idim]) { return 7; }
+		}
+	}
+	if (overlap[0] > lblks_dest[0]) { return 12; }
+	if (overlap[1] > lblks_dest[nproc - 1]) { return 13; }
+
+	MPI_Type_get_extent(datatype, &lb, &fsize);
+//
+// Allocate memory
+//
+	stype = transp->stype = (MPI_Datatype *)get_plbuf(2 * nproc * sizeof(MPI_Datatype),
+		&transp->buf_id[0]);
+	rtype = transp->rtype = &transp->stype[nproc];
+
+	sbeg = transp->sbeg = (int*)get_plbuf(2 * nproc * sizeof(int), 
+		&transp->buf_id[1]);
+	rbeg = transp->rbeg = &transp->sbeg[nproc];
+
+	transp->req = (MPI_Request *)get_plbuf(2 * nproc * sizeof(MPI_Request),
+		&transp->buf_id[2]);
+
+//
+// Define data types for the blocks and the beginings of the blocks
+//
+	ifsta = ifstb = 1;
+	for (ip = 0; ip < nproc; ip++) {
+		wblka = lblks_source[iproc];
+		wblkb = lblks_dest[ip];
+		if (ip > 0 || period) wblkb += overlap[0];
+		if (ip < nproc - 1 || period) wblkb += overlap[1];
+
+		if (dim_source - 1 == 0) {
+			count = wblka;
+		}
+		else if (dim_dest - 1 == 0) {
+			count = wblkb;
+		}
+		else {
+			count = blklen[0];
+		}
+		MPI_Type_contiguous(count, datatype, stype + ip);
+		oldtype = stype[ip];
+		if (dim_source - 1 == 0) {
+			strd = blklen[0];
+		}
+		else {
+			strd = stride[0];
+		}
+
+		for (idim = 1; idim < ndims; idim++) {
+			if (dim_source - 1 == idim) {
+				count = wblka;
+			}
+			else if (dim_dest - 1 == idim) {
+				count = wblkb;
+			}
+			else {
+				count = blklen[idim];
+			}
+			MPI_Type_create_hvector(count, 1, strd*fsize, oldtype, stype + ip);
+			MPI_Type_free(&oldtype);
+			oldtype = stype[ip];
+			if (idim == dim_source - 1) {
+				strd *= blklen[idim];
+			}
+			else {
+				strd *= stride[idim];
+			}
+		}
+		MPI_Type_commit(stype + ip);
+
+		wblka = lblks_source[ip];
+		wblkb = lblks_dest[iproc];
+		if (iproc > 0 || period) wblkb += overlap[0];
+		if (iproc < nproc - 1 || period) wblkb += overlap[1];
+
+		if (dim_source - 1 == 0) {
+			count = wblka;
+		}
+		else if (dim_dest - 1 == 0) {
+			count = wblkb;
+		}
+		else {
+			count = blklen[0];
+		}
+		MPI_Type_contiguous(count, datatype, rtype + ip);
+		oldtype = rtype[ip];
+		if (dim_dest - 1 == 0) {
+			strd = blklen[0];
+		}
+		else {
+			strd = stride[0];
+		}
+
+		for (idim = 1; idim < ndims; idim++) {
+			if (dim_source - 1 == idim) {
+				count = wblka;
+			}
+			else if (dim_dest - 1 == idim) {
+				count = wblkb;
+			}
+			else {
+				count = blklen[idim];
+			}
+			MPI_Type_create_hvector(count, 1, strd*fsize, oldtype, rtype + ip);
+			MPI_Type_free(&oldtype);
+			oldtype = rtype[ip];
+			if (idim == dim_dest - 1) {
+				strd *= blklen[idim];
+			}
+			else {
+				strd *= stride[idim];
+			}
+		}
+		MPI_Type_commit(rtype + ip);
+
+		begb = ifstb;
+		if (ip > 0 || period) begb -= overlap[0];
+		strd = 1;
+		for (idim = 0; idim < dim_dest - 1; idim++) {
+			if (idim == dim_source - 1) {
+				strd *= blklen[idim];
+			}
+			else {
+				strd *= stride[idim];
+			}
+		}
+		sbeg[ip] = strd*(begb - 1);
+
+		rbeg[ip] = 0;
+		if (iproc > 0 || period) rbeg[ip] -= overlap[0] * strd;
+
+		strd = 1;
+		for (idim = 0; idim < dim_source - 1; idim++) {
+			if (idim == dim_dest - 1) {
+				strd *= blklen[idim];
+			}
+			else {
+				strd *= stride[idim];
+			}
+		}
+		rbeg[ip] += strd*(ifsta - 1);
+		ifsta += lblks_source[ip];
+		ifstb += lblks_dest[ip];
+	}
+	transp->nproc = nproc;
+	transp->iproc = iproc;
+	transp->comm = comm;
+	transp->fsize = fsize;
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_Transpose_start ( arr_source, arr_dest, transp )
+	void *arr_source, *arr_dest;
+	Transposition *transp;
+{
+	char *arr_source_ch = (char *) arr_source;
+	char *arr_dest_ch = (char *) arr_dest;
+	
+	int ip;
+	char *src, *dest;
+
+	if (transp->nproc == 0) { return 0; }
+	if (transp->iproc == MPI_UNDEFINED) { return 0; }
+
+//
+// Start the communication
+//
+	for (ip = 0; ip < transp->nproc; ip++) {
+		dest = arr_dest_ch + transp->rbeg[ip] * transp->fsize;
+		src = arr_source_ch + transp->sbeg[ip] * transp->fsize;
+		if ((dest == src) && (transp->iproc == ip)) {
+			transp->req[ip] = MPI_REQUEST_NULL;
+			transp->req[transp->nproc + ip] = MPI_REQUEST_NULL;
+		}
+		else {
+			MPI_Irecv(dest, 1, transp->rtype[ip], 
+				ip, 0, transp->comm,
+				&transp->req[transp->nproc + ip]);
+			
+			MPI_Isend(src, 1, transp->stype[ip], 
+				ip, 0, transp->comm,
+				&transp->req[ip]);
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_Transpose_end ( transp )
+	Transposition *transp;
+{
+	if (transp->nproc == 0) { return 0; }
+	if (transp->iproc == MPI_UNDEFINED) { return 0; }
+
+	MPI_Waitall(2 * transp->nproc, transp->req, MPI_STATUSES_IGNORE);
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_Transpose_free ( transp )
+	Transposition *transp;
+{
+	int ip;
+
+	if (transp->nproc == 0) { return 0; }
+	if (transp->iproc == MPI_UNDEFINED) { return 0; }
+
+	for (ip = 0; ip < transp->nproc; ip++) {
+		MPI_Type_free(transp->rtype + ip);
+		MPI_Type_free(transp->stype + ip);
+	}
+
+	free_plbuf(transp->stype, transp->buf_id[0]);
+	free_plbuf(transp->sbeg, transp->buf_id[1]);
+	free_plbuf(transp->req, transp->buf_id[2]);
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_Transpose ( ndims, arr_source, dim_source, lblks_source, arr_dest, 
+	dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+	period )
+	
+	void *arr_source, *arr_dest;
+	int ndims, dim_source, *lblks_source, dim_dest, *lblks_dest, *stride;
+	int *blklen, *overlap;
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+{
+	Transposition transp;
+
+	int ierr;
+	if (ierr = P_Transpose_init(ndims, dim_source, lblks_source,
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm,
+		period, &transp) != 0)
+	{
+		return ierr;
+	}
+	P_Transpose_start(arr_source, arr_dest, &transp);
+	P_Transpose_end(&transp);
+	P_Transpose_free(&transp);
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+
+// v.1.95 - persistent exchanges //
+// -------------------------------------------------------------------------- //
+int PST_Transpose_init ( ndims, arr_source, dim_source, lblks_source, 
+	arr_dest, dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, period,
+	transp )
+	
+	void *arr_source, *arr_dest;
+	int ndims, dim_source, *lblks_source, dim_dest, *lblks_dest, *stride;
+	int *blklen, *overlap;
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+	Transposition *transp;
+{
+	char *arr_source_ch = (char *)arr_source;
+	char *arr_dest_ch = (char *)arr_dest;
+
+	int ip;
+	char *src, *dest;
+
+	int ierr = P_Transpose_init(ndims, dim_source, lblks_source,
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, period, transp);
+	if (ierr != 0) return ierr;
+
+	transp->psrc = arr_source;
+	transp->pdest = arr_dest;
+
+	if (transp->nproc == 0) { return 0; }
+	if (transp->iproc == MPI_UNDEFINED) { return 0; }
+
+//
+// Setup the communication
+//
+	for (ip = 0; ip < transp->nproc; ip++) {
+		dest = arr_dest_ch + transp->rbeg[ip] * transp->fsize;
+		src = arr_source_ch + transp->sbeg[ip] * transp->fsize;
+		if ((dest == src) && (transp->iproc == ip)) {
+			continue;
+		}
+		else {
+			MPI_Recv_init(dest, 1, transp->rtype[ip],
+				ip, 0, transp->comm,
+				&transp->req[transp->nproc + ip]);
+
+			MPI_Send_init(src, 1, transp->stype[ip],
+				ip, 0, transp->comm,
+				&transp->req[ip]);
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_Transpose_start ( transp )
+	Transposition *transp;
+{
+	char *arr_source_ch = (char *)transp->psrc;
+	char *arr_dest_ch = (char *)transp->pdest;
+
+	int ip;
+	char *src, *dest;
+
+	if (transp->nproc == 0) { return 0; }
+	if (transp->iproc == MPI_UNDEFINED) { return 0; }
+
+//
+// Start the communication
+//
+	for (ip = 0; ip < transp->nproc; ip++) {
+		dest = arr_dest_ch + transp->rbeg[ip] * transp->fsize;
+		src = arr_source_ch + transp->sbeg[ip] * transp->fsize;
+		if ((dest == src) && (transp->iproc == ip)) {
+			transp->req[ip] = MPI_REQUEST_NULL;
+			transp->req[transp->nproc + ip] = MPI_REQUEST_NULL;
+		}
+		else {
+			MPI_Start(&transp->req[transp->nproc + ip]);
+			MPI_Start(&transp->req[ip]);
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_Transpose_end ( transp )
+	Transposition *transp;
+{
+	return P_Transpose_end(transp);
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_Transpose_free ( transp )
+	Transposition *transp;
+{
+	char *arr_source_ch = (char *)transp->psrc;
+	char *arr_dest_ch = (char *)transp->pdest;
+
+	int ip;
+	char *src, *dest;
+
+	if (transp->nproc == 0) { return 0; }
+	if (transp->iproc == MPI_UNDEFINED) { return 0; }
+
+	// we have to free persistent requests first...
+	for (ip = 0; ip < transp->nproc; ip++) {
+		dest = arr_dest_ch + transp->rbeg[ip] * transp->fsize;
+		src = arr_source_ch + transp->sbeg[ip] * transp->fsize;
+		if ((dest == src) && (transp->iproc == ip)) {
+			continue;
+		}
+		else {
+			MPI_Request_free(&transp->req[transp->nproc + ip]);
+			MPI_Request_free(&transp->req[ip]);
+		}
+	}
+
+	return P_Transpose_free(transp);
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_Transpose ( ndims, arr_source, dim_source, lblks_source, arr_dest, 
+	dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+	period )
+	
+	void *arr_source, *arr_dest;
+	int ndims, dim_source, *lblks_source, dim_dest, *lblks_dest, *stride;
+	int *blklen, *overlap;
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+{
+	Transposition transp;
+
+	int ierr;
+	if (ierr = PST_Transpose_init(ndims, arr_source, dim_source, lblks_source, 
+		arr_dest, dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm,
+		period, &transp) != 0)
+	{
+		return ierr;
+	}
+	PST_Transpose_start(&transp);
+	PST_Transpose_end(&transp);
+	PST_Transpose_free(&transp);
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+
+// v.1.7 - manual packing //
+// -------------------------------------------------------------------------- //
+int P_Transpose_mp_init ( ndims, dim_source, lblks_source, dim_dest, 
+	lblks_dest, stride, blklen, overlap, datatype, comm, period, 
+	transp )
+	
+	int ndims, dim_source, *lblks_source, dim_dest, *lblks_dest, *stride;
+	int *blklen, *overlap;
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+	Transposition *transp;
+{
+	int idim, nproc, iproc, ip, strd, count;
+	int wblka, wblkb, begb;
+	int ifsta, ifstb, idir, suma, sumb;
+	MPI_Aint lb, fsize;
+	MPI_Datatype oldtype, *stype, *rtype;
+	int *sbeg, *rbeg;
+
+	int **sdims, **rdims;
+	int	*ssize, *rsize;
+	void **sbuf, **rbuf;
+	
+	int sbuf_size, rbuf_size, ssh, rsh;
+
+// Setting degenerate-success cases conditions for consistency
+//	including: nproc=0, iproc=MPI_UNDEFINED
+	transp->nproc = 0;
+	transp->iproc = MPI_UNDEFINED;
+
+//
+// Check input parameters
+//
+	if (ndims < 2) { return 1; }
+	if (dim_source < 1 || dim_source > ndims) { return 2; }
+	if (dim_dest < 1 || dim_dest > ndims) { return 3; }
+	if (dim_source == dim_dest) { return 4; }
+	for (idim = 0; idim < ndims; idim++) {
+		if (stride[idim] <= 0) { return 5; }
+	}
+	for (idir = 0; idir < 2; idir++) {
+		if (overlap[idir] < 0) { return 6; }
+	}
+	if (ndims > MAX_PARLIB_MP_DIMS) return 7;
+//
+// Define the number of processors in the group and the rank
+//
+	if (comm == MPI_COMM_NULL) { return 0; } // empty communicator
+
+	MPI_Comm_size(comm, &nproc);
+	if (nproc == 0) { return 0; }
+	MPI_Comm_rank(comm, &iproc);
+	if (iproc == MPI_UNDEFINED) { return 0; }
+
+	suma = sumb = 0;
+	for (ip = 0; ip < nproc; ip++) {
+		suma += lblks_source[ip];
+		sumb += lblks_dest[ip];
+		if (lblks_source[ip] <= 0) { return 14; }
+		if (lblks_dest[ip] <= 0) { return 15; }
+	}
+
+	if (lblks_source[iproc] > blklen[dim_source - 1]) { return 8; }
+	if (lblks_dest[iproc] > blklen[dim_dest - 1]) { return 9; }
+	if (suma > stride[dim_source - 1]) { return 10; }
+	if (sumb > stride[dim_dest - 1]) { return 11; }
+	for (idim = 0; idim < ndims; idim++) {
+		if (idim != dim_source - 1 && idim != dim_dest - 1) {
+			if (blklen[idim] > stride[idim]) { return 7; }
+		}
+	}
+	if (overlap[0] > lblks_dest[0]) { return 12; }
+	if (overlap[1] > lblks_dest[nproc - 1]) { return 13; }
+
+	MPI_Type_get_extent(datatype, &lb, &fsize);
+//
+// Allocate memory
+//
+	stype = transp->stype = (MPI_Datatype *)get_plbuf(2 * nproc * sizeof(MPI_Datatype),
+		&transp->buf_id[0]);
+	rtype = transp->rtype = &transp->stype[nproc];
+
+	sbeg = transp->sbeg = (int*)get_plbuf(2 * nproc * sizeof(int), 
+		&transp->buf_id[1]);
+	rbeg = transp->rbeg = &transp->sbeg[nproc];
+
+	transp->req = (MPI_Request *)get_plbuf(2 * nproc * sizeof(MPI_Request),
+		&transp->buf_id[2]);
+
+
+	sdims = transp->sdims = (int**)get_plbuf(2 * nproc * sizeof(int*), 
+		&transp->buf_id[3]);
+	rdims = transp->rdims = &transp->sdims[nproc];
+
+	ssize = transp->ssize = (int*)get_plbuf(2 * nproc * sizeof(int), 
+		&transp->buf_id[4]);
+	rsize = transp->rsize = &transp->ssize[nproc];
+
+	sbuf = transp->sbuf = (void**)get_plbuf(2 * nproc * sizeof(void*), 
+		&transp->buf_id[5]);
+	rbuf = transp->rbuf = &transp->sbuf[nproc];
+
+
+	transp->mem_dims = (int*)get_plbuf(2 * nproc * ndims * sizeof(int), 
+		&transp->buf_id[6]);
+	for (ip = 0; ip < nproc; ip++) {
+		sdims[ip] = transp->sdims[ip] = &transp->mem_dims[ip * ndims];
+		rdims[ip] = transp->rdims[ip] = &transp->mem_dims[(nproc + ip) * ndims];
+	}
+
+//
+// Define data types for the blocks and the beginings of the blocks
+//
+	sbuf_size = 0;
+	rbuf_size = 0;
+
+	ifsta = ifstb = 1;
+	for (ip = 0; ip < nproc; ip++) {
+		wblka = lblks_source[iproc];
+		wblkb = lblks_dest[ip];
+		if (ip > 0 || period) wblkb += overlap[0];
+		if (ip < nproc - 1 || period) wblkb += overlap[1];
+
+		ssize[ip] = 1;
+		stype[ip] = datatype;
+
+		for (idim = 0; idim < ndims; idim++) {
+			if (dim_source - 1 == idim) {
+				count = wblka;
+			}
+			else if (dim_dest - 1 == idim) {
+				count = wblkb;
+			}
+			else {
+				count = blklen[idim];
+			}
+
+			sdims[ip][idim] = count;
+			ssize[ip] *= count;
+		}
+		sbuf_size += ssize[ip];
+
+		wblka = lblks_source[ip];
+		wblkb = lblks_dest[iproc];
+		if (iproc > 0 || period) wblkb += overlap[0];
+		if (iproc < nproc - 1 || period) wblkb += overlap[1];
+
+		rsize[ip] = 1;
+		rtype[ip] = datatype;
+
+		for (idim = 0; idim < ndims; idim++) {
+			if (dim_source - 1 == idim) {
+				count = wblka;
+			}
+			else if (dim_dest - 1 == idim) {
+				count = wblkb;
+			}
+			else {
+				count = blklen[idim];
+			}
+
+			rdims[ip][idim] = count;
+			rsize[ip] *= count;
+		}
+		rbuf_size += rsize[ip];
+
+		begb = ifstb;
+		if (ip > 0 || period) begb -= overlap[0];
+		strd = 1;
+		for (idim = 0; idim < dim_dest - 1; idim++) {
+			if (idim == dim_source - 1) {
+				strd *= blklen[idim];
+			}
+			else {
+				strd *= stride[idim];
+			}
+		}
+		sbeg[ip] = strd*(begb - 1);
+
+		rbeg[ip] = 0;
+		if (iproc > 0 || period) rbeg[ip] -= overlap[0] * strd;
+
+		strd = 1;
+		for (idim = 0; idim < dim_source - 1; idim++) {
+			if (idim == dim_dest - 1) {
+				strd *= blklen[idim];
+			}
+			else {
+				strd *= stride[idim];
+			}
+		}
+		rbeg[ip] += strd*(ifsta - 1);
+		ifsta += lblks_source[ip];
+		ifstb += lblks_dest[ip];
+	}
+
+//
+// Define message buffers
+//
+	transp->mem_sbuf = (void*)get_plbuf(sbuf_size * fsize * sizeof(char),
+		&transp->buf_id[7]);
+	transp->mem_rbuf = (void*)get_plbuf(rbuf_size * fsize * sizeof(char),
+		&transp->buf_id[8]);
+
+	ssh = 0;
+	rsh = 0;
+	for (ip = 0; ip < nproc; ip++) {
+		sbuf[ip] = transp->sbuf[ip] = (void*)((char*)transp->mem_sbuf + ssh);
+		rbuf[ip] = transp->rbuf[ip] = (void*)((char*)transp->mem_rbuf + rsh);
+
+		ssh += ssize[ip] * fsize;
+		rsh += rsize[ip] * fsize;
+	}
+	transp->nproc = nproc;
+	transp->iproc = iproc;
+	transp->comm = comm;
+	transp->fsize = fsize;
+
+	transp->ndims = ndims;
+	memcpy(transp->sstride, stride, ndims * sizeof(int));
+	memcpy(transp->rstride, stride, ndims * sizeof(int));
+	
+	// modifying send-recv strides to take into account arr_src != arr_dest
+	for (idim = 0; idim < ndims; idim++) {
+		if (idim == dim_source - 1) {
+			transp->sstride[idim] = blklen[idim];
+		}
+		if (idim == dim_dest - 1) {
+			transp->rstride[idim] = blklen[idim];
+		}
+	}
+
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_Transpose_mp_start ( arr_source, arr_dest, transp )
+	void *arr_source, *arr_dest;
+	Transposition *transp;
+{
+	char *arr_source_ch = (char *) arr_source;
+	char *arr_dest_ch = (char *) arr_dest;
+
+	int ip;
+	char *src, *dest;
+
+	if (transp->nproc == 0) { return 0; }
+	if (transp->iproc == MPI_UNDEFINED) { return 0; }
+
+//
+// Start the communication
+//
+	for (ip = 0; ip < transp->nproc; ip++) {
+		dest = arr_dest_ch + transp->rbeg[ip] * transp->fsize;
+		src = arr_source_ch + transp->sbeg[ip] * transp->fsize;
+		if ((dest == src) && (transp->iproc == ip)) {
+			transp->req[ip] = MPI_REQUEST_NULL;
+			transp->req[transp->nproc + ip] = MPI_REQUEST_NULL;
+		}
+		else {
+			MPI_Irecv(transp->rbuf[ip], transp->rsize[ip], transp->rtype[ip],
+				ip, 0, transp->comm, 
+				&transp->req[transp->nproc + ip]);
+
+			copy_to_buffer((char*)transp->sbuf[ip], src,
+				transp->ndims, transp->sdims[ip], transp->sstride, transp->fsize);
+
+			MPI_Isend(transp->sbuf[ip], transp->ssize[ip], transp->stype[ip],
+				ip, 0, transp->comm, 
+				&transp->req[ip]);
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_Transpose_mp_end ( arr_source, arr_dest, transp )
+	void *arr_source, *arr_dest;
+	Transposition *transp;
+{
+	char *arr_source_ch = (char *)arr_source;
+	char *arr_dest_ch = (char *)arr_dest;
+
+	int ip;
+	char *src, *dest;
+
+	if (transp->nproc == 0) { return 0; }
+	if (transp->iproc == MPI_UNDEFINED) { return 0; }
+
+	MPI_Waitall(transp->nproc, &transp->req[transp->nproc], MPI_STATUSES_IGNORE);
+
+	for (ip = 0; ip < transp->nproc; ip++) {
+		dest = arr_dest_ch + transp->rbeg[ip] * transp->fsize;
+		src = arr_source_ch + transp->sbeg[ip] * transp->fsize;
+		if ((dest == src) && (transp->iproc == ip)) {
+			continue;
+		}
+		else {
+			copy_from_buffer(dest, (char*)transp->rbuf[ip],
+				transp->ndims, transp->rdims[ip], transp->rstride, transp->fsize);
+		}
+	}
+
+	MPI_Waitall(transp->nproc, transp->req, MPI_STATUSES_IGNORE);
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_Transpose_mp_free ( transp )
+	Transposition *transp;
+{
+	int ip;
+
+	if (transp->nproc == 0) { return 0; }
+	if (transp->iproc == MPI_UNDEFINED) { return 0; }
+
+	free_plbuf(transp->stype, transp->buf_id[0]);
+	free_plbuf(transp->sbeg, transp->buf_id[1]);
+	free_plbuf(transp->req, transp->buf_id[2]);
+
+
+	free_plbuf(transp->sdims, transp->buf_id[3]);
+	free_plbuf(transp->ssize, transp->buf_id[4]);
+	free_plbuf(transp->sbuf, transp->buf_id[5]);
+
+
+	free_plbuf(transp->mem_dims, transp->buf_id[6]);
+	free_plbuf(transp->mem_sbuf, transp->buf_id[7]);
+	free_plbuf(transp->mem_rbuf, transp->buf_id[8]);
+
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_Transpose_mp ( ndims, arr_source, dim_source, lblks_source, arr_dest, 
+	dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+	period )
+	
+	void *arr_source, *arr_dest;
+	int ndims, dim_source, *lblks_source, dim_dest, *lblks_dest, *stride;
+	int *blklen, *overlap;
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+{
+	Transposition transp;
+	int ierr;
+	if (ierr = P_Transpose_mp_init(ndims, dim_source, lblks_source,
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm,
+		period, &transp) != 0)
+	{
+		return ierr;
+	}
+	P_Transpose_mp_start(arr_source, arr_dest, &transp);
+	P_Transpose_mp_end(arr_source, arr_dest, &transp);
+	P_Transpose_mp_free(&transp);
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+
+// v.1.95 - persistent exchanges for manual packing //
+// -------------------------------------------------------------------------- //
+int PST_Transpose_mp_init ( ndims, arr_source, dim_source, lblks_source, 
+	arr_dest, dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, period, 
+	transp )
+	
+	void *arr_source, *arr_dest;
+	int ndims, dim_source, *lblks_source, dim_dest, *lblks_dest, *stride;
+	int *blklen, *overlap;
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+	Transposition *transp;
+{
+	char *arr_source_ch = (char *)arr_source;
+	char *arr_dest_ch = (char *)arr_dest;
+
+	int ip;
+	char *src, *dest;
+
+	int ierr = P_Transpose_mp_init(ndims, dim_source, lblks_source, dim_dest,
+		lblks_dest, stride, blklen, overlap, datatype, comm, period, transp);
+	if (ierr != 0) return ierr;
+
+	transp->psrc = arr_source;
+	transp->pdest = arr_dest;
+
+	if (transp->nproc == 0) { return 0; }
+	if (transp->iproc == MPI_UNDEFINED) { return 0; }
+
+//
+// Setup the communication
+//
+	for (ip = 0; ip < transp->nproc; ip++) {
+		dest = arr_dest_ch + transp->rbeg[ip] * transp->fsize;
+		src = arr_source_ch + transp->sbeg[ip] * transp->fsize;
+		if ((dest == src) && (transp->iproc == ip)) {
+			continue;
+		}
+		else {
+			MPI_Recv_init(transp->rbuf[ip], transp->rsize[ip], transp->rtype[ip],
+				ip, 0, transp->comm, 
+				&transp->req[transp->nproc + ip]);
+
+			MPI_Send_init(transp->sbuf[ip], transp->ssize[ip], transp->stype[ip],
+				ip, 0, transp->comm, 
+				&transp->req[ip]);
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_Transpose_mp_start ( arr_source, arr_dest, transp )
+	void *arr_source, *arr_dest;
+	Transposition *transp;
+{
+	char *arr_source_ch = (char *) arr_source;
+	char *arr_dest_ch = (char *) arr_dest;
+
+	int ip;
+	char *src, *dest;
+
+	if (transp->nproc == 0) { return 0; }
+	if (transp->iproc == MPI_UNDEFINED) { return 0; }
+
+//
+// Start the communication
+//
+	for (ip = 0; ip < transp->nproc; ip++) {
+		dest = arr_dest_ch + transp->rbeg[ip] * transp->fsize;
+		src = arr_source_ch + transp->sbeg[ip] * transp->fsize;
+		if ((dest == src) && (transp->iproc == ip)) {
+			transp->req[ip] = MPI_REQUEST_NULL;
+			transp->req[transp->nproc + ip] = MPI_REQUEST_NULL;
+		}
+		else {
+			MPI_Start(&transp->req[transp->nproc + ip]);
+
+			copy_to_buffer((char*)transp->sbuf[ip], src,
+				transp->ndims, transp->sdims[ip], transp->sstride, transp->fsize);
+
+			MPI_Start(&transp->req[ip]);
+		}
+	}
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_Transpose_mp_end ( arr_source, arr_dest, transp )
+	void *arr_source, *arr_dest;
+	Transposition *transp;
+{
+	return P_Transpose_mp_end(arr_source, arr_dest, transp);
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_Transpose_mp_free ( transp )
+	Transposition *transp;
+{
+	char *arr_source_ch = (char *)transp->psrc;
+	char *arr_dest_ch = (char *)transp->pdest;
+
+	int ip;
+	char *src, *dest;
+
+	if (transp->nproc == 0) { return 0; }
+	if (transp->iproc == MPI_UNDEFINED) { return 0; }
+
+	// we have to free persistent requests first...
+	for (ip = 0; ip < transp->nproc; ip++) {
+		dest = arr_dest_ch + transp->rbeg[ip] * transp->fsize;
+		src = arr_source_ch + transp->sbeg[ip] * transp->fsize;
+		if ((dest == src) && (transp->iproc == ip)) {
+			continue;
+		}
+		else {
+			MPI_Request_free(&transp->req[transp->nproc + ip]);
+			MPI_Request_free(&transp->req[ip]);
+		}
+	}
+
+	return P_Transpose_mp_free(transp);
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int PST_Transpose_mp ( ndims, arr_source, dim_source, lblks_source, arr_dest, 
+	dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+	period )
+	
+	void *arr_source, *arr_dest;
+	int ndims, dim_source, *lblks_source, dim_dest, *lblks_dest, *stride;
+	int *blklen, *overlap;
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+{
+	Transposition transp;
+	int ierr;
+	if (ierr = PST_Transpose_mp_init(ndims, arr_source, dim_source, lblks_source, 
+		arr_dest, dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm,
+		period, &transp) != 0)
+	{
+		return ierr;
+	}
+	PST_Transpose_mp_start(arr_source, arr_dest, &transp);
+	PST_Transpose_mp_end(arr_source, arr_dest, &transp);
+	PST_Transpose_mp_free(&transp);
+	return 0;
+}
+// -------------------------------------------------------------------------- //
+
+
+// v.1.95 - choice subroutines //
+// -------------------------------------------------------------------------- //
+int P_Transpose_opt_init ( ndims, arr_source, dim_source, lblks_source, 
+	arr_dest, dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, period,
+	transp, exch_mode )
+	
+	void *arr_source, *arr_dest;
+	int ndims, dim_source, *lblks_source, dim_dest, *lblks_dest, *stride;
+	int *blklen, *overlap;
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+	Transposition *transp;
+	int exch_mode;
+{
+	if (exch_mode == IS_MPI_TYPED) {
+		return P_Transpose_init(ndims, dim_source, lblks_source,
+			dim_dest, lblks_dest, stride, blklen, overlap, 
+			datatype, comm, period, transp);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK) {
+		return P_Transpose_mp_init(ndims, dim_source, lblks_source,
+			dim_dest, lblks_dest, stride, blklen, overlap,
+			datatype, comm, period, transp);
+	}
+	if (exch_mode == IS_MPI_TYPED_PERSISTENT) {
+		return PST_Transpose_init(ndims, arr_source, dim_source, lblks_source,
+			arr_dest, dim_dest, lblks_dest, stride, blklen, overlap,
+			datatype, comm, period, transp);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK_PERSISTENT) {
+		return PST_Transpose_mp_init(ndims, arr_source, dim_source, lblks_source,
+			arr_dest, dim_dest, lblks_dest, stride, blklen, overlap,
+			datatype, comm, period, transp);
+	}
+
+	return 999;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_Transpose_opt_start ( arr_source, arr_dest, transp, exch_mode )
+	void *arr_source, *arr_dest;
+	Transposition *transp;
+	int exch_mode;
+{
+	if (exch_mode == IS_MPI_TYPED) {
+		return P_Transpose_start(arr_source, arr_dest, transp);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK) {
+		return P_Transpose_mp_start(arr_source, arr_dest, transp);
+	}
+	if (exch_mode == IS_MPI_TYPED_PERSISTENT) {
+		return PST_Transpose_start(transp);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK_PERSISTENT) {
+		return PST_Transpose_mp_start(arr_source, arr_dest, transp);
+	}
+
+	return 999;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_Transpose_opt_end ( arr_source, arr_dest, transp, exch_mode )
+	void *arr_source, *arr_dest;
+	Transposition *transp;
+	int exch_mode;
+{
+	if (exch_mode == IS_MPI_TYPED) {
+		return P_Transpose_end(transp);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK) {
+		return P_Transpose_mp_end(arr_source, arr_dest, transp);
+	}
+	if (exch_mode == IS_MPI_TYPED_PERSISTENT) {
+		return PST_Transpose_end(transp);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK_PERSISTENT) {
+		return PST_Transpose_mp_end(arr_source, arr_dest, transp);
+	}
+
+	return 999;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_Transpose_opt_free ( transp, exch_mode )
+	Transposition *transp;
+	int exch_mode;
+{
+	if (exch_mode == IS_MPI_TYPED) {
+		return P_Transpose_free(transp);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK) {
+		return P_Transpose_mp_free(transp);
+	}
+	if (exch_mode == IS_MPI_TYPED_PERSISTENT) {
+		return PST_Transpose_free(transp);
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK_PERSISTENT) {
+		return PST_Transpose_mp_free(transp);
+	}
+
+	return 999;
+}
+// -------------------------------------------------------------------------- //
+
+// -------------------------------------------------------------------------- //
+int P_Transpose_opt ( ndims, arr_source, dim_source, lblks_source, arr_dest, 
+	dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+	period, exch_mode )
+	
+	void *arr_source, *arr_dest;
+	int ndims, dim_source, *lblks_source, dim_dest, *lblks_dest, *stride;
+	int *blklen, *overlap;
+	MPI_Datatype datatype;
+	MPI_Comm comm;
+	int period;
+	int exch_mode;
+{
+	Transposition transp;
+	int ierr;
+
+	if (exch_mode == IS_MPI_TYPED) {
+		if (ierr = P_Transpose_init(ndims, dim_source, lblks_source,
+			dim_dest, lblks_dest, stride, blklen, overlap,
+			datatype, comm, period, &transp) != 0) 
+		{
+			return ierr;
+		}
+
+		P_Transpose_start(arr_source, arr_dest, &transp);
+		P_Transpose_end(&transp);
+		P_Transpose_free(&transp);
+		return 0;
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK) {
+		if (ierr = P_Transpose_mp_init(ndims, dim_source, lblks_source,
+			dim_dest, lblks_dest, stride, blklen, overlap,
+			datatype, comm, period, &transp) != 0) 
+		{
+			return ierr;
+		}
+
+		P_Transpose_mp_start(arr_source, arr_dest, &transp);
+		P_Transpose_mp_end(arr_source, arr_dest, &transp);
+		P_Transpose_mp_free(&transp);
+		return 0;
+	}
+	if (exch_mode == IS_MPI_TYPED_PERSISTENT) {
+		if (ierr = PST_Transpose_init(ndims, arr_source, dim_source, lblks_source,
+			arr_dest, dim_dest, lblks_dest, stride, blklen, overlap,
+			datatype, comm, period, &transp) != 0) 
+		{
+			return ierr;
+		}
+
+		PST_Transpose_start(&transp);
+		PST_Transpose_end(&transp);
+		PST_Transpose_free(&transp);
+		return 0;
+	}
+	if (exch_mode == IS_MPI_MANUAL_PACK_PERSISTENT) {
+		if (ierr = PST_Transpose_mp_init(ndims, arr_source, dim_source, lblks_source,
+			arr_dest, dim_dest, lblks_dest, stride, blklen, overlap,
+			datatype, comm, period, &transp) != 0) 
+		{
+			return ierr;
+		}
+
+		PST_Transpose_mp_start(arr_source, arr_dest, &transp);
+		PST_Transpose_mp_end(arr_source, arr_dest, &transp);
+		PST_Transpose_mp_free(&transp);
+		return 0;
+	}
+
+	return 999;
+}
+// -------------------------------------------------------------------------- //
\ No newline at end of file
diff --git a/ParLib.src/transposef.c b/ParLib.src/transposef.c
index b45cc0bcd6be94c67d14a3a0020b4f9c86bfbf8b..dc6c42a471fa3ebb5443333de5a05eef35daa458 100644
--- a/ParLib.src/transposef.c
+++ b/ParLib.src/transposef.c
@@ -1,6 +1,8 @@
 #include <stdlib.h>
 #include "parlib.h"
 
+
+// -------------------------------------------------------------------------- //
 #ifdef FORTRANUNDERSCORE
 void p_transpose_init_ ( ndims, dim_source, lblks_source, dim_dest, 
 		lblks_dest, stride, blklen, overlap, datatype, comm, period,
@@ -14,15 +16,16 @@ void p_transpose_init ( ndims, dim_source, lblks_source, dim_dest,
 		lblks_dest, stride, blklen, overlap, datatype, comm, period,
 		transp, ierr )
 #endif
+
 	MPI_Fint *ndims, *dim_source, *lblks_source, *dim_dest, *lblks_dest; 
 	MPI_Fint *stride, *blklen, *overlap, *datatype, *comm, *period;
-   MPI_Fint **transp, *ierr;
+	MPI_Fint **transp, *ierr;
 {
-	*transp = (MPI_Fint *) malloc ( sizeof(Transposition) );
-	*ierr = P_Transpose_init ( *ndims, *dim_source, lblks_source, 
-		*dim_dest, lblks_dest, stride, blklen, overlap, 
+	*transp = (MPI_Fint *)malloc(sizeof(Transposition));
+	*ierr = P_Transpose_init(*ndims, *dim_source, lblks_source,
+		*dim_dest, lblks_dest, stride, blklen, overlap,
 		MPI_Type_f2c(*datatype), MPI_Comm_f2c(*comm), *period,
-		(Transposition *) *transp );
+		(Transposition *)*transp);
 }
 
 #ifdef FORTRANUNDERSCORE
@@ -32,11 +35,12 @@ void p_transpose_start__ ( arr_source, arr_dest, transp, ierr )
 #else
 void p_transpose_start ( arr_source, arr_dest, transp, ierr )
 #endif
+
 	void *arr_source, *arr_dest;
 	MPI_Fint **transp, *ierr;
 {
-   *ierr = P_Transpose_start ( arr_source, arr_dest, 
-		(Transposition *) *transp );
+	*ierr = P_Transpose_start(arr_source, arr_dest,
+		(Transposition *)*transp);
 }
 
 #ifdef FORTRANUNDERSCORE
@@ -46,9 +50,10 @@ void p_transpose_end__ ( transp, ierr )
 #else
 void p_transpose_end ( transp, ierr )
 #endif
+
 	MPI_Fint **transp, *ierr;
 {
-	*ierr = P_Transpose_end ( (Transposition *) *transp );
+	*ierr = P_Transpose_end((Transposition *)*transp);
 }
 
 #ifdef FORTRANUNDERSCORE
@@ -58,10 +63,11 @@ void p_transpose_free__ ( transp, ierr )
 #else
 void p_transpose_free ( transp, ierr )
 #endif
+	
 	MPI_Fint **transp, *ierr;
 {
-	*ierr = P_Transpose_free ( (Transposition *) *transp );
-	free ( *transp ); 
+	*ierr = P_Transpose_free((Transposition *)*transp);
+	free(*transp);
 }
 
 #ifdef FORTRANUNDERSCORE
@@ -77,12 +83,514 @@ void p_transpose ( ndims, arr_source, dim_source, lblks_source, arr_dest,
 		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
 		period, ierr )
 #endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint *ndims, *dim_source, *lblks_source, *dim_dest, *lblks_dest; 
+	MPI_Fint *stride, *blklen, *overlap, *datatype, *comm, *period;
+	MPI_Fint *ierr;
+{
+	*ierr = P_Transpose(*ndims, arr_source, *dim_source, lblks_source,
+		arr_dest, *dim_dest, lblks_dest, stride, blklen, overlap,
+		MPI_Type_f2c(*datatype), MPI_Comm_f2c(*comm), *period);
+}
+// -------------------------------------------------------------------------- //
+
+// v.1.95 - persistent exchanges //
+// -------------------------------------------------------------------------- //
+#ifdef FORTRANUNDERSCORE
+void pst_transpose_init_( ndims, arr_source, dim_source, lblks_source, arr_dest, dim_dest,
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		transp, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_transpose_init__( ndims, arr_source, dim_source, lblks_source, arr_dest, dim_dest,
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		transp, ierr )
+#else
+void pst_transpose_init ( ndims, arr_source, dim_source, lblks_source, arr_dest, dim_dest,
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		transp, ierr )
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint *ndims, *dim_source, *lblks_source, *dim_dest, *lblks_dest; 
+	MPI_Fint *stride, *blklen, *overlap, *datatype, *comm, *period;
+	MPI_Fint **transp, *ierr;
+{
+	*transp = (MPI_Fint *)malloc(sizeof(Transposition));
+	*ierr = PST_Transpose_init(*ndims, arr_source, *dim_source, lblks_source, arr_dest,
+		*dim_dest, lblks_dest, stride, blklen, overlap,
+		MPI_Type_f2c(*datatype), MPI_Comm_f2c(*comm), *period,
+		(Transposition *)*transp);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_transpose_start_ ( transp, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_transpose_start__ ( transp, ierr )
+#else
+void pst_transpose_start ( transp, ierr )
+#endif
+
+	MPI_Fint **transp, *ierr;
+{
+	*ierr = PST_Transpose_start((Transposition *)*transp);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_transpose_end_ ( transp, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_transpose_end__ ( transp, ierr )
+#else
+void pst_transpose_end ( transp, ierr )
+#endif
+
+	MPI_Fint **transp, *ierr;
+{
+	*ierr = PST_Transpose_end((Transposition *)*transp);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_transpose_free_ ( transp, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_transpose_free__ ( transp, ierr )
+#else
+void pst_transpose_free ( transp, ierr )
+#endif
+	
+	MPI_Fint **transp, *ierr;
+{
+	*ierr = PST_Transpose_free((Transposition *)*transp);
+	free(*transp);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_transpose_ ( ndims, arr_source, dim_source, lblks_source, arr_dest,
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+		period, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_transpose__ ( ndims, arr_source, dim_source, lblks_source, arr_dest,
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+		period, ierr )
+#else
+void pst_transpose ( ndims, arr_source, dim_source, lblks_source, arr_dest, 
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+		period, ierr )
+#endif
+
 	void *arr_source, *arr_dest;
 	MPI_Fint *ndims, *dim_source, *lblks_source, *dim_dest, *lblks_dest; 
 	MPI_Fint *stride, *blklen, *overlap, *datatype, *comm, *period;
 	MPI_Fint *ierr;
 {
-	*ierr = P_Transpose ( *ndims, arr_source, *dim_source, lblks_source, 
-		arr_dest, *dim_dest, lblks_dest, stride, blklen, overlap, 
-		MPI_Type_f2c(*datatype), MPI_Comm_f2c(*comm), *period );
+	*ierr = PST_Transpose(*ndims, arr_source, *dim_source, lblks_source,
+		arr_dest, *dim_dest, lblks_dest, stride, blklen, overlap,
+		MPI_Type_f2c(*datatype), MPI_Comm_f2c(*comm), *period);
+}
+// -------------------------------------------------------------------------- //
+
+// v.1.7 - manual packing //
+// -------------------------------------------------------------------------- //
+#ifdef FORTRANUNDERSCORE
+void p_transpose_mp_init_ ( ndims, dim_source, lblks_source, dim_dest, 
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		transp, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_transpose_mp_init__ ( ndims, dim_source, lblks_source, dim_dest, 
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		transp, ierr )
+#else
+void p_transpose_mp_init ( ndims, dim_source, lblks_source, dim_dest, 
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		transp, ierr )
+#endif
+
+	MPI_Fint *ndims, *dim_source, *lblks_source, *dim_dest, *lblks_dest;
+	MPI_Fint *stride, *blklen, *overlap, *datatype, *comm, *period;
+	MPI_Fint **transp, *ierr;
+{
+	*transp = (MPI_Fint *)malloc(sizeof(Transposition));
+	*ierr = P_Transpose_mp_init(*ndims, *dim_source, lblks_source,
+		*dim_dest, lblks_dest, stride, blklen, overlap,
+		MPI_Type_f2c(*datatype), MPI_Comm_f2c(*comm), *period,
+		(Transposition *)*transp);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_transpose_mp_start_ ( arr_source, arr_dest, transp, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_transpose_mp_start__ ( arr_source, arr_dest, transp, ierr )
+#else
+void p_transpose_mp_start ( arr_source, arr_dest, transp, ierr )
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint **transp, *ierr;
+{
+	*ierr = P_Transpose_mp_start(arr_source, arr_dest,
+		(Transposition *)*transp);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_transpose_mp_end_ ( arr_source, arr_dest, transp, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_transpose_mp_end__ ( arr_source, arr_dest, transp, ierr )
+#else
+void p_transpose_mp_end ( arr_source, arr_dest, transp, ierr )
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint **transp, *ierr;
+{
+	*ierr = P_Transpose_mp_end(arr_source, arr_dest, (Transposition *)*transp);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_transpose_mp_free_ ( transp, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_transpose_mp_free__ ( transp, ierr )
+#else
+void p_transpose_mp_free ( transp, ierr )
+#endif
+	
+	MPI_Fint **transp, *ierr;
+{
+	*ierr = P_Transpose_mp_free((Transposition *)*transp);
+	free(*transp);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_transpose_mp_ ( ndims, arr_source, dim_source, lblks_source, arr_dest,
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+		period, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_transpose_mp__ ( ndims, arr_source, dim_source, lblks_source, arr_dest,
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+		period, ierr )
+#else
+void p_transpose_mp ( ndims, arr_source, dim_source, lblks_source, arr_dest, 
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+		period, ierr )
+#endif
+	
+	void *arr_source, *arr_dest;
+	MPI_Fint *ndims, *dim_source, *lblks_source, *dim_dest, *lblks_dest;
+	MPI_Fint *stride, *blklen, *overlap, *datatype, *comm, *period;
+	MPI_Fint *ierr;
+{
+	*ierr = P_Transpose_mp(*ndims, arr_source, *dim_source, lblks_source,
+		arr_dest, *dim_dest, lblks_dest, stride, blklen, overlap,
+		MPI_Type_f2c(*datatype), MPI_Comm_f2c(*comm), *period);
+}
+// -------------------------------------------------------------------------- //
+
+
+// v.1.95 - persistent exchanges for manual packing //
+// -------------------------------------------------------------------------- //
+#ifdef FORTRANUNDERSCORE
+void pst_transpose_mp_init_( ndims, arr_source, dim_source, lblks_source, arr_dest, dim_dest,
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		transp, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_transpose_mp_init__( ndims, arr_source, dim_source, lblks_source, arr_dest, dim_dest,
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		transp, ierr )
+#else
+void pst_transpose_mp_init ( ndims, arr_source, dim_source, lblks_source, arr_dest, dim_dest,
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		transp, ierr )
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint *ndims, *dim_source, *lblks_source, *dim_dest, *lblks_dest; 
+	MPI_Fint *stride, *blklen, *overlap, *datatype, *comm, *period;
+	MPI_Fint **transp, *ierr;
+{
+	*transp = (MPI_Fint *)malloc(sizeof(Transposition));
+	*ierr = PST_Transpose_mp_init(*ndims, arr_source, *dim_source, lblks_source, arr_dest,
+		*dim_dest, lblks_dest, stride, blklen, overlap,
+		MPI_Type_f2c(*datatype), MPI_Comm_f2c(*comm), *period,
+		(Transposition *)*transp);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_transpose_mp_start_ ( arr_source, arr_dest, transp, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_transpose_mp_start__ ( arr_source, arr_dest, transp, ierr )
+#else
+void pst_transpose_mp_start ( arr_source, arr_dest, transp, ierr )
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint **transp, *ierr;
+{
+	*ierr = PST_Transpose_mp_start(arr_source, arr_dest,
+		(Transposition *)*transp);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_transpose_mp_end_ ( arr_source, arr_dest, transp, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_transpose_mp_end__ ( arr_source, arr_dest, transp, ierr )
+#else
+void pst_transpose_mp_end ( arr_source, arr_dest, transp, ierr )
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint **transp, *ierr;
+{
+	*ierr = PST_Transpose_mp_end(arr_source, arr_dest, (Transposition *)*transp);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_transpose_mp_free_ ( transp, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_transpose_mp_free__ ( transp, ierr )
+#else
+void pst_transpose_mp_free ( transp, ierr )
+#endif
+	
+	MPI_Fint **transp, *ierr;
+{
+	*ierr = PST_Transpose_mp_free((Transposition *)*transp);
+	free(*transp);
+}
+
+#ifdef FORTRANUNDERSCORE
+void pst_transpose_mp_ ( ndims, arr_source, dim_source, lblks_source, arr_dest,
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+		period, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void pst_transpose_mp__ ( ndims, arr_source, dim_source, lblks_source, arr_dest,
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+		period, ierr )
+#else
+void pst_transpose_mp ( ndims, arr_source, dim_source, lblks_source, arr_dest, 
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+		period, ierr )
+#endif
+	
+	void *arr_source, *arr_dest;
+	MPI_Fint *ndims, *dim_source, *lblks_source, *dim_dest, *lblks_dest;
+	MPI_Fint *stride, *blklen, *overlap, *datatype, *comm, *period;
+	MPI_Fint *ierr;
+{
+	*ierr = PST_Transpose_mp(*ndims, arr_source, *dim_source, lblks_source,
+		arr_dest, *dim_dest, lblks_dest, stride, blklen, overlap,
+		MPI_Type_f2c(*datatype), MPI_Comm_f2c(*comm), *period);
+}
+// -------------------------------------------------------------------------- //
+
+
+// v.1.95 - choice subroutines //
+// -------------------------------------------------------------------------- //
+#ifdef FORTRANUNDERSCORE
+void p_transpose_opt_init_( ndims, arr_source, dim_source, lblks_source, arr_dest, dim_dest,
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		transp, exch_mode, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_transpose_opt_init__( ndims, arr_source, dim_source, lblks_source, arr_dest, dim_dest,
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		transp, exch_mode, ierr)
+#else
+void p_transpose_opt_init ( ndims, arr_source, dim_source, lblks_source, arr_dest, dim_dest,
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		transp, exch_mode, ierr)
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint *ndims, *dim_source, *lblks_source, *dim_dest, *lblks_dest; 
+	MPI_Fint *stride, *blklen, *overlap, *datatype, *comm, *period;
+	MPI_Fint **transp, *ierr, *exch_mode;
+{
+	*transp = (MPI_Fint *)malloc(sizeof(Transposition));
+	*ierr = P_Transpose_opt_init(*ndims, arr_source, *dim_source, lblks_source, arr_dest,
+		*dim_dest, lblks_dest, stride, blklen, overlap,
+		MPI_Type_f2c(*datatype), MPI_Comm_f2c(*comm), *period,
+		(Transposition *)*transp, *exch_mode);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_transpose_opt_start_ ( arr_source, arr_dest, transp, exch_mode, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_transpose_opt_start__ ( arr_source, arr_dest, transp, exch_mode, ierr )
+#else
+void p_transpose_opt_start ( arr_source, arr_dest, transp, exch_mode, ierr )
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint **transp, *ierr, *exch_mode;
+{
+	*ierr = P_Transpose_opt_start(arr_source, arr_dest,
+		(Transposition *)*transp, *exch_mode);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_transpose_opt_end_ ( arr_source, arr_dest, transp, exch_mode, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_transpose_opt_end__ ( arr_source, arr_dest, transp, exch_mode, ierr )
+#else
+void p_transpose_opt_end ( arr_source, arr_dest, transp, exch_mode, ierr )
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint **transp, *ierr, *exch_mode;
+{
+	*ierr = P_Transpose_opt_end(arr_source, arr_dest, 
+		(Transposition *)*transp, *exch_mode);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_transpose_opt_free_ ( transp, exch_mode, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_transpose_opt_free__ ( transp, exch_mode, ierr )
+#else
+void p_transpose_opt_free ( transp, exch_mode, ierr )
+#endif
+	
+	MPI_Fint **transp, *ierr, *exch_mode;
+{
+	*ierr = P_Transpose_opt_free((Transposition *)*transp, *exch_mode);
+	free(*transp);
+}
+
+#ifdef FORTRANUNDERSCORE
+void p_transpose_opt_ ( ndims, arr_source, dim_source, lblks_source, arr_dest,
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+		period, exch_mode, ierr)
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void p_transpose_opt__ (ndims, arr_source, dim_source, lblks_source, arr_dest,
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+		period, exch_mode, ierr)
+#else
+void p_transpose_opt ( ndims, arr_source, dim_source, lblks_source, arr_dest, 
+		dim_dest, lblks_dest, stride, blklen, overlap, datatype, comm, 
+		period, exch_mode, ierr)
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint *ndims, *dim_source, *lblks_source, *dim_dest, *lblks_dest; 
+	MPI_Fint *stride, *blklen, *overlap, *datatype, *comm, *period;
+	MPI_Fint *ierr, *exch_mode;
+{
+	*ierr = P_Transpose_opt(*ndims, arr_source, *dim_source, lblks_source,
+		arr_dest, *dim_dest, lblks_dest, stride, blklen, overlap,
+		MPI_Type_f2c(*datatype), MPI_Comm_f2c(*comm), *period, *exch_mode);
+}
+// -------------------------------------------------------------------------- //
+
+
+// v.2.0 - regular communications [removed only on correct program exit] //
+// -------------------------------------------------------------------------- //
+#ifdef FORTRANUNDERSCORE
+void reg_transpose_( ndims, arr_source, dim_source, lblks_source, arr_dest, dim_dest,
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		exch_id, exch_mode, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void reg_transpose__( ndims, arr_source, dim_source, lblks_source, arr_dest, dim_dest,
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		exch_id, exch_mode, ierr)
+#else
+void reg_transpose ( ndims, arr_source, dim_source, lblks_source, arr_dest, dim_dest,
+		lblks_dest, stride, blklen, overlap, datatype, comm, period,
+		exch_id, exch_mode, ierr)
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint *ndims, *dim_source, *lblks_source, *dim_dest, *lblks_dest; 
+	MPI_Fint *stride, *blklen, *overlap, *datatype, *comm, *period;
+	MPI_Fint *ierr, *exch_id, *exch_mode;
+{
+	Transposition *transp;
+	transp = (Transposition *)malloc(sizeof(Transposition));
+
+	*ierr = P_Transpose_opt_init(*ndims, arr_source, *dim_source, lblks_source, arr_dest,
+		*dim_dest, lblks_dest, stride, blklen, overlap,
+		MPI_Type_f2c(*datatype), MPI_Comm_f2c(*comm), *period,
+		transp, *exch_mode);
+	if ((int)*ierr != 0) return;
+
+	*exch_id = save_transp_handle(transp, (int)*exch_mode);
+}
+
+#ifdef FORTRANUNDERSCORE
+void start_transpose_ ( arr_source, arr_dest, exch_id, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void start_transpose__ ( arr_source, arr_dest, exch_id, ierr )
+#else
+void start_transpose ( arr_source, arr_dest, exch_id, ierr )
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint *ierr, *exch_id;
+{
+	int exch_mode;
+	Transposition *transp;
+	
+	get_transp_handle(&transp, &exch_mode, (int)*exch_id);
+		
+	*ierr = P_Transpose_opt_start(arr_source, arr_dest, transp, exch_mode);
+}
+
+#ifdef FORTRANUNDERSCORE
+void end_transpose_ ( arr_source, arr_dest, exch_id, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void end_transpose__ ( arr_source, arr_dest, exch_id, ierr )
+#else
+void end_transpose ( arr_source, arr_dest, exch_id, ierr )
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint *ierr, *exch_id;
+{
+	int exch_mode;
+	Transposition *transp;
+	
+	get_transp_handle(&transp, &exch_mode, (int)*exch_id);
+
+	*ierr = P_Transpose_opt_end(arr_source, arr_dest, transp, exch_mode);
+}
+
+#ifdef FORTRANUNDERSCORE
+void run_transpose_ ( arr_source, arr_dest, exch_id, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void run_transpose__ ( arr_source, arr_dest, exch_id, ierr )
+#else
+void run_transpose ( arr_source, arr_dest, exch_id, ierr )
+#endif
+
+	void *arr_source, *arr_dest;
+	MPI_Fint *ierr, *exch_id;
+{
+	int exch_mode;
+	Transposition *transp;
+
+	get_transp_handle(&transp, &exch_mode, (int)*exch_id);
+
+	*ierr = P_Transpose_opt_start(arr_source, arr_dest, transp, exch_mode);
+	if ((int)*ierr != 0) return;
+
+	*ierr = P_Transpose_opt_end(arr_source, arr_dest, transp, exch_mode);
+}
+
+#ifdef FORTRANUNDERSCORE
+void unreg_transpose_ ( exch_id, ierr )
+#elif defined(FORTRANDOUBLEUNDERSCORE)
+void unreg_transpose__ ( exch_id, ierr )
+#else
+void unreg_transpose ( exch_id, ierr )
+#endif
+
+	MPI_Fint *ierr, *exch_id;
+{
+	int exch_mode;
+	Transposition *transp;
+
+	get_transp_handle(&transp, &exch_mode, (int)*exch_id);
+
+	*ierr = P_Transpose_opt_free(transp, exch_mode);
+	if ((int)*ierr != 0) return;
+	free(transp);
+
+	remove_transp_handle((int)*exch_id);
 }
+// -------------------------------------------------------------------------- //