diff --git a/srcCU/sfx_compute_esm.cu b/srcCU/sfx_compute_esm.cu
index 302494a0f511f2d5205c4ab7bbd9129cc085acc8..354be330976dea575516ee81a927120262738773 100644
--- a/srcCU/sfx_compute_esm.cu
+++ b/srcCU/sfx_compute_esm.cu
@@ -1,8 +1,17 @@
-#include <cmath>
 #include <iostream>
 #include "../includeCU/sfx_compute_esm.cuh"
 #include "../includeCU/sfx_surface.cuh"
 
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+   if (code != cudaSuccess) 
+   {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
 template<typename T>
 __device__ void get_charnock_roughness(T &z0_m, T &u_dyn0,
     const T h, const T U,
@@ -404,6 +413,8 @@ void compute_flux_esm_gpu(T *zeta_, T *Rib_, T *Re_, T *B_, T *z0_m_, T *z0_t_,
     Pr_m, nu_air, g, 
     maxiters_charnock, maxiters_convection, 
     grid_size);
+
+    gpuErrchk( cudaPeekAtLastError() );
 }
 
 template void compute_flux_esm_gpu(float *zeta_, float *Rib_, float *Re_, float *B_, float *z0_m_, float *z0_t_, float *Rib_conv_lim_, float *Cm_, float *Ct_, float *Km_, float *Pr_t_inv_,
diff --git a/srcCU/sfx_compute_sheba.cu b/srcCU/sfx_compute_sheba.cu
index fd80dd51e56be4f09803bf52b6516ae5ab05913a..56a93bb49c28ac039c681e6ad71ffc1aa6f33f87 100644
--- a/srcCU/sfx_compute_sheba.cu
+++ b/srcCU/sfx_compute_sheba.cu
@@ -1,8 +1,17 @@
-#include <cmath>
 #include <iostream>
 #include "../includeCU/sfx_compute_sheba.cuh"
 #include "../includeCU/sfx_surface.cuh"
 
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+   if (code != cudaSuccess) 
+   {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
 template<typename T>
 __device__ void get_charnock_roughness(T &z0_m, T &u_dyn0,
     const T h, const T U,
@@ -367,17 +376,17 @@ __global__ void kernel_compute_flux_sheba(T *zeta_, T *Rib_, T *Re_, T *B_, T *z
         Km = kappa * Cm * U * h / phi_m;
         Pr_t_inv = phi_m / phi_h;
 
-        zeta_[index]         = 0.0;
-        Rib_[index]          = 0.0;
-        Re_[index]           = 0.0;
-        B_[index]            = 0.0;
-        z0_m_[index]         = 0.0;
-        z0_t_[index]         = 0.0;
+        zeta_[index]         = zeta;
+        Rib_[index]          = Rib;
+        Re_[index]           = Re;
+        B_[index]            = B;
+        z0_m_[index]         = z0_m;
+        z0_t_[index]         = z0_t;
         Rib_conv_lim_[index] = 0.0;
-        Cm_[index]           = 0.0;
-        Ct_[index]           = 0.0;
-        Km_[index]           = 0.0;
-        Pr_t_inv_[index]     = 0.0;
+        Cm_[index]           = Cm;
+        Ct_[index]           = Ct;
+        Km_[index]           = Km;
+        Pr_t_inv_[index]     = Pr_t_inv;
     }
 }
 
@@ -426,8 +435,8 @@ void compute_flux_sheba_gpu(T *zeta_, T *Rib_, T *Re_, T *B_, T *z0_m_, T *z0_t_
     const int maxiters_charnock,
     const int grid_size)
 {
-    const int BlockCount = int(ceil(float(grid_size) / 1024.0));
-    dim3 cuBlock = dim3(1024, 1, 1);
+    const int BlockCount = int(ceil(float(grid_size) / 512.0));
+    dim3 cuBlock = dim3(512, 1, 1);
 	dim3 cuGrid = dim3(BlockCount, 1, 1);
 
     kernel_compute_flux_sheba<<<cuGrid, cuBlock>>>(zeta_, Rib_, Re_, B_, z0_m_, z0_t_, Rib_conv_lim_, Cm_, Ct_, Km_, Pr_t_inv_,
@@ -444,6 +453,7 @@ void compute_flux_sheba_gpu(T *zeta_, T *Rib_, T *Re_, T *B_, T *z0_m_, T *z0_t_
     Pr_m, nu_air, g, 
     maxiters_charnock, 
     grid_size);
+    gpuErrchk( cudaPeekAtLastError() );
 }
 
 template void compute_flux_sheba_gpu(float *zeta_, float *Rib_, float *Re_, float *B_, float *z0_m_, float *z0_t_, float *Rib_conv_lim_, float *Cm_, float *Ct_, float *Km_, float *Pr_t_inv_,