thread count sheba reduced

a77e6e9a · 数学の武士 · abff347e · a77e6e9a · a77e6e9a
Commit a77e6e9a authored 1 year ago by 数学の武士
--- a/srcCU/sfx_compute_esm.cu
+++ b/srcCU/sfx_compute_esm.cu
-#include <cmath>
 #include <iostream>
 #include "../includeCU/sfx_compute_esm.cuh"
 #include "../includeCU/sfx_surface.cuh"
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+   if (code != cudaSuccess) 
+   {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
 template<typename T>
 __device__ void get_charnock_roughness(T &z0_m, T &u_dyn0,
    const T h, const T U,
@@ -404,6 +413,8 @@ void compute_flux_esm_gpu(T *zeta_, T *Rib_, T *Re_, T *B_, T *z0_m_, T *z0_t_,
    Pr_m, nu_air, g, 
    maxiters_charnock, maxiters_convection, 
    grid_size);
+    gpuErrchk( cudaPeekAtLastError() );
 }
 template void compute_flux_esm_gpu(float *zeta_, float *Rib_, float *Re_, float *B_, float *z0_m_, float *z0_t_, float *Rib_conv_lim_, float *Cm_, float *Ct_, float *Km_, float *Pr_t_inv_,

--- a/srcCU/sfx_compute_sheba.cu
+++ b/srcCU/sfx_compute_sheba.cu
-#include <cmath>
 #include <iostream>
 #include "../includeCU/sfx_compute_sheba.cuh"
 #include "../includeCU/sfx_surface.cuh"
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+   if (code != cudaSuccess) 
+   {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
 template<typename T>
 __device__ void get_charnock_roughness(T &z0_m, T &u_dyn0,
    const T h, const T U,
@@ -367,17 +376,17 @@ __global__ void kernel_compute_flux_sheba(T *zeta_, T *Rib_, T *Re_, T *B_, T *z
        Km = kappa * Cm * U * h / phi_m;
        Pr_t_inv = phi_m / phi_h;
-        zeta_[index]         = 0.0;
+        zeta_[index]         = zeta;
-        Rib_[index]          = 0.0;
+        Rib_[index]          = Rib;
-        Re_[index]           = 0.0;
+        Re_[index]           = Re;
-        B_[index]            = 0.0;
+        B_[index]            = B;
-        z0_m_[index]         = 0.0;
+        z0_m_[index]         = z0_m;
-        z0_t_[index]         = 0.0;
+        z0_t_[index]         = z0_t;
        Rib_conv_lim_[index] = 0.0;
-        Cm_[index]           = 0.0;
+        Cm_[index]           = Cm;
-        Ct_[index]           = 0.0;
+        Ct_[index]           = Ct;
-        Km_[index]           = 0.0;
+        Km_[index]           = Km;
-        Pr_t_inv_[index]     = 0.0;
+        Pr_t_inv_[index]     = Pr_t_inv;
    }
 }
@@ -426,8 +435,8 @@ void compute_flux_sheba_gpu(T *zeta_, T *Rib_, T *Re_, T *B_, T *z0_m_, T *z0_t_
    const int maxiters_charnock,
    const int grid_size)
 {
-    const int BlockCount = int(ceil(float(grid_size) / 1024.0));
+    const int BlockCount = int(ceil(float(grid_size) / 512.0));
-    dim3 cuBlock = dim3(1024, 1, 1);
+    dim3 cuBlock = dim3(512, 1, 1);
 	dim3 cuGrid = dim3(BlockCount, 1, 1);
    kernel_compute_flux_sheba<<<cuGrid, cuBlock>>>(zeta_, Rib_, Re_, B_, z0_m_, z0_t_, Rib_conv_lim_, Cm_, Ct_, Km_, Pr_t_inv_,
@@ -444,6 +453,7 @@ void compute_flux_sheba_gpu(T *zeta_, T *Rib_, T *Re_, T *B_, T *z0_m_, T *z0_t_
    Pr_m, nu_air, g, 
    maxiters_charnock, 
    grid_size);
+    gpuErrchk( cudaPeekAtLastError() );
 }
 template void compute_flux_sheba_gpu(float *zeta_, float *Rib_, float *Re_, float *B_, float *z0_m_, float *z0_t_, float *Rib_conv_lim_, float *Cm_, float *Ct_, float *Km_, float *Pr_t_inv_,