#include "../include/matvec.hpp"
#include "MemoryProcessing.h"

#ifdef INCLUDE_CUDA
    #include "MemoryProcessing.cuh"
    #include "../include/matvec.cuh"
#endif

namespace icethermo
{
    /**
     * @brief Sums the first `size` elements of `vec`.
     *
     * When compiled with INCLUDE_CUDA and instantiated with MemType::GPU, the
     * input is first copied into a temporary CPU buffer through memproc and
     * accumulated there; in every other case `vec` is read directly.
     *
     * @tparam NumType  element type, also used for the accumulator
     * @tparam memtype  memory type tag describing where `vec` resides
     * @param  vec      pointer to the input elements
     * @param  size     number of elements to sum; a non-positive value yields 0
     * @return the elements added up in index order
     */
    template <typename NumType, MemType memtype>
    NumType sum_vec(const NumType* vec, const int size)
    {
        NumType accumulate_res = 0.0;

        // Nothing to add. Returning here also keeps a zero/negative size from
        // being turned into a zero or wrapped-around byte count below.
        if (size <= 0)
            return accumulate_res;

    #ifdef INCLUDE_CUDA
        if(memtype == MemType::GPU) 
        {
            // The pointer is handed to realloc/dealloc by reference, so it must
            // never hold an indeterminate value.
            NumType* cpu_holder = nullptr;
            size_t cpu_holder_size = 0;

            memproc::realloc<MemType::CPU>((void *&)(cpu_holder), cpu_holder_size, size * sizeof(NumType));
            memproc::memcopy<MemType::CPU, MemType::GPU>(cpu_holder, vec, cpu_holder_size);

            for (int i = 0; i < size; i++)
            {
                accumulate_res += cpu_holder[i];  
            }

            memproc::dealloc<MemType::CPU>((void *&)(cpu_holder));

            return accumulate_res;   
        }
    #endif

        for (int i = 0; i < size; i++)
            accumulate_res += vec[i];  
        return accumulate_res;   
    }

    /**
     * @brief Sums the elements of `vec` in the half-open index range [start, end).
     *
     * When compiled with INCLUDE_CUDA and instantiated with MemType::GPU, the
     * requested slice is first copied into a temporary CPU buffer through
     * memproc and accumulated there; otherwise `vec` is read directly.
     *
     * @tparam NumType  element type, also used for the accumulator
     * @tparam memtype  memory type tag describing where `vec` resides
     * @param  vec      pointer to the input elements
     * @param  start    index of the first element to include
     * @param  end      index one past the last element to include
     * @return the sum of vec[start] .. vec[end - 1]; 0 when end <= start
     */
    template <typename NumType, MemType memtype>
    NumType sum_vec(const NumType* vec, const int start, const int end)
    {
        NumType accumulate_res = 0.0;

        // Empty or inverted range: the plain loop below would add nothing, and
        // the GPU path must not compute a zero/negative buffer length.
        if (end <= start)
            return accumulate_res;

    #ifdef INCLUDE_CUDA
        if(memtype == MemType::GPU) 
        {
            // The pointer is handed to realloc/dealloc by reference, so it must
            // never hold an indeterminate value.
            NumType* cpu_holder = nullptr;
            size_t cpu_holder_size = 0;
            const int size = end - start;

            memproc::realloc<MemType::CPU>((void *&)(cpu_holder), cpu_holder_size, size * sizeof(NumType));
            memproc::memcopy<MemType::CPU, MemType::GPU>(cpu_holder, vec + start, cpu_holder_size);

            // cpu_holder contains only the slice, i.e. cpu_holder[0] mirrors
            // vec[start]; it has to be indexed from 0, not from `start`.
            for (int i = 0; i < size; i++)
                accumulate_res += cpu_holder[i];  

            memproc::dealloc<MemType::CPU>((void *&)(cpu_holder));

            return accumulate_res;   
        }
    #endif

        for (int i = start; i < end; i++)
            accumulate_res += vec[i];  
        return accumulate_res;   
    }

    // template <typename NumType, MemType memtype> 
    // NumType mul_acum_vec(const NumType* vec1, const NumType* vec2, const int size)
    // {
    //     NumType mul_acum_vec_res = 0.0;

    // #ifdef INCLUDE_CUDA
    //     if(memtype == MemType::GPU) 
    //     {
    //         NumType* res_holder;
    //         size_t res_holder_size = 0;

    //         memproc::realloc<MemType::GPU>((void *&)(res_holder), res_holder_size, size * sizeof(NumType));
    //         icethermo_gpu::mul_vec(vec1, vec2, size, res_holder);
    //         mul_acum_vec_res = sum_vec<NumType memtype>(res_holder, size);
    //         memproc::dealloc<MemType::GPU>((void *&)(res_holder));

    //         return mul_acum_vec_res;
    //     }
    // #endif

    //     for (int i = 0; i < size; i++)
    //         mul_acum_vec_res += vec1[i] * vec2[i];  
    //     return mul_acum_vec_res;   

    // }

    /**
     * @brief Multiplies the first `size` elements of `vec` by `num`, in place.
     *
     * With INCLUDE_CUDA defined, MemType::GPU instantiations forward the whole
     * call to icethermo_gpu::mul_vec; all other cases run the loop below.
     *
     * @tparam NumType  element type of the vector and of the factor
     * @tparam memtype  memory type tag describing where `vec` resides
     * @param  vec      pointer to the elements to scale
     * @param  num      factor applied to every element
     * @param  size     number of elements to scale
     */
    template <typename NumType, MemType memtype> 
    void mul_vec(NumType* vec, const NumType num, const int size)
    {
    #ifdef INCLUDE_CUDA
        if(MemType::GPU == memtype) 
        {
            icethermo_gpu::mul_vec(vec, num, size);
            return;
        }
    #endif

        int idx = 0;
        while (idx < size)
        {
            vec[idx] *= num;
            ++idx;
        }
    }


    // Explicit instantiations: the templates above are defined in this source
    // file, so each supported <NumType, MemType> combination is emitted here.

    // CPU variants
    template float sum_vec<float, MemType::CPU>(const float*, const int);
    template double sum_vec<double, MemType::CPU>(const double*, const int);

    template float sum_vec<float, MemType::CPU>(const float*, const int, const int);
    template double sum_vec<double, MemType::CPU>(const double*, const int, const int);

    template void mul_vec<float, MemType::CPU>(float*, const float, const int);
    template void mul_vec<double, MemType::CPU>(double*, const double, const int);

#ifdef INCLUDE_CUDA
    // GPU variants, only emitted for CUDA-enabled builds
    template float sum_vec<float, MemType::GPU>(const float*, const int);
    template double sum_vec<double, MemType::GPU>(const double*, const int);

    template float sum_vec<float, MemType::GPU>(const float*, const int, const int);
    template double sum_vec<double, MemType::GPU>(const double*, const int, const int);

    template void mul_vec<float, MemType::GPU>(float*, const float, const int);
    template void mul_vec<double, MemType::GPU>(double*, const double, const int);
#endif
}