variability change to gpu code

palumbom · Nov 15, 2024 · 287fec6 · 287fec6
1 parent 5ebc758
commit 287fec6
Show file tree

Hide file tree

Showing 4 changed files with 112 additions and 55 deletions.
diff --git a/src/GRASS.jl b/src/GRASS.jl
@@ -36,7 +36,12 @@ import Polynomials: fit as pfit, coeffs
 import Base: AbstractArray as AA
 import Base: AbstractFloat as AF
 
+# configure directories
+include("config.jl")
+
+# get kernels for SPICE stuff
 include("get_kernels.jl")
+
 #set required body paramters as global variables 
 #E,S,M radii (units:km)
 earth_radius = bodvrd("EARTH", "RADII")[1]	
@@ -45,12 +50,12 @@ sun_radius = bodvrd("SUN","RADII")[1]
 moon_radius = bodvrd("MOON", "RADII")[1] 
 
 #collect LD info as global variables - (units: nm)
-quad_ld_coeff_SSD = CSV.read("data/quad_ld_coeff_SSD.csv", DataFrame)
-quad_ld_coeff_300 = CSV.read("data/quad_ld_coeff_300.csv", DataFrame)
-quad_ld_coeff_HD = CSV.read("data/quad_ld_coeff_HD.csv", DataFrame)
-quad_ld_coeff_NL94 = CSV.read("data/quad_ld_coeff_NL94.csv", DataFrame)
+quad_ld_coeff_SSD = CSV.read(joinpath(datdir, "quad_ld_coeff_SSD.csv"), DataFrame)
+quad_ld_coeff_300 = CSV.read(joinpath(datdir, "quad_ld_coeff_300.csv"), DataFrame)
+quad_ld_coeff_HD = CSV.read(joinpath(datdir, "quad_ld_coeff_HD.csv"), DataFrame)
+quad_ld_coeff_NL94 = CSV.read(joinpath(datdir, "quad_ld_coeff_NL94.csv"), DataFrame)
 
-NL94_coeff = CSV.read("data/NL94_coeff.csv", DataFrame)
+NL94_coeff = CSV.read(joinpath(datdir, "NL94_coeff.csv"), DataFrame)
 lambda_nm = NL94_coeff.lambda_nm
 a0 = NL94_coeff.a0
 a1 = NL94_coeff.a1
@@ -59,10 +64,7 @@ a3 = NL94_coeff.a3
 a4 = NL94_coeff.a4
 a5 = NL94_coeff.a5
 
-extinction_coeff = DataFrame(CSV.File("data/NEID_three_extinction.csv"))
-
-# configure directories
-include("config.jl")
+extinction_coeff = DataFrame(CSV.File(joinpath(datdir, "NEID_three_extinction.csv")))
 
 # ancillary functions + constants
 include("utils.jl")

diff --git a/src/get_kernels.jl b/src/get_kernels.jl
@@ -7,35 +7,28 @@ const BPC = "https://naif.jpl.nasa.gov/pub/naif/generic_kernels/pck/moon_pa_de44
 const EARTH_BPC = "https://naif.jpl.nasa.gov/pub/naif/generic_kernels/pck/earth_latest_high_prec.bpc"
 const EARTH_default = "https://naif.jpl.nasa.gov/pub/naif/generic_kernels/fk/planets/earth_assoc_itrf93.tf"
 const TPC = "https://naif.jpl.nasa.gov/pub/naif/generic_kernels/pck/pck00010.tpc"
-const moddir = abspath(joinpath(@__DIR__, ".."))
-const datdir = joinpath(moddir, "data/")
-
-if !isdir(datdir)
-    mkdir(datdir)
-end
 
 # function to download and load kernels
 function get_kernels()
     # Download kernels
-    if !isfile(joinpath(datdir, "de440.bsp")); download(SPK, datdir * "de440.bsp"); end
-    if !isfile(datdir * "naif0012.tls"); download(LSK, datdir * "naif0012.tls"); end
-    if !isfile(datdir * "pck00010.tpc"); download(TPC, datdir * "pck00010.tpc"); end
-    if !isfile(datdir * "jup365.bsp"); download(SPK_JUP, datdir * "jup365.bsp"); end
-    if !isfile(datdir * "moon_pa_de440_200625.bpc"); download(BPC, datdir * "moon_pa_de440_200625.bpc"); end
-    if !isfile(datdir * "earth_latest_high_prec.bpc"); download(EARTH_BPC, datdir * "earth_latest_high_prec.bpc"); end
-    if !isfile(datdir * "earth_assoc_itrf93.tf"); download(EARTH_default, datdir * "earth_assoc_itrf93.tf"); end
-
+    if !isfile(joinpath(datdir, "de440.bsp")); download(SPK, joinpath(datdir, "de440.bsp")); end
+    if !isfile(joinpath(datdir, "naif0012.tls")); download(LSK, joinpath(datdir, "naif0012.tls")); end
+    if !isfile(joinpath(datdir, "pck00010.tpc")); download(TPC, joinpath(datdir, "pck00010.tpc")); end
+    if !isfile(joinpath(datdir, "jup365.bsp")); download(SPK_JUP, joinpath(datdir, "jup365.bsp")); end
+    if !isfile(joinpath(datdir, "moon_pa_de440_200625.bpc")); download(BPC, joinpath(datdir, "moon_pa_de440_200625.bpc")); end
+    if !isfile(joinpath(datdir, "earth_latest_high_prec.bpc")); download(EARTH_BPC, joinpath(datdir, "earth_latest_high_prec.bpc")); end
+    if !isfile(joinpath(datdir, "earth_assoc_itrf93.tf")); download(EARTH_default, joinpath(datdir, "earth_assoc_itrf93.tf")); end
 
     # load into memory
-    furnsh(datdir * "naif0012.tls")
-    furnsh(datdir * "de440.bsp")
-    furnsh(datdir * "moon_pa_de440_200625.bpc")
-    furnsh(datdir * "pck00010.tpc")
-    furnsh(datdir * "earth_latest_high_prec.bpc")
-    furnsh(datdir * "earth_assoc_itrf93.tf")
-    furnsh(datdir * "jup365.bsp")
-    # furnsh(datdir * "eclipse_SPK.bsp")
-    furnsh(datdir * "ID_mapping.txt")
+    furnsh(joinpath(datdir, "naif0012.tls"))
+    furnsh(joinpath(datdir, "de440.bsp"))
+    furnsh(joinpath(datdir, "moon_pa_de440_200625.bpc"))
+    furnsh(joinpath(datdir, "pck00010.tpc"))
+    furnsh(joinpath(datdir, "earth_latest_high_prec.bpc"))
+    furnsh(joinpath(datdir, "earth_assoc_itrf93.tf"))
+    furnsh(joinpath(datdir, "jup365.bsp"))
+    # furnsh(joinpath(datdir, "eclipse_SPK.bsp"))
+    furnsh(joinpath(datdir, "ID_mapping.txt"))
 
     return nothing
 end

diff --git a/src/gpu/gpu_sim.jl b/src/gpu/gpu_sim.jl
@@ -51,13 +51,27 @@ function disk_sim_gpu(spec::SpecParams{T1}, disk::DiskParams{T1}, soldata::GPUSo
     threads5 = 1024
     blocks5 = cld(CUDA.length(prof), prod(threads5))
 
-    # allocate arrays for fresh copy of input data to copy to each loop
+    # allocate destinations for interpolations
     @cusync begin
-        bisall_gpu_loop = CUDA.zeros(T2, CUDA.size(bisall_gpu))
-        intall_gpu_loop = CUDA.zeros(T2, CUDA.size(intall_gpu))
-        widall_gpu_loop = CUDA.zeros(T2, CUDA.size(widall_gpu))
+        bisall_gpu_loop = CUDA.copy(bisall_gpu)
+        intall_gpu_loop = CUDA.copy(intall_gpu)
+        widall_gpu_loop = CUDA.copy(widall_gpu)
     end
 
+    # allocate memory for means
+    @cusync begin
+        bisall_mean = CUDA.zeros(CUDA.eltype(bisall_gpu_loop), 100, CUDA.size(bisall_gpu_loop, 3))
+        intall_mean = CUDA.zeros(CUDA.eltype(intall_gpu_loop), 100, CUDA.size(intall_gpu_loop, 3))
+        widall_mean = CUDA.zeros(CUDA.eltype(widall_gpu_loop), 100, CUDA.size(widall_gpu_loop, 3))
+    end
+
+    threads6 = (4, 16)
+    blocks6 = cld(length(lenall_gpu) * 100, prod(threads6))
+
+    @cusync @cuda threads=threads6 blocks=blocks6 time_average_bis!(lenall_gpu, bisall_mean, intall_mean, 
+                                                                    widall_mean, bisall_gpu, intall_gpu, 
+                                                                    widall_gpu)
+
     # get weighted disk average cbs
     @cusync sum_wts = CUDA.sum(wts)
     @cusync z_cbs_avg = CUDA.sum(z_cbs .* wts) / sum_wts
@@ -75,19 +89,14 @@ function disk_sim_gpu(spec::SpecParams{T1}, disk::DiskParams{T1}, soldata::GPUSo
 
         # loop over lines to synthesize
         for l in eachindex(spec.lines)
-            # get a fresh copy of the untrimmed bisector + width data
-            @cusync begin
-                CUDA.copyto!(bisall_gpu_loop, bisall_gpu)
-                CUDA.copyto!(intall_gpu_loop, intall_gpu)
-                CUDA.copyto!(widall_gpu_loop, widall_gpu)
-            end
-
             # trim all the bisector data
             @cusync @cuda threads=threads2 blocks=blocks2 trim_bisector_gpu!(spec.depths[l], spec.variability[l],
                                                                              depcontrast_gpu, lenall_gpu,
                                                                              bisall_gpu_loop, intall_gpu_loop,
                                                                              widall_gpu_loop, bisall_gpu,
-                                                                             intall_gpu, widall_gpu)
+                                                                             intall_gpu, widall_gpu, 
+                                                                             bisall_mean, intall_mean, 
+                                                                             widall_mean)
 
             # assemble line shape on even int grid
             @cusync @cuda threads=threads3 blocks=blocks3 fill_workspaces!(spec.lines[l], spec.variability[l],

diff --git a/src/gpu/gpu_trim.jl b/src/gpu/gpu_trim.jl
@@ -1,5 +1,6 @@
 function trim_bisector_gpu!(depth, variability, depcontrast, lenall, bisall_out,
-                            intall_out, widall_out, bisall_in, intall_in, widall_in)
+                            intall_out, widall_out, bisall_in, intall_in, widall_in,
+                            bisall_mean, intall_mean, widall_mean)
     # get indices from GPU blocks + threads
     idx = threadIdx().x + blockDim().x * (blockIdx().x-1)
     sdx = blockDim().x * gridDim().x
@@ -20,18 +21,19 @@ function trim_bisector_gpu!(depth, variability, depcontrast, lenall, bisall_out,
                 continue
             end
 
-            # get views of the correct time slice in input
-            bist_in = CUDA.view(bisall_in, :, j, i)
-            intt_in = CUDA.view(intall_in, :, j, i)
-
             # get views of the correct time slice in output
             bist_out = CUDA.view(bisall_out, :, j, i)
             intt_out = CUDA.view(intall_out, :, j, i)
-
-            # get step size for loop over length of bisector
-            step = dtrim/(CUDA.length(intt_in) - 1)
+            widt_out = CUDA.view(widall_out, :, j, i)
 
             if variability
+                # get views of the correct time slice in input
+                bist_in = CUDA.view(bisall_in, :, j, i)
+                intt_in = CUDA.view(intall_in, :, j, i)
+
+                # get step size for loop over length of bisector
+                step = dtrim/(CUDA.length(intt_in) - 1)
+
                 # set up interpolator
                 itp = linear_interp_gpu(intt_in, bist_in)
 
@@ -44,13 +46,64 @@ function trim_bisector_gpu!(depth, variability, depcontrast, lenall, bisall_out,
                     @inbounds intt_out[k] = new_intt
                 end
             else
+                # get views of the correct time slice in input
+                bist_in = CUDA.view(bisall_mean, :, i)
+                intt_in = CUDA.view(intall_mean, :, i)
+                widt_in = CUDA.view(widall_mean, :, i)
+
+                # get step size for loop over length of bisector
+                step = dtrim/(CUDA.length(intt_in) - 1)
+
+                # set up interpolator
+                itp = linear_interp_gpu(intt_in, bist_in)
+
+                # loop over the length of the bisector
                 for k in idz:sdz:CUDA.size(bisall_in, 1)
-                    @inbounds intt_out[k] = (1.0 - dtrim) + (k-1) * step
-                    @inbounds bist_out[k] = 0.0
-                    @inbounds widall_out[k,j,i] = widall_in[k,1,i]
+                    new_intt = (1.0 - dtrim) + (k-1) * step
+                    if (1.0 - dtrim) >= CUDA.first(intt_in)
+                        @inbounds bist_out[k] = itp(new_intt)
+                    end
+                    @inbounds intt_out[k] = new_intt
+                    @inbounds widt_out[k] = widt_in[k]
                 end
             end
         end
     end
     return nothing
 end
+
+function time_average_bis!(lenall, bisall_mean, intall_mean, widall_mean, bisall_in, intall_in, widall_in)
+    # get indices from GPU blocks + threads
+    idx = threadIdx().x + blockDim().x * (blockIdx().x-1)
+    sdx = blockDim().x * gridDim().x
+    idy = threadIdx().y + blockDim().y * (blockIdx().y-1)
+    sdy = blockDim().y * gridDim().y
+    # idz = threadIdx().z + blockDim().z * (blockIdx().z-1)
+    # sdz = blockDim().z * gridDim().z
+
+    # loop over disk positions for bisectors
+    for i in idx:sdx:CUDA.length(lenall)
+
+        # loop over intensity in bisector
+        for k in idy:sdy:CUDA.size(bisall_in, 1)
+
+            # holder for mean
+            bis_sum = CUDA.zero(eltype(bisall_mean))
+            int_sum = CUDA.zero(eltype(intall_mean))
+            wid_sum = CUDA.zero(eltype(widall_mean))
+
+            # loop over epochs of bisectors
+            for j in 1:lenall[i]
+                bis_sum += bisall_in[k, j, i]
+                int_sum += intall_in[k, j, i]
+                wid_sum += widall_in[k, j, i]
+            end
+
+            # take the mean and allocate 
+            @inbounds bisall_mean[k, i] = bis_sum / lenall[i]
+            @inbounds intall_mean[k, i] = int_sum / lenall[i]
+            @inbounds widall_mean[k, i] = wid_sum / lenall[i]
+        end        
+    end
+    return nothing 
+end