From 3aac2c3c2aa31ae741b31bf9c00051f9b4380577 Mon Sep 17 00:00:00 2001
From: Henry Le Berre <hberre3@gatech.edu>
Date: Fri, 23 Aug 2024 09:03:22 -0400
Subject: [PATCH] Two-stage IPO for NVHPC (#581)

Co-authored-by: Cameron <airedaledev@protonmail.com>
---
 .github/workflows/bench.yml               |   4 +-
 CMakeLists.txt                            | 239 ++++++++++++----------
 src/common/include/inline_conversions.fpp |  57 ------
 src/common/m_helper_basic.f90             |   1 +
 src/common/m_variables_conversion.fpp     |  61 +++++-
 src/post_process/m_derived_variables.fpp  |   4 -
 src/pre_process/m_data_output.fpp         |   2 -
 src/simulation/m_cbc.fpp                  |   3 -
 src/simulation/m_data_output.fpp          |   3 -
 src/simulation/m_riemann_solvers.fpp      |   5 +-
 toolchain/modules                         |   1 +
 11 files changed, 195 insertions(+), 185 deletions(-)
 delete mode 100644 src/common/include/inline_conversions.fpp

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 334d6e3eb..56ed3c009 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -52,8 +52,8 @@ jobs:
 
       - name: Generate & Post Comment
         run: |
-          . ./mfc.sh load -c p -m g
-          ./mfc.sh bench_diff master/bench-${{ matrix.device }}.yaml pr/bench-${{ matrix.device }}.yaml
+          cd pr && . ./mfc.sh load -c p -m g  # source in the step shell (not a subshell) so the loaded environment persists
+          ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}.yaml ../pr/bench-${{ matrix.device }}.yaml
 
       - name: Archive Logs
         uses: actions/upload-artifact@v3
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5b72a285d..409fdad20 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -172,16 +172,13 @@ elseif ((CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC") OR (CMAKE_Fortran_COMPILER_
     add_compile_options(
         $<$<COMPILE_LANGUAGE:Fortran>:-Mfreeform>
         $<$<COMPILE_LANGUAGE:Fortran>:-cpp>
-        -Minfo=accel
+        $<$<COMPILE_LANGUAGE:Fortran>:-Minfo=inline>
+        $<$<COMPILE_LANGUAGE:Fortran>:-Minfo=accel>
     )
 
-    if (CMAKE_BUILD_TYPE STREQUAL "Release")
-        add_compile_options(
-            $<$<COMPILE_LANGUAGE:Fortran:-minline>
-        )
-    elseif (CMAKE_BUILD_TYPE STREQUAL "Debug")
+    if (CMAKE_BUILD_TYPE STREQUAL "Debug")
         add_compile_options(
-            $<$<COMPILE_LANGUAGE:Fortran:-O0>
+            $<$<COMPILE_LANGUAGE:Fortran>:-O0>
         )
     endif()
 
@@ -208,13 +205,22 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
     endif()
 
     # Enable LTO/IPO if supported
-    CHECK_IPO_SUPPORTED(RESULT SUPPORTS_IPO OUTPUT IPO_ERROR)
-    if (SUPPORTS_IPO)
-        message(STATUS "Enabled IPO / LTO")
-        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
+    if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC")
+        if (MFC_Unified)
+            message(STATUS "IPO is not available with NVHPC using Unified Memory")
+        else()
+            message(STATUS "Performing IPO using -Mextract followed by -Minline")
+            set(NVHPC_USE_TWO_PASS_IPO TRUE)
+        endif()
     else()
-        message(STATUS "IPO / LTO is NOT available")
-    endif()
+        CHECK_IPO_SUPPORTED(RESULT SUPPORTS_IPO OUTPUT IPO_ERROR)
+        if (SUPPORTS_IPO)
+            message(STATUS "Enabled IPO / LTO")
+            set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
+        else()
+            message(STATUS "IPO / LTO is NOT available")
+        endif()
+    endif()
 endif()
 
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -365,124 +371,139 @@ function(MFC_SETUP_TARGET)
     cmake_parse_arguments(ARGS "OpenACC;MPI;SILO;HDF5;FFTW" "TARGET" "SOURCES" ${ARGN})
 
     add_executable(${ARGS_TARGET} ${ARGS_SOURCES})
-
-    set_target_properties(${ARGS_TARGET} PROPERTIES Fortran_PREPROCESS ON)
-
-    target_include_directories(${ARGS_TARGET} PRIVATE
-        "${CMAKE_SOURCE_DIR}/src/common"
-        "${CMAKE_SOURCE_DIR}/src/common/include"
-        "${CMAKE_SOURCE_DIR}/src/${ARGS_TARGET}")
-
-    if (EXISTS "${CMAKE_SOURCE_DIR}/src/${ARGS_TARGET}/include")
-        target_include_directories(${ARGS_TARGET} PRIVATE
-            "${CMAKE_SOURCE_DIR}/src/${ARGS_TARGET}/include")
+    set(IPO_TARGETS ${ARGS_TARGET})
+    # Two-pass IPO (NVHPC only): the sources are first compiled as an OBJECT library with -Mextract,
+    # then compiled again for the executable with -Minline=lib: pointing at that extract library.
+    if (NVHPC_USE_TWO_PASS_IPO)
+        add_library(${ARGS_TARGET}_lib OBJECT ${ARGS_SOURCES})
+        target_compile_options(${ARGS_TARGET}_lib PRIVATE
+            $<$<COMPILE_LANGUAGE:Fortran>:-Mextract=lib:${ARGS_TARGET}_lib>
+            $<$<COMPILE_LANGUAGE:Fortran>:-Minline>
+        )
+        add_dependencies(${ARGS_TARGET} ${ARGS_TARGET}_lib)
+        target_compile_options(${ARGS_TARGET} PRIVATE $<$<COMPILE_LANGUAGE:Fortran>:-Minline=lib:${ARGS_TARGET}_lib>)
+        list(PREPEND IPO_TARGETS ${ARGS_TARGET}_lib)
     endif()
 
-    string(TOUPPER "${ARGS_TARGET}" ${ARGS_TARGET}_UPPER)
-    target_compile_definitions(
-        ${ARGS_TARGET} PRIVATE MFC_${CMAKE_Fortran_COMPILER_ID}
-                               MFC_${${ARGS_TARGET}_UPPER}
-    )
+    foreach (a_target ${IPO_TARGETS})
+        set_target_properties(${a_target} PROPERTIES Fortran_PREPROCESS ON)
 
-    if (MFC_MPI AND ARGS_MPI)
-        find_package(MPI COMPONENTS Fortran REQUIRED)
+        target_include_directories(${a_target} PRIVATE
+            "${CMAKE_SOURCE_DIR}/src/common"
+            "${CMAKE_SOURCE_DIR}/src/common/include"
+            "${CMAKE_SOURCE_DIR}/src/${ARGS_TARGET}")
 
-        target_compile_definitions(${ARGS_TARGET} PRIVATE MFC_MPI)
-        target_link_libraries     (${ARGS_TARGET} PRIVATE MPI::MPI_Fortran)
-    endif()
+        if (EXISTS "${CMAKE_SOURCE_DIR}/src/${ARGS_TARGET}/include")
+            target_include_directories(${a_target} PRIVATE
+                "${CMAKE_SOURCE_DIR}/src/${ARGS_TARGET}/include")
+        endif()
 
-    if (ARGS_SILO)
-        find_package(SILO REQUIRED)
-        target_link_libraries(${ARGS_TARGET} PRIVATE SILO::SILO)
-    endif()
+        string(TOUPPER "${ARGS_TARGET}" ${ARGS_TARGET}_UPPER)
+        target_compile_definitions(
+            ${a_target} PRIVATE MFC_${CMAKE_Fortran_COMPILER_ID}
+                                MFC_${${ARGS_TARGET}_UPPER}
+        )
 
-    if (ARGS_HDF5)
-        find_package(HDF5 REQUIRED)
-        target_link_libraries(${ARGS_TARGET} PRIVATE HDF5::HDF5)
-    endif()
+        if (MFC_MPI AND ARGS_MPI)
+            find_package(MPI COMPONENTS Fortran REQUIRED)
 
-    if (ARGS_FFTW)
-        if (MFC_OpenACC AND ARGS_OpenACC)
-            if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
-                find_package(CUDAToolkit REQUIRED)
-                target_link_libraries(${ARGS_TARGET} PRIVATE CUDA::cudart CUDA::cufft)
-            else()
-                find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED)
-                target_link_libraries(${ARGS_TARGET} PRIVATE hipfort::hipfft)
-            endif()
-        else()
-            find_package(FFTW REQUIRED)
-            target_link_libraries(${ARGS_TARGET} PRIVATE FFTW::FFTW)
+            target_compile_definitions(${a_target} PRIVATE MFC_MPI)
+            target_link_libraries     (${a_target} PRIVATE MPI::MPI_Fortran)
         endif()
-    endif()
 
-    if (MFC_OpenACC AND ARGS_OpenACC)
-        find_package(OpenACC)
+        if (ARGS_SILO)
+            find_package(SILO REQUIRED)
+            target_link_libraries(${a_target} PRIVATE SILO::SILO)
+        endif()
 
-        # This should be equivalent to if (NOT OpenACC_FC_FOUND)
-        if (NOT TARGET OpenACC::OpenACC_Fortran)
-            message(FATAL_ERROR "OpenACC + Fortran is unsupported.")
+        if (ARGS_HDF5)
+            find_package(HDF5 REQUIRED)
+            target_link_libraries(${a_target} PRIVATE HDF5::HDF5)
         endif()
 
-        target_link_libraries(${ARGS_TARGET} PRIVATE OpenACC::OpenACC_Fortran)
-        target_compile_definitions(${ARGS_TARGET} PRIVATE MFC_OpenACC)
-
-        if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
-            # FIXME: This should work with other cards than gfx90a ones.
-            target_compile_options(${ARGS_TARGET} PRIVATE
-                "-foffload=amdgcn-amdhsa='-march=gfx90a'"
-                "-foffload-options=-lgfortran\ -lm"
-                "-fno-exceptions")
-        elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
-            find_package(cuTENSOR)
-            if (NOT cuTENSOR_FOUND)
-                message(WARNING
-                    "Failed to locate the NVIDIA cuTENSOR library. MFC will be "
-                    "built without support for it, disallowing the use of "
-                    "cu_tensor=T. This can result in degraded performance.")
+        if (ARGS_FFTW)
+            if (MFC_OpenACC AND ARGS_OpenACC)
+                if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
+                    find_package(CUDAToolkit REQUIRED)
+                    target_link_libraries(${a_target} PRIVATE CUDA::cudart CUDA::cufft)
+                else()
+                    find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED)
+                    target_link_libraries(${a_target} PRIVATE hipfort::hipfft)
+                endif()
             else()
-                target_link_libraries     (${ARGS_TARGET} PRIVATE cuTENSOR::cuTENSOR)
-                target_compile_definitions(${ARGS_TARGET} PRIVATE MFC_cuTENSOR)
+                find_package(FFTW REQUIRED)
+                target_link_libraries(${a_target} PRIVATE FFTW::FFTW)
             endif()
+        endif()
 
-            foreach (cc ${MFC_CUDA_CC})
-                target_compile_options(${ARGS_TARGET}
-                    PRIVATE -gpu=cc${cc}
-                )
-            endforeach()
-
-            target_compile_options(${ARGS_TARGET}
-                PRIVATE -gpu=keep,ptxinfo,lineinfo
-            )
+        if (MFC_OpenACC AND ARGS_OpenACC)
+            find_package(OpenACC)
 
-            # GH-200 Unified Memory Support
-            if (MFC_Unified)
-                target_compile_options(${ARGS_TARGET}
-                    PRIVATE -gpu=unified
-                )
-                # "This option must appear in both the compile and link lines" -- NVHPC Docs
-                target_link_options(${ARGS_TARGET}
-                    PRIVATE -gpu=unified
-                )
+            # This should be equivalent to if (NOT OpenACC_FC_FOUND)
+            if (NOT TARGET OpenACC::OpenACC_Fortran)
+                message(FATAL_ERROR "OpenACC + Fortran is unsupported.")
             endif()
 
-            if (CMAKE_BUILD_TYPE STREQUAL "Debug")
-                target_compile_options(${ARGS_TARGET}
-                    PRIVATE -gpu=autocompare,debug
+            target_link_libraries(${a_target} PRIVATE OpenACC::OpenACC_Fortran)
+            target_compile_definitions(${a_target} PRIVATE MFC_OpenACC)
+
+            if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
+                # FIXME: This should work with other cards than gfx90a ones.
+                target_compile_options(${a_target} PRIVATE
+                    "-foffload=amdgcn-amdhsa='-march=gfx90a'"
+                    "-foffload-options=-lgfortran\ -lm"
+                    "-fno-exceptions")
+            elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
+                find_package(cuTENSOR)
+                if (NOT cuTENSOR_FOUND)
+                    message(WARNING
+                        "Failed to locate the NVIDIA cuTENSOR library. MFC will be "
+                        "built without support for it, disallowing the use of "
+                        "cu_tensor=T. This can result in degraded performance.")
+                else()
+                    target_link_libraries     (${a_target} PRIVATE cuTENSOR::cuTENSOR)
+                    target_compile_definitions(${a_target} PRIVATE MFC_cuTENSOR)
+                endif()
+
+                foreach (cc ${MFC_CUDA_CC})
+                    target_compile_options(${a_target}
+                        PRIVATE -gpu=cc${cc}
+                    )
+                endforeach()
+
+                target_compile_options(${a_target}
+                    PRIVATE -gpu=keep,ptxinfo,lineinfo
                 )
+
+                # GH-200 Unified Memory Support (applied to the target currently being configured by the loop)
+                if (MFC_Unified)
+                    target_compile_options(${a_target}
+                        PRIVATE -gpu=unified
+                    )
+                    # "This option must appear in both the compile and link lines" -- NVHPC Docs
+                    target_link_options(${a_target}
+                        PRIVATE -gpu=unified
+                    )
+                endif()
+
+                if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+                    target_compile_options(${a_target}
+                        PRIVATE -gpu=autocompare,debug
+                    )
+                endif()
+            elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
+                find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
+                target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn)
             endif()
-        elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
-            find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
-            target_link_libraries(${ARGS_TARGET} PRIVATE hipfort::hip hipfort::hipfort-amdgcn)
+        elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
+            target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
         endif()
-    elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
-        target_compile_options(${ARGS_TARGET} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
-    endif()
 
-    if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
-        find_package(CUDAToolkit REQUIRED)
-        target_link_libraries(${ARGS_TARGET} PRIVATE CUDA::nvToolsExt)
-    endif()
+        if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
+            find_package(CUDAToolkit REQUIRED)
+            target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
+        endif()
+    endforeach()
 
     install(TARGETS ${ARGS_TARGET} RUNTIME DESTINATION bin)
 endfunction()
diff --git a/src/common/include/inline_conversions.fpp b/src/common/include/inline_conversions.fpp
deleted file mode 100644
index a63af61e1..000000000
--- a/src/common/include/inline_conversions.fpp
+++ /dev/null
@@ -1,57 +0,0 @@
-#:def s_compute_speed_of_sound()
-    subroutine s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, adv, vel_sum, c)
-#ifdef CRAY_ACC_WAR
-        !DIR$ INLINEALWAYS s_compute_speed_of_sound
-#else
-        !$acc routine seq
-#endif
-        real(kind(0d0)), intent(in) :: pres
-        real(kind(0d0)), intent(in) :: rho, gamma, pi_inf
-        real(kind(0d0)), intent(in) :: H
-        real(kind(0d0)), dimension(num_fluids), intent(in) :: adv
-        real(kind(0d0)), intent(in) :: vel_sum
-        real(kind(0d0)), intent(out) :: c
-
-        real(kind(0d0)) :: blkmod1, blkmod2
-
-        integer :: q
-
-        if (alt_soundspeed) then
-            blkmod1 = ((gammas(1) + 1d0)*pres + &
-                       pi_infs(1))/gammas(1)
-            blkmod2 = ((gammas(2) + 1d0)*pres + &
-                       pi_infs(2))/gammas(2)
-            c = (1d0/(rho*(adv(1)/blkmod1 + adv(2)/blkmod2)))
-        elseif (model_eqns == 3) then
-            c = 0d0
-            !$acc loop seq
-            do q = 1, num_fluids
-                c = c + adv(q)*(1d0/gammas(q) + 1d0)* &
-                    (pres + pi_infs(q)/(gammas(q) + 1d0))
-            end do
-            c = c/rho
-
-        elseif (((model_eqns == 4) .or. (model_eqns == 2 .and. bubbles))) then
-            ! Sound speed for bubble mmixture to order O(\alpha)
-
-            if (mpp_lim .and. (num_fluids > 1)) then
-                c = (1d0/gamma + 1d0)* &
-                    (pres + pi_inf/(gamma + 1d0))/rho
-            else
-                c = &
-                    (1d0/gamma + 1d0)* &
-                    (pres + pi_inf/(gamma + 1d0))/ &
-                    (rho*(1d0 - adv(num_fluids)))
-            end if
-        else
-            c = ((H - 5d-1*vel_sum)/gamma)
-        end if
-
-        if (mixture_err .and. c < 0d0) then
-            c = 100.d0*sgm_eps
-        else
-            c = sqrt(c)
-        end if
-    end subroutine s_compute_speed_of_sound
-#:enddef
-
diff --git a/src/common/m_helper_basic.f90 b/src/common/m_helper_basic.f90
index 9f4c6523b..0611ff86f 100644
--- a/src/common/m_helper_basic.f90
+++ b/src/common/m_helper_basic.f90
@@ -25,6 +25,7 @@ module m_helper_basic
     !! @param tol_input Relative error (default = 1d-6).
     !! @return Result of the comparison.
     logical function f_approx_equal(a, b, tol_input) result(res)
+        !$acc routine seq
         ! Reference: https://floating-point-gui.de/errors/comparison/
 
         real(kind(0d0)), intent(in) :: a, b
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 46436cb8b..180abefd6 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_variables_conversion
 
 #:include 'macros.fpp'
-#:include 'inline_conversions.fpp'
 #:include 'case.fpp'
 
 !> @brief This module consists of subroutines used in the conversion of the
@@ -40,6 +39,9 @@ module m_variables_conversion
               s_convert_primitive_to_conservative_variables, &
               s_convert_primitive_to_flux_variables, &
               s_compute_pressure, &
+#ifndef MFC_PRE_PROCESS
+              s_compute_speed_of_sound, &
+#endif
               s_finalize_variables_conversion_module
 
     !> Abstract interface to two subroutines designed for the transfer/conversion
@@ -1339,4 +1341,61 @@ contains
 
     end subroutine s_finalize_variables_conversion_module
 
+#ifndef MFC_PRE_PROCESS
+    subroutine s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, adv, vel_sum, c)
+#ifdef CRAY_ACC_WAR
+        !DIR$ INLINEALWAYS s_compute_speed_of_sound
+#else
+        !$acc routine seq
+#endif
+        real(kind(0d0)), intent(in) :: pres
+        real(kind(0d0)), intent(in) :: rho, gamma, pi_inf
+        real(kind(0d0)), intent(in) :: H
+        real(kind(0d0)), dimension(num_fluids), intent(in) :: adv
+        real(kind(0d0)), intent(in) :: vel_sum
+        real(kind(0d0)), intent(out) :: c
+
+        real(kind(0d0)) :: blkmod1, blkmod2
+
+        integer :: q
+
+        if (alt_soundspeed) then
+            blkmod1 = ((gammas(1) + 1d0)*pres + &
+                       pi_infs(1))/gammas(1)
+            blkmod2 = ((gammas(2) + 1d0)*pres + &
+                       pi_infs(2))/gammas(2)
+            c = (1d0/(rho*(adv(1)/blkmod1 + adv(2)/blkmod2)))
+        elseif (model_eqns == 3) then
+            c = 0d0
+            !$acc loop seq
+            do q = 1, num_fluids
+                c = c + adv(q)*(1d0/gammas(q) + 1d0)* &
+                    (pres + pi_infs(q)/(gammas(q) + 1d0))
+            end do
+            c = c/rho
+
+        elseif (((model_eqns == 4) .or. (model_eqns == 2 .and. bubbles))) then
+            ! Sound speed for bubble mixture to order O(\alpha)
+
+            if (mpp_lim .and. (num_fluids > 1)) then
+                c = (1d0/gamma + 1d0)* &
+                    (pres + pi_inf/(gamma + 1d0))/rho
+            else
+                c = &
+                    (1d0/gamma + 1d0)* &
+                    (pres + pi_inf/(gamma + 1d0))/ &
+                    (rho*(1d0 - adv(num_fluids)))
+            end if
+        else
+            c = ((H - 5d-1*vel_sum)/gamma)
+        end if
+
+        if (mixture_err .and. c < 0d0) then
+            c = 100.d0*sgm_eps
+        else
+            c = sqrt(c)
+        end if
+    end subroutine s_compute_speed_of_sound
+#endif
+
 end module m_variables_conversion
diff --git a/src/post_process/m_derived_variables.fpp b/src/post_process/m_derived_variables.fpp
index ba6afbdcb..e08973bd2 100644
--- a/src/post_process/m_derived_variables.fpp
+++ b/src/post_process/m_derived_variables.fpp
@@ -8,8 +8,6 @@
 !!      volume fraction, specific heat ratio, liquid stiffness, speed of
 !!      sound, vorticity and the numerical Schlieren function.
 
-#:include 'inline_conversions.fpp'
-
 module m_derived_variables
 
     ! Dependencies =============================================================
@@ -561,8 +559,6 @@ contains
 
     end subroutine s_derive_qm
 
-    @:s_compute_speed_of_sound()
-
     !>  This subroutine gets as inputs the conservative variables
         !!      and density. From those inputs, it proceeds to calculate
         !!      the values of the numerical Schlieren function, which are
diff --git a/src/pre_process/m_data_output.fpp b/src/pre_process/m_data_output.fpp
index 94d5d8fb6..b650bc98e 100644
--- a/src/pre_process/m_data_output.fpp
+++ b/src/pre_process/m_data_output.fpp
@@ -2,8 +2,6 @@
 !! @file m_data_output.f90
 !! @brief Contains module m_data_output
 
-#:include 'inline_conversions.fpp'
-
 !> @brief This module takes care of writing the grid and initial condition
 !!              data files into the "0" time-step directory located in the folder
 !!              associated with the rank of the local processor, which is a sub-
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 793d7555d..5b95b6d92 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -19,7 +19,6 @@
 !!              Please refer to Thompson (1987, 1990) for detailed descriptions.
 
 #:include 'macros.fpp'
-#:include 'inline_conversions.fpp'
 
 module m_cbc
 
@@ -144,8 +143,6 @@ module m_cbc
 
 contains
 
-    @:s_compute_speed_of_sound()
-
     !>  The computation of parameters, the allocation of memory,
         !!      the association of pointers and/or the execution of any
         !!      other procedures that are necessary to setup the module.
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 47970831d..a28d6b0f3 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -3,7 +3,6 @@
 !! @brief Contains module m_data_output
 
 #:include 'macros.fpp'
-#:include 'inline_conversions.fpp'
 
 !> @brief The primary purpose of this module is to output the grid and the
 !!              conservative variables data at the chosen time-step interval. In
@@ -101,8 +100,6 @@ module m_data_output
 
 contains
 
-    @:s_compute_speed_of_sound()
-
     !>  The purpose of this subroutine is to open a new or pre-
         !!          existing run-time information file and append to it the
         !!      basic header information relevant to current simulation.
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index bfca71428..929476b5f 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -20,7 +20,6 @@
 
 #:include 'macros.fpp'
 #:include 'inline_riemann.fpp'
-#:include 'inline_conversions.fpp'
 
 module m_riemann_solvers
 
@@ -264,9 +263,7 @@ module m_riemann_solvers
 
 contains
 
-    @:s_compute_speed_of_sound()
-
-    subroutine s_hll_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
+    subroutine s_hll_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, & ! -------
                                     dqL_prim_dy_vf, &
                                     dqL_prim_dz_vf, &
                                     qL_prim_vf, &
diff --git a/toolchain/modules b/toolchain/modules
index 15be3b4f7..0dc9576c8 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -42,6 +42,7 @@ e-gpu gpu/0.15.4 cuda/11.0.2 nvhpc/22.2 openmpi/4.0.5 cmake/3.19.8
 e-gpu CC=nvc CXX=nvc++ FC=nvfortran
 
 p     GT Phoenix
+p-all python/3.10.10
 p-cpu gcc/12.3.0 openmpi/4.1.5
 p-gpu nvhpc/24.5 hpcx/2.19-cuda cuda/12.1.1