From cacab2ef14151f1052ed9a3d13c49c80b098d0f2 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Wed, 20 Sep 2023 23:46:20 +0200 Subject: [PATCH 1/8] Add compression options to PUMgen --- src/pumgen.cpp | 456 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 319 insertions(+), 137 deletions(-) diff --git a/src/pumgen.cpp b/src/pumgen.cpp index 57a046a..0fd84e5 100644 --- a/src/pumgen.cpp +++ b/src/pumgen.cpp @@ -10,6 +10,8 @@ * @author Sebastian Rettenberger */ +#include +#include #include #include @@ -18,6 +20,8 @@ #include #include #include +#include +#include #include @@ -26,6 +30,7 @@ // #include #include +#include "third_party/MPITraits.h" #include "utils/args.h" #include "utils/logger.h" @@ -43,13 +48,33 @@ #include "meshreader/ParallelGambitReader.h" #include "third_party/GMSH2Parser.h" -template static void _checkH5Err(TT status, const char* file, int line) { - if (status < 0) +template static TT _checkH5Err(TT&& status, const char* file, int line) { + if (status < 0) { logError() << utils::nospace << "An HDF5 error occurred (" << file << ": " << line << ")"; + } + return std::forward(status); } #define checkH5Err(...) _checkH5Err(__VA_ARGS__, __FILE__, __LINE__) +static int ilog(std::size_t value, int expbase=1) { + int count = 0; + while (value > 0) { + value >>= expbase; + ++count; + } + return count; +} + +template +static void iterate(apf::Mesh* mesh, int dim, F&& function) { + apf::MeshIterator* it = mesh->begin(dim); + while (apf::MeshEntity* element = mesh->iterate(it)) { + std::invoke(function, element); + } + mesh->end(it); +} + int main(int argc, char* argv[]) { int rank = 0; int processes = 1; @@ -68,6 +93,12 @@ int main(int argc, char* argv[]) { args.addOption("dump", 'd', "Dump APF mesh before partitioning it", utils::Args::Required, false); args.addOption("model", 0, "Dump/Load a specific model file", utils::Args::Required, false); args.addOption("vtk", 0, "Dump mesh to VTK files", utils::Args::Required, false); + + const char* filters[] = { "none", "scaleoffset", "deflate1", "deflate2", "deflate3", "deflate4", "deflate5", "deflate6", "deflate7", "deflate8", "deflate9" }; + args.addOption("compactify-datatypes", 0, "Compress index and group data types to minimum byte size (no HDF5 filters)", utils::Args::Required, false); + args.addEnumOption("filter-enable", filters, 0, "Apply HDF5 filters (i.e. compression). Disabled by default.", false); + args.addOption("filter-chunksize", 0, "Chunksize for filters (default=4096).", utils::Args::Required, false); + args.addOption("license", 'l', "License file (only used by SimModSuite)", utils::Args::Required, false); #ifdef PARASOLID @@ -108,13 +139,36 @@ int main(int argc, char* argv[]) { outputFile.append(".puml.h5"); } + bool reduceInts = args.isSet("compactify-datatypes"); + int filterEnable = args.getArgument("filter-enable", 0); + std::size_t filterChunksize = args.getArgument("filter-chunksize", 4096); + bool applyFilters = filterEnable > 0; + if (reduceInts) { + logInfo(rank) << "Using compact integer types."; + } + if (filterEnable == 0) { + logInfo(rank) << "No filtering enabled (contiguous storage)"; + } + else { + logInfo(rank) << "Using filtering. 
Chunksize:" << filterChunksize; + if (filterEnable == 1) { + logInfo(rank) << "Compression: scale-offset compression for integers (disabled for floats)"; + } + else if (filterEnable < 11) { + logInfo(rank) << "Compression: deflate, strength" << filterEnable - 1 << "(note: this requires HDF5 to be compiled with GZIP support; this applies to SeisSol as well)"; + } + } + std::string xdmfFile = outputFile; - if (utils::StringUtils::endsWith(outputFile, ".puml.h5")) + if (utils::StringUtils::endsWith(outputFile, ".puml.h5")) { utils::StringUtils::replaceLast(xdmfFile, ".puml.h5", ".xdmf"); - if (utils::StringUtils::endsWith(outputFile, ".h5")) + } + if (utils::StringUtils::endsWith(outputFile, ".h5")) { utils::StringUtils::replaceLast(xdmfFile, ".h5", ".xdmf"); - else + } + else { xdmfFile.append(".xdmf"); + } // Create/read the mesh MeshInput* meshInput = 0L; @@ -191,11 +245,29 @@ int main(int argc, char* argv[]) { apf::writeVtkFiles(vtkPrefix, mesh); } + apf::Sharing* sharing = apf::getSharing(mesh); + + // oriented at the apf::countOwned method ... But with size_t + auto countOwnedLong = [sharing, mesh](int dim) { + std::size_t counter = 0; + + apf::MeshIterator* it = mesh->begin(dim); + while (apf::MeshEntity* element = mesh->iterate(it)) { + if (sharing->isOwned(element)) { + ++counter; + } + } + mesh->end(it); + + return counter; + }; + + // TODO(David): replace by apf::countOwned again once extended + // Get local/global size - unsigned int localSize[2] = {static_cast(apf::countOwned(mesh, 3)), - static_cast(apf::countOwned(mesh, 0))}; - unsigned long globalSize[2] = {localSize[0], localSize[1]}; - MPI_Allreduce(MPI_IN_PLACE, globalSize, 2, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD); + std::size_t localSize[2] = {countOwnedLong(3), countOwnedLong(0)}; + std::size_t globalSize[2] = {localSize[0], localSize[1]}; + MPI_Allreduce(MPI_IN_PLACE, globalSize, 2, tndm::mpi_type_t(), MPI_SUM, MPI_COMM_WORLD); logInfo(rank) << "Mesh size:" << globalSize[0]; @@ -210,8 +282,8 @@ int main(int argc, char* argv[]) { logInfo(rank) << "Minimum insphere found:" << min; // Get offsets - unsigned long offsets[2] = {localSize[0], localSize[1]}; - MPI_Scan(MPI_IN_PLACE, offsets, 2, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD); + std::size_t offsets[2] = {localSize[0], localSize[1]}; + MPI_Scan(MPI_IN_PLACE, offsets, 2, tndm::mpi_type_t(), MPI_SUM, MPI_COMM_WORLD); offsets[0] -= localSize[0]; offsets[1] -= localSize[1]; @@ -232,113 +304,209 @@ int main(int argc, char* argv[]) { checkH5Err(h5file); checkH5Err(H5Pclose(h5falist)); + hid_t h5dxlist = H5Pcreate(H5P_DATASET_XFER); + checkH5Err(h5dxlist); + checkH5Err(H5Pset_dxpl_mpio(h5dxlist, H5FD_MPIO_COLLECTIVE)); + // Write cells + std::size_t connectSize = 8; logInfo(rank) << "Writing cells"; + { + hsize_t sizes[2] = {globalSize[0], 4}; + hid_t h5space = H5Screate_simple(2, sizes, 0L); + checkH5Err(h5space); - hsize_t sizes[2] = {globalSize[0], 4}; - hid_t h5space = H5Screate_simple(2, sizes, 0L); - checkH5Err(h5space); - - hid_t h5connect = - H5Dcreate(h5file, "/connect", H5T_STD_U64LE, h5space, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - checkH5Err(h5connect); + hid_t connectType = H5T_STD_U64LE; + if (reduceInts) { + connectType = H5Tcopy(H5T_STD_U64LE); + std::size_t bits = ilog(globalSize[0]); + std::size_t unsignedSize = (bits + 7) / 8; + checkH5Err(connectType); + checkH5Err(H5Tset_size(connectType, unsignedSize)); + checkH5Err(H5Tcommit(h5file, "/connectType", connectType, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT)); + } - hsize_t start[2] = {offsets[0], 0}; 
- hsize_t count[2] = {localSize[0], 4}; - checkH5Err(H5Sselect_hyperslab(h5space, H5S_SELECT_SET, start, 0L, count, 0L)); + hid_t connectFilter = H5P_DEFAULT; + if (applyFilters) { + connectFilter = checkH5Err(H5Pcreate(H5P_DATASET_CREATE)); + hsize_t chunk[2] = {std::min(filterChunksize, sizes[0]), 4}; + checkH5Err(H5Pset_chunk(connectFilter, 2, chunk)); + if (filterEnable == 1) { + checkH5Err(H5Pset_scaleoffset(connectFilter, H5Z_SO_INT, H5Z_SO_INT_MINBITS_DEFAULT)); + } + else if (filterEnable < 11) { + int deflateStrength = filterEnable - 1; + checkH5Err(H5Pset_deflate(connectFilter, deflateStrength)); + } + } - sizes[0] = localSize[0]; - hid_t h5memspace = H5Screate_simple(2, sizes, 0L); - checkH5Err(h5memspace); + hid_t h5connect = + H5Dcreate(h5file, "/connect", connectType, h5space, H5P_DEFAULT, connectFilter, H5P_DEFAULT); + checkH5Err(h5connect); - hid_t h5dxlist = H5Pcreate(H5P_DATASET_XFER); - checkH5Err(h5dxlist); - checkH5Err(H5Pset_dxpl_mpio(h5dxlist, H5FD_MPIO_COLLECTIVE)); + hsize_t start[2] = {offsets[0], 0}; + hsize_t count[2] = {localSize[0], 4}; + checkH5Err(H5Sselect_hyperslab(h5space, H5S_SELECT_SET, start, 0L, count, 0L)); - unsigned long* connect = new unsigned long[localSize[0] * 4]; - it = mesh->begin(3); - unsigned int index = 0; - while (apf::MeshEntity* element = mesh->iterate(it)) { - apf::NewArray vn; - apf::getElementNumbers(vertexNum, element, vn); + sizes[0] = localSize[0]; + hid_t h5memspace = H5Screate_simple(2, sizes, 0L); + checkH5Err(h5memspace); - for (unsigned int i = 0; i < 4; i++) - connect[index * 4 + i] = vn[i]; + std::vector connect(localSize[0] * 4); + it = mesh->begin(3); + std::size_t index = 0; + while (apf::MeshEntity* element = mesh->iterate(it)) { + apf::NewArray vn; + apf::getElementNumbers(vertexNum, element, vn); - index++; - } - mesh->end(it); + for (int i = 0; i < 4; i++) { + connect[index * 4 + i] = vn[i]; + } - checkH5Err(H5Dwrite(h5connect, H5T_NATIVE_ULONG, h5memspace, h5space, h5dxlist, connect)); + index++; + } + mesh->end(it); - checkH5Err(H5Sclose(h5space)); - checkH5Err(H5Sclose(h5memspace)); - checkH5Err(H5Dclose(h5connect)); + checkH5Err(H5Dwrite(h5connect, H5T_NATIVE_ULONG, h5memspace, h5space, h5dxlist, connect.data())); - delete[] connect; + if (applyFilters) { + checkH5Err(H5Pclose(connectFilter)); + } + if (reduceInts) { + checkH5Err(H5Tclose(connectType)); + } + checkH5Err(H5Sclose(h5space)); + checkH5Err(H5Sclose(h5memspace)); + checkH5Err(H5Dclose(h5connect)); + } // Vertices logInfo(rank) << "Writing vertices"; + { + hsize_t sizes[2] = {0,0}; + hsize_t start[2] = {0,0}; + hsize_t count[2] = {0,0}; + sizes[0] = globalSize[1]; + sizes[1] = 3; + hid_t h5space = H5Screate_simple(2, sizes, 0L); + checkH5Err(h5space); - sizes[0] = globalSize[1]; - sizes[1] = 3; - h5space = H5Screate_simple(2, sizes, 0L); - checkH5Err(h5space); - - hid_t h5geometry = H5Dcreate(h5file, "/geometry", H5T_IEEE_F64LE, h5space, H5P_DEFAULT, - H5P_DEFAULT, H5P_DEFAULT); - checkH5Err(h5geometry); + hid_t geometryFilter = H5P_DEFAULT; + if (applyFilters && filterEnable > 1) { + geometryFilter = checkH5Err(H5Pcreate(H5P_DATASET_CREATE)); + hsize_t chunk[2] = {std::min(filterChunksize, sizes[0]), 3}; + checkH5Err(H5Pset_chunk(geometryFilter, 2, chunk)); + if (filterEnable == 1) { + // float compression disabled at the moment (would be lossy) + // checkH5Err(H5Pset_scaleoffset(geometryFilter, H5Z_SO_FLOAT_DSCALE, H5Z_SO_INT_MINBITS_DEFAULT)); + } + else if (filterEnable < 11) { + int deflateStrength = filterEnable - 1; + 
checkH5Err(H5Pset_deflate(geometryFilter, deflateStrength)); + } + } - start[0] = offsets[1]; - count[0] = localSize[1]; - count[1] = 3; - checkH5Err(H5Sselect_hyperslab(h5space, H5S_SELECT_SET, start, 0L, count, 0L)); + hid_t h5geometry = H5Dcreate(h5file, "/geometry", H5T_IEEE_F64LE, h5space, H5P_DEFAULT, + geometryFilter, H5P_DEFAULT); + checkH5Err(h5geometry); - sizes[0] = localSize[1]; - h5memspace = H5Screate_simple(2, sizes, 0L); - checkH5Err(h5memspace); + start[0] = offsets[1]; + count[0] = localSize[1]; + count[1] = 3; + checkH5Err(H5Sselect_hyperslab(h5space, H5S_SELECT_SET, start, 0L, count, 0L)); - apf::Sharing* sharing = apf::getSharing(mesh); + sizes[0] = localSize[1]; + hid_t h5memspace = H5Screate_simple(2, sizes, 0L); + checkH5Err(h5memspace); - double* geometry = new double[localSize[1] * 3]; - it = mesh->begin(0); - index = 0; - while (apf::MeshEntity* element = mesh->iterate(it)) { - if (!sharing->isOwned(element)) - continue; + std::vector geometry(localSize[1] * 3); + it = mesh->begin(0); + std::size_t index = 0; + while (apf::MeshEntity* element = mesh->iterate(it)) { + if (!sharing->isOwned(element)) { + continue; + } - long gid = apf::getNumber(vertexNum, apf::Node(element, 0)); + long gid = apf::getNumber(vertexNum, apf::Node(element, 0)); - if (gid != static_cast(offsets[1] + index)) - logError() << "Global vertex numbering is incorrect"; + if (gid != static_cast(offsets[1] + index)) { + logError() << "Global vertex numbering is incorrect"; + } - apf::Vector3 point; - mesh->getPoint(element, 0, point); - point.toArray(&geometry[index * 3]); + apf::Vector3 point; + mesh->getPoint(element, 0, point); + point.toArray(&geometry[index * 3]); - index++; - } - mesh->end(it); + index++; + } + mesh->end(it); - checkH5Err(H5Dwrite(h5geometry, H5T_NATIVE_DOUBLE, h5memspace, h5space, h5dxlist, geometry)); + checkH5Err(H5Dwrite(h5geometry, H5T_NATIVE_DOUBLE, h5memspace, h5space, h5dxlist, geometry.data())); - checkH5Err(H5Sclose(h5space)); - checkH5Err(H5Sclose(h5memspace)); - checkH5Err(H5Dclose(h5geometry)); + if (applyFilters && filterEnable > 1) { + checkH5Err(H5Pclose(geometryFilter)); + } - delete[] geometry; + checkH5Err(H5Sclose(h5space)); + checkH5Err(H5Sclose(h5memspace)); + checkH5Err(H5Dclose(h5geometry)); + } // Group information apf::MeshTag* groupTag = mesh->findTag("group"); + std::size_t groupSize = 4; if (groupTag) { logInfo(rank) << "Writing group information"; + hsize_t sizes[1] = {0}; + hsize_t start[1] = {0}; + hsize_t count[1] = {0}; + sizes[0] = globalSize[0]; - h5space = H5Screate_simple(1, sizes, 0L); + hid_t h5space = H5Screate_simple(1, sizes, 0L); checkH5Err(h5space); + std::vector group(localSize[0]); + it = mesh->begin(3); + std::size_t index = 0; + while (apf::MeshEntity* element = mesh->iterate(it)) { + assert(mesh->hasTag(element, groupTag)); + + mesh->getIntTag(element, groupTag, &group[index]); + index++; + } + mesh->end(it); + + hid_t groupType = H5T_STD_I32LE; + if (reduceInts) { + groupType = H5Tcopy(H5T_STD_I32LE); + uint32_t minvalue = std::max(-(*std::min_element(group.begin(), group.end()) + 1), 0); + uint32_t maxvalue = *std::max_element(group.begin(), group.end()); + uint32_t totalmax = std::max(minvalue, maxvalue); + MPI_Allreduce(MPI_IN_PLACE, &totalmax, 1, MPI_UNSIGNED, MPI_MAX, MPI_COMM_WORLD); + std::size_t bits = ilog(totalmax); + std::size_t signedSize = (bits + 7 + 1) / 8; + checkH5Err(groupType); + checkH5Err(H5Tset_size(groupType, signedSize)); + checkH5Err(H5Tcommit(h5file, "/groupType", groupType, H5P_DEFAULT, 
H5P_DEFAULT, H5P_DEFAULT)); + } + + hid_t groupFilter = H5P_DEFAULT; + if (applyFilters) { + groupFilter = checkH5Err(H5Pcreate(H5P_DATASET_CREATE)); + hsize_t chunk[1] = {std::min(filterChunksize, sizes[0])}; + checkH5Err(H5Pset_chunk(groupFilter, 1, chunk)); + if (filterEnable == 1) { + checkH5Err(H5Pset_scaleoffset(groupFilter, H5Z_SO_INT, H5Z_SO_INT_MINBITS_DEFAULT)); + } + else if (filterEnable < 11) { + int deflateStrength = filterEnable - 1; + checkH5Err(H5Pset_deflate(groupFilter, deflateStrength)); + } + } + hid_t h5group = - H5Dcreate(h5file, "/group", H5T_STD_I32LE, h5space, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + H5Dcreate(h5file, "/group", H5T_STD_I32LE, h5space, H5P_DEFAULT, groupFilter, H5P_DEFAULT); checkH5Err(h5group); start[0] = offsets[0]; @@ -346,89 +514,99 @@ int main(int argc, char* argv[]) { checkH5Err(H5Sselect_hyperslab(h5space, H5S_SELECT_SET, start, 0L, count, 0L)); sizes[0] = localSize[0]; - h5memspace = H5Screate_simple(1, sizes, 0L); + hid_t h5memspace = H5Screate_simple(1, sizes, 0L); checkH5Err(h5memspace); - int* group = new int[localSize[0]]; - it = mesh->begin(3); - index = 0; - while (apf::MeshEntity* element = mesh->iterate(it)) { - assert(mesh->hasTag(element, groupTag)); + checkH5Err(H5Dwrite(h5group, H5T_NATIVE_INT, h5memspace, h5space, h5dxlist, group.data())); - mesh->getIntTag(element, groupTag, &group[index]); - index++; + if (applyFilters) { + checkH5Err(H5Pclose(groupFilter)); + } + if (reduceInts) { + checkH5Err(H5Tclose(groupType)); } - mesh->end(it); - - checkH5Err(H5Dwrite(h5group, H5T_NATIVE_INT, h5memspace, h5space, h5dxlist, group)); - checkH5Err(H5Sclose(h5space)); checkH5Err(H5Sclose(h5memspace)); checkH5Err(H5Dclose(h5group)); - - delete[] group; } else { logInfo() << "No group information found in mesh"; } // Write boundary condition logInfo(rank) << "Writing boundary condition"; - apf::MeshTag* boundaryTag = mesh->findTag("boundary condition"); - assert(boundaryTag); + { + apf::MeshTag* boundaryTag = mesh->findTag("boundary condition"); + assert(boundaryTag); - int* boundary = new int[localSize[0]]; - memset(boundary, 0, localSize[0] * sizeof(int)); + hsize_t sizes[1] = {0}; + hsize_t start[1] = {0}; + hsize_t count[1] = {0}; + sizes[0] = globalSize[0]; + hid_t h5space = H5Screate_simple(1, sizes, 0L); + checkH5Err(h5space); - sizes[0] = globalSize[0]; - h5space = H5Screate_simple(1, sizes, 0L); - checkH5Err(h5space); + hid_t boundaryFilter = H5P_DEFAULT; + if (applyFilters) { + boundaryFilter = checkH5Err(H5Pcreate(H5P_DATASET_CREATE)); + hsize_t chunk[1] = {std::min(filterChunksize, sizes[0])}; + checkH5Err(H5Pset_chunk(boundaryFilter, 1, chunk)); + if (filterEnable == 1) { + checkH5Err(H5Pset_scaleoffset(boundaryFilter, H5Z_SO_INT, H5Z_SO_INT_MINBITS_DEFAULT)); + } + else if (filterEnable < 11) { + int deflateStrength = filterEnable - 1; + checkH5Err(H5Pset_deflate(boundaryFilter, deflateStrength)); + } + } + + hid_t h5boundary = + H5Dcreate(h5file, "/boundary", H5T_STD_I32LE, h5space, H5P_DEFAULT, boundaryFilter, H5P_DEFAULT); + checkH5Err(h5boundary); - hid_t h5boundary = - H5Dcreate(h5file, "/boundary", H5T_STD_I32LE, h5space, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - checkH5Err(h5boundary); + start[0] = offsets[0]; + count[0] = localSize[0]; + checkH5Err(H5Sselect_hyperslab(h5space, H5S_SELECT_SET, start, 0L, count, 0L)); - start[0] = offsets[0]; - count[0] = localSize[0]; - checkH5Err(H5Sselect_hyperslab(h5space, H5S_SELECT_SET, start, 0L, count, 0L)); + sizes[0] = localSize[0]; + hid_t h5memspace = H5Screate_simple(1, 
sizes, 0L); + checkH5Err(h5memspace); - sizes[0] = localSize[0]; - h5memspace = H5Screate_simple(1, sizes, 0L); - checkH5Err(h5memspace); + std::vector boundary(localSize[0]); - it = mesh->begin(3); - index = 0; - while (apf::MeshEntity* element = mesh->iterate(it)) { - apf::Downward faces; - mesh->getDownward(element, 2, faces); + it = mesh->begin(3); + std::size_t index = 0; + while (apf::MeshEntity* element = mesh->iterate(it)) { + apf::Downward faces; + mesh->getDownward(element, 2, faces); - for (unsigned int i = 0; i < 4; i++) { - if (mesh->hasTag(faces[i], boundaryTag)) { - int b; - mesh->getIntTag(faces[i], boundaryTag, &b); + for (int i = 0; i < 4; i++) { + if (mesh->hasTag(faces[i], boundaryTag)) { + int b; + mesh->getIntTag(faces[i], boundaryTag, &b); - if (b <= 0 || b > std::numeric_limits::max()) - logError() << "Cannot handle boundary condition" << b; + if (b <= 0 || b > std::numeric_limits::max()) + logError() << "Cannot handle boundary condition" << b; - boundary[index] += b << (i * 8); + boundary[index] += b << (i * 8); + } } - } - index++; - } - mesh->end(it); - - checkH5Err(H5Dwrite(h5boundary, H5T_NATIVE_INT, h5memspace, h5space, h5dxlist, boundary)); + index++; + } + mesh->end(it); - checkH5Err(H5Sclose(h5space)); - checkH5Err(H5Sclose(h5memspace)); - checkH5Err(H5Dclose(h5boundary)); + checkH5Err(H5Dwrite(h5boundary, H5T_NATIVE_INT, h5memspace, h5space, h5dxlist, boundary.data())); - delete[] boundary; + if (boundaryFilter) { + checkH5Err(H5Pclose(boundaryFilter)); + } + checkH5Err(H5Sclose(h5space)); + checkH5Err(H5Sclose(h5memspace)); + checkH5Err(H5Dclose(h5boundary)); + } checkH5Err(H5Pclose(h5dxlist)); - delete meshInput; - // Writing XDMF file if (rank == 0) { logInfo() << "Writing XDMF file"; @@ -451,7 +629,7 @@ int main(int argc, char* argv[]) { << std::endl // This should be UInt but for some reason this does not work with // binary data - << " " << basename << ":/connect" << std::endl << " " << std::endl @@ -462,7 +640,7 @@ int main(int argc, char* argv[]) { << ":/geometry" << std::endl << " " << std::endl << " " << std::endl - << " " << basename << ":/group" << std::endl << " " << std::endl @@ -478,6 +656,10 @@ int main(int argc, char* argv[]) { checkH5Err(H5Fclose(h5file)); + delete sharing; + delete mesh; + delete meshInput; + logInfo(rank) << "Finished successfully"; PCU_Comm_Free(); From 5cd8d93f6e6d0b1cda0f5ffda93831702a0e2dcc Mon Sep 17 00:00:00 2001 From: David Schneller Date: Wed, 20 Sep 2023 23:46:37 +0200 Subject: [PATCH 2/8] Upgrade C++, add std::size_t everywhere --- src/input/NetCDFMesh.h | 40 +++--- src/input/NetCDFPartition.h | 12 +- src/input/ParallelVertexFilter.h | 185 +++++++++++++------------- src/input/SerialMeshFile.h | 16 +-- src/meshreader/FidapReader.h | 75 ++++++----- src/meshreader/GMSHBuilder.h | 2 +- src/meshreader/GambitReader.h | 117 ++++++++-------- src/meshreader/MeshReader.h | 18 +-- src/meshreader/ParallelFidapReader.h | 96 ++++++------- src/meshreader/ParallelGMSHReader.h | 24 ++-- src/meshreader/ParallelGambitReader.h | 128 ++++++++---------- src/meshreader/ParallelMeshReader.h | 45 ++++--- 12 files changed, 363 insertions(+), 395 deletions(-) diff --git a/src/input/NetCDFMesh.h b/src/input/NetCDFMesh.h index 9f487e7..934b6fb 100644 --- a/src/input/NetCDFMesh.h +++ b/src/input/NetCDFMesh.h @@ -55,10 +55,10 @@ class NetCDFMesh : public MeshInput { checkNcError(nc_inq_dimlen(ncFile, ncDimPart, &nPartitions)); // Local partitions - unsigned int nMaxLocalPart = (nPartitions + nProcs - 1) / nProcs; - unsigned int nLocalPart = 
nMaxLocalPart; - if (nPartitions < (rank + 1) * nMaxLocalPart) { - nLocalPart = std::max(0, static_cast(nPartitions - rank * nMaxLocalPart)); + const std::size_t nMaxLocalPart = (nPartitions + nProcs - 1) / nProcs; + std::size_t nLocalPart = nMaxLocalPart; + if (nPartitions < (rank + 1) * nMaxLocalPart && nPartitions >= rank * nMaxLocalPart) { + nLocalPart = static_cast(nPartitions - rank * nMaxLocalPart); } MPI_Comm commIO; @@ -73,8 +73,8 @@ class NetCDFMesh : public MeshInput { PCU_Switch_Comm(commIO); - unsigned int nElements = 0; - unsigned int nVertices = 0; + std::size_t nElements = 0; + std::size_t nVertices = 0; std::vector elements; std::vector vertices; std::vector boundaries; @@ -115,8 +115,10 @@ class NetCDFMesh : public MeshInput { // Read elements logInfo(rank) << "Reading netCDF file"; - for (unsigned int i = 0; i < nMaxLocalPart; i++) { - unsigned int j = i % nLocalPart; + for (std::size_t i = 0; i < nMaxLocalPart; i++) { + std::size_t j = i % nLocalPart; + + // for now, each partition stays limited to about 2^31 maximum elements size_t start[3] = {j + rank * nMaxLocalPart, 0, 0}; @@ -152,26 +154,26 @@ class NetCDFMesh : public MeshInput { checkNcError(nc_close(ncFile)); - for (unsigned int i = 0; i < nLocalPart; i++) { + for (std::size_t i = 0; i < nLocalPart; i++) { nElements += partitions[i].nElements(); nVertices += partitions[i].nVertices(); } // Copy to the buffer - std::vector elementsLocal(nElements * 4); + std::vector elementsLocal(nElements * 4); elements.resize(nElements * 4); vertices.resize(nVertices * 3); boundaries.resize(nElements * 4); groups.resize(nElements); - unsigned int elementOffset = 0; - unsigned int vertexOffset = 0; - for (unsigned int i = 0; i < nLocalPart; i++) { + std::size_t elementOffset = 0; + std::size_t vertexOffset = 0; + for (std::size_t i = 0; i < nLocalPart; i++) { #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel schedule(static) #endif - for (unsigned int j = 0; j < partitions[i].nElements() * 4; j++) { + for (std::size_t j = 0; j < partitions[i].nElements() * 4; j++) { elementsLocal[elementOffset * 4 + j] = partitions[i].elements()[j] + vertexOffset; } @@ -190,7 +192,7 @@ class NetCDFMesh : public MeshInput { logInfo(rank) << "Running vertex filter"; ParallelVertexFilter filter(commIO); - filter.filter(nVertices, vertices.data()); + filter.filter(nVertices, vertices); // Create filtered vertex list @@ -202,7 +204,7 @@ class NetCDFMesh : public MeshInput { #ifdef _OPENMP #pragma omp parallel #endif - for (unsigned int i = 0; i < nElements * 4; i++) { + for (std::size_t i = 0; i < nElements * 4; i++) { elements[i] = filter.globalIds()[elementsLocal[i]]; } } @@ -220,12 +222,12 @@ class NetCDFMesh : public MeshInput { // Set boundaries apf::MeshTag* boundaryTag = m_mesh->createIntTag("boundary condition", 1); apf::MeshIterator* it = m_mesh->begin(3); - unsigned int i = 0; + std::size_t i = 0; while (apf::MeshEntity* element = m_mesh->iterate(it)) { apf::Adjacent adjacent; m_mesh->getAdjacent(element, 2, adjacent); - for (unsigned int j = 0; j < 4; j++) { + for (int j = 0; j < 4; j++) { if (!boundaries[i * 4 + j]) continue; diff --git a/src/input/NetCDFPartition.h b/src/input/NetCDFPartition.h index 57e76ef..71953d1 100644 --- a/src/input/NetCDFPartition.h +++ b/src/input/NetCDFPartition.h @@ -20,8 +20,8 @@ */ class Partition { private: - unsigned int m_nElements; - unsigned int m_nVertices; + std::size_t m_nElements; + std::size_t m_nVertices; int* m_elements; double* m_vertices; @@ -40,7 +40,7 @@ class Partition { delete[] 
m_groups; } - void setElemSize(unsigned int nElements) { + void setElemSize(std::size_t nElements) { if (m_nElements != 0) return; @@ -53,7 +53,7 @@ class Partition { memset(m_groups, 0, nElements * sizeof(int)); } - void setVrtxSize(unsigned int nVertices) { + void setVrtxSize(std::size_t nVertices) { if (m_nVertices != 0) return; @@ -72,9 +72,9 @@ class Partition { } } - unsigned int nElements() const { return m_nElements; } + std::size_t nElements() const { return m_nElements; } - unsigned int nVertices() const { return m_nVertices; } + std::size_t nVertices() const { return m_nVertices; } int* elements() { return m_elements; } diff --git a/src/input/ParallelVertexFilter.h b/src/input/ParallelVertexFilter.h index 870b271..fdacf8b 100644 --- a/src/input/ParallelVertexFilter.h +++ b/src/input/ParallelVertexFilter.h @@ -22,10 +22,14 @@ #include #include #include +#include #include +#include #include "utils/logger.h" +#include "third_party/MPITraits.h" + /** * Filters duplicate vertices in parallel */ @@ -36,12 +40,12 @@ class ParallelVertexFilter { */ class IndexedVertexComparator { private: - const double* m_vertices; + const std::vector& m_vertices; public: - IndexedVertexComparator(const double* vertices) : m_vertices(vertices) {} + IndexedVertexComparator(const std::vector& vertices) : m_vertices(vertices) {} - bool operator()(unsigned int i, unsigned int j) { + bool operator()(std::size_t i, std::size_t j) { i *= 3; j *= 3; @@ -63,17 +67,17 @@ class ParallelVertexFilter { int m_numProcs; /** Global id after filtering */ - unsigned long* m_globalIds; + std::vector m_globalIds; /** Number of local vertices after filtering */ - unsigned int m_numLocalVertices; + std::size_t m_numLocalVertices; /** Local vertices after filtering */ - double* m_localVertices; + std::vector m_localVertices; public: ParallelVertexFilter(MPI_Comm comm = MPI_COMM_WORLD) - : m_comm(comm), m_globalIds(0L), m_numLocalVertices(0), m_localVertices(0L) { + : m_comm(comm), m_numLocalVertices(0) { MPI_Comm_rank(comm, &m_rank); MPI_Comm_size(comm, &m_numProcs); @@ -84,25 +88,23 @@ class ParallelVertexFilter { } virtual ~ParallelVertexFilter() { - delete[] m_globalIds; - delete[] m_localVertices; } /** * @param vertices Vertices that should be filtered, must have the size * numVertices * 3 */ - void filter(unsigned int numVertices, const double* vertices) { + void filter(std::size_t numVertices, const std::vector& vertices) { // Chop the last 4 bits to avoid numerical errors - double* roundVertices = new double[numVertices * 3]; - removeRoundError(vertices, numVertices * 3, roundVertices); + std::vector roundVertices (numVertices * 3); + removeRoundError(vertices, roundVertices); // Create indices and sort them locally - unsigned int* sortIndices = new unsigned int[numVertices]; - createSortedIndices(roundVertices, numVertices, sortIndices); + std::vector sortIndices(numVertices); + createSortedIndices(roundVertices, sortIndices); // Select BUCKETS_PER_RANK-1 splitter elements - double localSplitters[BUCKETS_PER_RANK - 1]; + std::array localSplitters; #if 0 // Use omp only if we create a larger amount of buckets #ifdef _OPENMP #pragma omp parallel for schedule(static) @@ -118,20 +120,22 @@ class ParallelVertexFilter { } // Collect all splitter elements on rank 0 - double* allSplitters = 0L; + std::vector allSplitters; - if (m_rank == 0) - allSplitters = new double[m_numProcs * (BUCKETS_PER_RANK - 1)]; + if (m_rank == 0) { + allSplitters.resize(m_numProcs * (BUCKETS_PER_RANK - 1)); + } - MPI_Gather(localSplitters, 
BUCKETS_PER_RANK - 1, MPI_DOUBLE, allSplitters, BUCKETS_PER_RANK - 1, + MPI_Gather(localSplitters.data(), BUCKETS_PER_RANK - 1, MPI_DOUBLE, allSplitters.data(), BUCKETS_PER_RANK - 1, MPI_DOUBLE, 0, m_comm); // Sort splitter elements - if (m_rank == 0) - std::sort(allSplitters, allSplitters + (m_numProcs * (BUCKETS_PER_RANK - 1))); + if (m_rank == 0) { + std::sort(allSplitters.begin(), allSplitters.end()); + } // Distribute splitter to all processes - double* splitters = new double[m_numProcs - 1]; + std::vector splitters (m_numProcs - 1); if (m_rank == 0) { #ifdef _OPENMP @@ -145,157 +149,142 @@ class ParallelVertexFilter { } } - MPI_Bcast(splitters, m_numProcs - 1, MPI_DOUBLE, 0, m_comm); - - delete[] allSplitters; + MPI_Bcast(splitters.data(), m_numProcs - 1, MPI_DOUBLE, 0, m_comm); // Determine the bucket for each vertex - unsigned int* bucket = new unsigned int[numVertices]; + std::vector bucket (numVertices); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (unsigned int i = 0; i < numVertices; i++) { - double* ub = std::upper_bound(splitters, splitters + m_numProcs - 1, roundVertices[i * 3]); + for (std::size_t i = 0; i < numVertices; i++) { + auto ub = std::upper_bound(splitters.begin(), splitters.end(), roundVertices[i * 3]); - bucket[i] = ub - splitters; + bucket[i] = std::distance(splitters.begin(), ub); } - delete[] roundVertices; - delete[] splitters; - // Determine the (local and total) bucket size - int* bucketSize = new int[m_numProcs]; - memset(bucketSize, 0, sizeof(int) * m_numProcs); - for (unsigned int i = 0; i < numVertices; i++) - bucketSize[bucket[i]]++; - - delete[] bucket; + std::vector bucketSize (m_numProcs); + for (std::size_t i = 0; i < numVertices; i++) { + ++bucketSize[bucket[i]]; + } // Tell all processes what we are going to send them - int* recvSize = new int[m_numProcs]; + std::vector recvSize(m_numProcs); - MPI_Alltoall(bucketSize, 1, MPI_INT, recvSize, 1, MPI_INT, m_comm); + MPI_Alltoall(bucketSize.data(), 1, MPI_INT, recvSize.data(), 1, MPI_INT, m_comm); - unsigned int numSortVertices = 0; + std::size_t numSortVertices = 0; #ifdef _OPENMP #pragma omp parallel for schedule(static) reduction(+ : numSortVertices) #endif - for (int i = 0; i < m_numProcs; i++) + for (int i = 0; i < m_numProcs; i++) { numSortVertices += recvSize[i]; + } // Create sorted send buffer - double* sendVertices = new double[3 * numVertices]; + std::vector sendVertices (3 * numVertices); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (unsigned int i = 0; i < numVertices; i++) { - memcpy(&sendVertices[i * 3], &vertices[sortIndices[i] * 3], sizeof(double) * 3); + for (std::size_t i = 0; i < numVertices; i++) { + std::copy_n(vertices.begin() + sortIndices[i] * 3, 3, sendVertices.begin() + i * 3); } // Allocate buffer for the vertices and exchange them - double* sortVertices = new double[3 * numSortVertices]; + std::vector sortVertices (3 * numSortVertices); - int* sDispls = new int[m_numProcs]; - int* rDispls = new int[m_numProcs]; + std::vector sDispls (m_numProcs); + std::vector rDispls (m_numProcs); sDispls[0] = 0; rDispls[0] = 0; for (int i = 1; i < m_numProcs; i++) { sDispls[i] = sDispls[i - 1] + bucketSize[i - 1]; rDispls[i] = rDispls[i - 1] + recvSize[i - 1]; } - MPI_Alltoallv(sendVertices, bucketSize, sDispls, vertexType, sortVertices, recvSize, rDispls, + MPI_Alltoallv(sendVertices.data(), bucketSize.data(), sDispls.data(), vertexType, sortVertices.data(), recvSize.data(), rDispls.data(), vertexType, m_comm); - delete[] sendVertices; 
- // Chop the last 4 bits to avoid numerical errors - roundVertices = new double[numSortVertices * 3]; - removeRoundError(sortVertices, numSortVertices * 3, roundVertices); + roundVertices.resize(numSortVertices * 3); + removeRoundError(sortVertices, roundVertices); // Create indices and sort them (such that the vertices are sorted) - unsigned int* sortSortIndices = new unsigned int[numSortVertices]; - createSortedIndices(roundVertices, numSortVertices, sortSortIndices); - - delete[] roundVertices; + std::vector sortSortIndices (numSortVertices); + createSortedIndices(roundVertices, sortSortIndices); // Initialize the global ids we send back to the other processors - unsigned long* gids = new unsigned long[numSortVertices]; + std::vector gids (numSortVertices); if (numSortVertices > 0) { gids[sortSortIndices[0]] = 0; - for (unsigned int i = 1; i < numSortVertices; i++) { + for (std::size_t i = 1; i < numSortVertices; i++) { if (equals(&sortVertices[sortSortIndices[i - 1] * 3], - &sortVertices[sortSortIndices[i] * 3])) + &sortVertices[sortSortIndices[i] * 3])) { gids[sortSortIndices[i]] = gids[sortSortIndices[i - 1]]; - else + } + else { gids[sortSortIndices[i]] = gids[sortSortIndices[i - 1]] + 1; + } } } // Create the local vertices list - if (numSortVertices > 0) + if (numSortVertices > 0) { m_numLocalVertices = gids[sortSortIndices[numSortVertices - 1]] + 1; - else + } + else { m_numLocalVertices = 0; - delete[] m_localVertices; - m_localVertices = new double[m_numLocalVertices * 3]; - for (unsigned int i = 0; i < numSortVertices; i++) - memcpy(&m_localVertices[gids[i] * 3], &sortVertices[i * 3], sizeof(double) * 3); + } - delete[] sortVertices; + m_localVertices.resize(m_numLocalVertices * 3); + for (std::size_t i = 0; i < numSortVertices; i++) { + std::copy_n(sortVertices.begin() + i * 3, 3, m_localVertices.begin() + gids[i] * 3); + } // Get the vertices offset - unsigned int offset = m_numLocalVertices; - MPI_Scan(MPI_IN_PLACE, &offset, 1, MPI_UNSIGNED, MPI_SUM, m_comm); + std::size_t offset = m_numLocalVertices; + MPI_Scan(MPI_IN_PLACE, &offset, 1, tndm::mpi_type_t(), MPI_SUM, m_comm); offset -= m_numLocalVertices; // Add offset to the global ids #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (unsigned int i = 0; i < numSortVertices; i++) + for (std::size_t i = 0; i < numSortVertices; i++) { gids[i] += offset; + } // Send result back - unsigned long* globalIds = new unsigned long[numVertices]; - MPI_Alltoallv(gids, recvSize, rDispls, MPI_UNSIGNED_LONG, globalIds, bucketSize, sDispls, - MPI_UNSIGNED_LONG, m_comm); - - delete[] bucketSize; - delete[] recvSize; - delete[] sDispls; - delete[] rDispls; - delete[] gids; + std::vector globalIds (numVertices); + MPI_Alltoallv(gids.data(), recvSize.data(), rDispls.data(), tndm::mpi_type_t(), globalIds.data(), bucketSize.data(), sDispls.data(), + tndm::mpi_type_t(), m_comm); // Assign the global ids to the correct vertices - delete[] m_globalIds; - m_globalIds = new unsigned long[numVertices]; + m_globalIds.resize(numVertices); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (unsigned int i = 0; i < numVertices; i++) + for (std::size_t i = 0; i < numVertices; i++) { m_globalIds[sortIndices[i]] = globalIds[i]; - - delete[] sortIndices; - delete[] globalIds; + } } /** * @return The list of the global identifiers after filtering */ - const unsigned long* globalIds() const { return m_globalIds; } + const std::vector& globalIds() const { return m_globalIds; } /** * @return Number of vertices this process 
is responsible for after filtering */ - unsigned int numLocalVertices() const { return m_numLocalVertices; } + std::size_t numLocalVertices() const { return m_numLocalVertices; } /** * @return The list of vertices this process is responsible for after * filtering */ - const double* localVertices() const { return m_localVertices; } + const std::vector& localVertices() const { return m_localVertices; } private: /** @@ -329,29 +318,33 @@ class ParallelVertexFilter { * @param[out] roundValues The list of rounded values * (the caller is responsible for allocating the memory) */ - static void removeRoundError(const double* values, unsigned int count, double* roundValues) { + static void removeRoundError(const std::vector& values, std::vector& roundValues) { + assert(values.size() == roundValues.size()); + #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (unsigned int i = 0; i < count; i++) + for (std::size_t i = 0; i < roundValues.size(); i++) { roundValues[i] = removeRoundError(values[i]); + } } /** * Creates the list of sorted indices for the vertices. * The caller is responsible for allocating the memory. */ - static void createSortedIndices(const double* vertices, unsigned int numVertices, - unsigned int* sortedIndices) { + static void createSortedIndices(const std::vector& vertices, + std::vector& sortedIndices) { #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (unsigned int i = 0; i < numVertices; i++) + for (std::size_t i = 0; i < sortedIndices.size(); i++) { sortedIndices[i] = i; + } IndexedVertexComparator comparator(vertices); - std::sort(sortedIndices, sortedIndices + numVertices, comparator); + std::sort(sortedIndices.begin(), sortedIndices.end(), comparator); } /** @@ -366,7 +359,7 @@ class ParallelVertexFilter { static MPI_Datatype vertexType; /** The total buckets we create is BUCKETS_PER_RANK * numProcs */ - const static int BUCKETS_PER_RANK = 8; + constexpr static int BUCKETS_PER_RANK = 8; }; #endif // PARALLEL_VERTEX_FILTER_H diff --git a/src/input/SerialMeshFile.h b/src/input/SerialMeshFile.h index 25b8c45..5377314 100644 --- a/src/input/SerialMeshFile.h +++ b/src/input/SerialMeshFile.h @@ -71,10 +71,10 @@ template class SerialMeshFile : public MeshInput { void open(const char* meshFile) { m_meshReader.open(meshFile); - unsigned int nVertices = m_meshReader.nVertices(); - unsigned int nElements = m_meshReader.nElements(); - unsigned int nLocalVertices = (nVertices + m_nProcs - 1) / m_nProcs; - unsigned int nLocalElements = (nElements + m_nProcs - 1) / m_nProcs; + const std::size_t nVertices = m_meshReader.nVertices(); + const std::size_t nElements = m_meshReader.nElements(); + std::size_t nLocalVertices = (nVertices + m_nProcs - 1) / m_nProcs; + std::size_t nLocalElements = (nElements + m_nProcs - 1) / m_nProcs; if (m_rank == m_nProcs - 1) { nLocalVertices = nVertices - (m_nProcs - 1) * nLocalVertices; nLocalElements = nElements - (m_nProcs - 1) * nLocalElements; @@ -89,7 +89,7 @@ template class SerialMeshFile : public MeshInput { { std::vector elements; { - std::vector elementsInt(nLocalElements * 4); + std::vector elementsInt(nLocalElements * 4); m_meshReader.readElements(elementsInt.data()); elements = std::vector(elementsInt.begin(), elementsInt.end()); } @@ -114,12 +114,12 @@ template class SerialMeshFile : public MeshInput { std::vector boundaries(nLocalElements * 4); m_meshReader.readBoundaries(boundaries.data()); apf::MeshIterator* it = m_mesh->begin(3); - unsigned int i = 0; + std::size_t i = 0; while (apf::MeshEntity* 
element = m_mesh->iterate(it)) { apf::Adjacent adjacent; m_mesh->getAdjacent(element, 2, adjacent); - for (unsigned int j = 0; j < 4; j++) { + for (int j = 0; j < 4; j++) { if (!boundaries[i * 4 + j]) { continue; } @@ -138,7 +138,7 @@ template class SerialMeshFile : public MeshInput { std::vector groups(nLocalElements); m_meshReader.readGroups(groups.data()); apf::MeshIterator* it = m_mesh->begin(3); - unsigned int i = 0; + std::size_t i = 0; while (apf::MeshEntity* element = m_mesh->iterate(it)) { m_mesh->setIntTag(element, groupTag, &groups[i]); i++; diff --git a/src/meshreader/FidapReader.h b/src/meshreader/FidapReader.h index f071ff2..4fa99c8 100644 --- a/src/meshreader/FidapReader.h +++ b/src/meshreader/FidapReader.h @@ -18,6 +18,7 @@ #include #include #include +#include #include "utils/stringutils.h" @@ -25,7 +26,7 @@ struct FidapBoundaryFace { /** The vertices of the face */ - int vertices[3]; + std::array vertices; /** The type of the boundary */ int type; }; @@ -34,16 +35,16 @@ struct FidapBoundaryFace { struct FaceVertex { FaceVertex() {} - FaceVertex(int v1, int v2, int v3) { + FaceVertex(std::size_t v1, std::size_t v2, std::size_t v3) { vertices[0] = v1; vertices[1] = v2; vertices[2] = v3; - std::sort(vertices, vertices + 3); + std::sort(vertices.begin(), vertices.end()); } - FaceVertex(const int v[3]) { - memcpy(vertices, v, sizeof(vertices)); - std::sort(vertices, vertices + 3); + FaceVertex(const std::array& v) { + std::copy(v.begin(), v.end(), vertices.begin()); + std::sort(vertices.begin(), vertices.end()); } bool operator<(const FaceVertex& other) const { @@ -53,7 +54,7 @@ struct FaceVertex { vertices[2] < other.vertices[2]); } - int vertices[3]; + std::array vertices; }; /** Defines a face by element and face side */ @@ -63,12 +64,12 @@ struct FaceElement { side = -1; } - FaceElement(int e, int s) { + FaceElement(std::size_t e, int s) { element = e; side = s; } - int element; + std::size_t element; int side; }; @@ -109,7 +110,7 @@ class FidapReader : public MeshReader { getline(m_mesh, line); // Date getline(m_mesh, line); // Empty line - unsigned int nElements, nZones, t, dimensions; + std::size_t nElements, nZones, t, dimensions; m_mesh >> m_vertices.nLines; m_mesh >> nElements; m_mesh >> nZones; @@ -147,10 +148,10 @@ class FidapReader : public MeshReader { if (line.find(ELEMENT_GROUPS) == std::string::npos) logError() << "Invalid Fidap format: Element groups expected, found" << line; - for (unsigned int i = 0; i < nZones; i++) { + for (std::size_t i = 0; i < nZones; i++) { FileSection sec; - // Group + // Group (unused) m_mesh >> name; if (name.find(ZONE_GROUP) == std::string::npos) logError() << "Invalid Fidap format: Expected group, found" << name; @@ -167,13 +168,13 @@ class FidapReader : public MeshReader { logError() << "Invalid Fidap format: Expected nodes, found" << name; unsigned int nodeType; m_mesh >> nodeType; - // Geometry + // Geometry (unused) m_mesh >> name; if (name.find(ZONE_GEOMETRY) == std::string::npos) logError() << "Invalid Fidap format: Expected geometry, found" << name; unsigned int geoType; m_mesh >> geoType; - // Type + // Type (unused) m_mesh >> name; if (name.find(ZONE_TYPE) == std::string::npos) logError() << "Invalid Fidap format: Expected type, found" << name; @@ -223,35 +224,33 @@ class FidapReader : public MeshReader { } } - unsigned int nVertices() const { return m_vertices.nLines; } + std::size_t nVertices() const { return m_vertices.nLines; } - unsigned int nElements() const { - unsigned int count = 0; - for 
(std::vector::const_iterator i = m_elements.begin(); i != m_elements.end(); - i++) { - count += i->nLines; + std::size_t nElements() const { + std::size_t count = 0; + for (const auto& element : m_elements) { + count += element.nLines; } return count; } - unsigned int nBoundaries() const { - unsigned int count = 0; - for (std::vector::const_iterator i = m_boundaries.begin(); - i != m_boundaries.end(); i++) { - count += i->nLines; + std::size_t nBoundaries() const { + std::size_t count = 0; + for (const auto& boundary : m_boundaries) { + count += boundary.nLines; } return count; } - void readVertices(unsigned int start, unsigned int count, double* vertices) { + void readVertices(std::size_t start, std::size_t count, double* vertices) { m_mesh.clear(); m_mesh.seekg(m_vertices.seekPosition + start * m_vertices.lineSize + m_vertices.lineSize - 3 * COORDINATE_SIZE - 1); - for (unsigned int i = 0; i < count; i++) { + for (std::size_t i = 0; i < count; i++) { for (int j = 0; j < 3; j++) m_mesh >> vertices[i * 3 + j]; @@ -259,7 +258,7 @@ class FidapReader : public MeshReader { } } - void readElements(unsigned int start, unsigned int count, int* elements) { + void readElements(std::size_t start, std::size_t count, std::size_t* elements) { m_mesh.clear(); // Get the boundary, were we should start reading @@ -271,7 +270,7 @@ class FidapReader : public MeshReader { m_mesh.seekg(section->seekPosition + start * section->lineSize + section->lineSize - 4 * VERTEX_ID_SIZE - 1); - for (unsigned int i = 0; i < count; i++) { + for (std::size_t i = 0; i < count; i++) { if (start >= section->nLines) { // Are we starting a new section in this iteration? start = 0; @@ -280,7 +279,7 @@ class FidapReader : public MeshReader { m_mesh.seekg(section->seekPosition + section->lineSize - 4 * VERTEX_ID_SIZE - 1); } - for (unsigned int j = 0; j < 4; j++) { + for (int j = 0; j < 4; j++) { m_mesh >> elements[i * 4 + j]; elements[i * 4 + j]--; } @@ -295,14 +294,14 @@ class FidapReader : public MeshReader { * Read group ids from start to start+count. The group ids are sorted * in the same order as the elements. */ - void readGroups(unsigned int start, unsigned int count, int* groups) { + void readGroups(std::size_t start, std::size_t count, int* groups) { // Get the boundary, were we should start reading std::vector::const_iterator section; for (section = m_elements.begin(); section != m_elements.end() && section->nLines < start; section++) start -= section->nLines; - for (unsigned int i = 0; i < count; i++) { + for (std::size_t i = 0; i < count; i++) { if (start >= section->nLines) { // Are we starting a new section in this iteration? start = 0; @@ -320,7 +319,7 @@ class FidapReader : public MeshReader { */ void readGroups(int* groups) { readGroups(0, nElements(), groups); } - void readBoundaries(unsigned int start, unsigned int count, FidapBoundaryFace* boundaries) { + void readBoundaries(std::size_t start, std::size_t count, FidapBoundaryFace* boundaries) { m_mesh.clear(); // Get the boundary, were we should start reading @@ -332,7 +331,7 @@ class FidapReader : public MeshReader { m_mesh.seekg(section->seekPosition + start * section->lineSize + section->lineSize - 3 * VERTEX_ID_SIZE - 1); - for (unsigned int i = 0; i < count; i++) { + for (std::size_t i = 0; i < count; i++) { if (start >= section->nLines) { // Are we starting a new section in this iteration? 
start = 0; @@ -341,7 +340,7 @@ class FidapReader : public MeshReader { m_mesh.seekg(section->seekPosition + section->lineSize - 3 * VERTEX_ID_SIZE - 1); } - for (unsigned int j = 0; j < 3; j++) { + for (int j = 0; j < 3; j++) { m_mesh >> boundaries[i].vertices[j]; boundaries[i].vertices[j]--; } @@ -354,7 +353,7 @@ class FidapReader : public MeshReader { } } - // TODO void readBoundaries(int* boundaries) {} + // TODO void readBoundaries(std::size_t* boundaries) {} public: /** @@ -364,9 +363,9 @@ class FidapReader : public MeshReader { * @param n Number of elements * @param[out] map The map */ - static void createFaceMap(const int* elements, unsigned int n, + static void createFaceMap(const std::size_t* elements, std::size_t n, std::map& map) { - for (unsigned int i = 0; i < n; i++) { + for (std::size_t i = 0; i < n; i++) { FaceVertex v1(elements[i * 4], elements[i * 4 + 1], elements[i * 4 + 2]); FaceElement e1(i, 0); map[v1] = e1; diff --git a/src/meshreader/GMSHBuilder.h b/src/meshreader/GMSHBuilder.h index 02e0322..c4ce0f7 100644 --- a/src/meshreader/GMSHBuilder.h +++ b/src/meshreader/GMSHBuilder.h @@ -18,7 +18,7 @@ template <> struct GMSHSimplexType<3u> { static constexpr long type = 4; }; // template class GMSHBuilder : public tndm::GMSHMeshBuilder { public: using vertex_t = std::array; - using element_t = std::array; + using element_t = std::array; using facet_t = std::array; std::vector vertices; diff --git a/src/meshreader/GambitReader.h b/src/meshreader/GambitReader.h index e919fd0..057aa1e 100644 --- a/src/meshreader/GambitReader.h +++ b/src/meshreader/GambitReader.h @@ -30,7 +30,7 @@ namespace puml { */ struct ElementGroup { /** The element */ - unsigned int element; + std::size_t element; /** The group for this element */ int group; }; @@ -40,7 +40,7 @@ struct ElementGroup { */ struct GambitBoundaryFace { /** The element the face belongs to */ - unsigned int element; + std::size_t element; /** The face of the element */ unsigned int face; /** The type of the boundary */ @@ -107,7 +107,7 @@ class GambitReader : public MeshReader { getline(m_mesh, line); // Skip problem size names - unsigned int nGroups, nBoundaries, dimensions; + std::size_t nGroups, nBoundaries, dimensions; m_mesh >> m_vertices.nLines; m_mesh >> m_elements.nLines; m_mesh >> nGroups; @@ -166,7 +166,7 @@ class GambitReader : public MeshReader { // Groups m_groups.resize(nGroups); - for (unsigned int i = 0; i < nGroups; i++) { + for (std::size_t i = 0; i < nGroups; i++) { getline(m_mesh, line); if (line.find(ELEMENT_GROUP) == std::string::npos) logError() << "Invalid Gambit format: Group expected, found" << line; @@ -213,7 +213,7 @@ class GambitReader : public MeshReader { // Boundaries m_boundaries.resize(nBoundaries); - for (unsigned int i = 0; i < nBoundaries; i++) { + for (std::size_t i = 0; i < nBoundaries; i++) { getline(m_mesh, line); if (line.find(BOUNDARY_CONDITIONS) == std::string::npos) logError() << "Invalid Gambit format: Boundaries expected, found" << line; @@ -236,7 +236,7 @@ class GambitReader : public MeshReader { // Get element size std::istringstream ssE(line); - unsigned int element, type, face; + std::size_t element, type, face; ssE >> element; m_boundaries[i].elementSize = ssE.tellg(); ssE >> type; @@ -278,35 +278,34 @@ class GambitReader : public MeshReader { } } - unsigned int nVertices() const { return m_vertices.nLines; } + std::size_t nVertices() const { return m_vertices.nLines; } - unsigned int nElements() const { return m_elements.nLines; } + std::size_t nElements() const { return 
m_elements.nLines; } - unsigned int nBoundaries() const { - unsigned int count = 0; - for (std::vector::const_iterator i = m_boundaries.begin(); - i != m_boundaries.end(); i++) { - count += i->nLines; + std::size_t nBoundaries() const { + std::size_t count = 0; + for (const auto& boundary : m_boundaries) { + count += boundary.nLines; } return count; } /** - * @copydoc MeshReader::readVertices(unsigned int, unsigned int, double*) + * @copydoc MeshReader::readVertices(size_t, size_t, double*) * * @todo Only 3 dimensional meshes are supported */ - void readVertices(unsigned int start, unsigned int count, double* vertices) { + void readVertices(std::size_t start, std::size_t count, double* vertices) { m_mesh.clear(); m_mesh.seekg(m_vertices.seekPosition + start * m_vertices.lineSize + m_vertices.lineSize - 3 * COORDINATE_SIZE - 1); - char* buf = new char[3 * COORDINATE_SIZE]; + std::vector buf(3* COORDINATE_SIZE); - for (unsigned int i = 0; i < count; i++) { - m_mesh.read(buf, 3 * COORDINATE_SIZE); + for (std::size_t i = 0; i < count; i++) { + m_mesh.read(buf.data(), 3 * COORDINATE_SIZE); for (int j = 0; j < 3; j++) { std::istringstream ss(std::string(&buf[j * COORDINATE_SIZE], COORDINATE_SIZE)); @@ -315,25 +314,23 @@ class GambitReader : public MeshReader { m_mesh.seekg(m_vertices.lineSize - 3 * COORDINATE_SIZE, std::fstream::cur); } - - delete[] buf; } /** - * @copydoc MeshReader::readElements(unsigned int, unsigned int, int*) + * @copydoc MeshReader::readElements(size_t, size_t, size_t*) * * @todo Only tetrahedral meshes are supported * @todo Support for varying coordinate/vertexid fields */ - void readElements(unsigned int start, unsigned int count, int* elements) { + void readElements(std::size_t start, std::size_t count, std::size_t* elements) { m_mesh.clear(); m_mesh.seekg(m_elements.seekPosition + start * m_elements.lineSize + m_elements.vertexStart); - char* buf = new char[4 * m_elements.vertexSize]; + std::vector buf(4 * m_elements.vertexSize); - for (unsigned int i = 0; i < count; i++) { - m_mesh.read(buf, 4 * m_elements.vertexSize); + for (std::size_t i = 0; i < count; i++) { + m_mesh.read(buf.data(), 4 * m_elements.vertexSize); for (int j = 0; j < 4; j++) { std::istringstream ss(std::string(&buf[j * m_elements.vertexSize], m_elements.vertexSize)); @@ -343,8 +340,6 @@ class GambitReader : public MeshReader { m_mesh.seekg(m_elements.lineSize - 4 * m_elements.vertexSize, std::fstream::cur); } - - delete[] buf; } /** @@ -353,24 +348,25 @@ class GambitReader : public MeshReader { * @param groups Buffer for storing the group numbers. The caller is * responsible for allocating the buffer. 
*/ - void readGroups(unsigned int start, unsigned int count, ElementGroup* groups) { + void readGroups(std::size_t start, std::size_t count, ElementGroup* groups) { m_mesh.clear(); // Get the group, were we should start reading std::vector::const_iterator section; for (section = m_groups.begin(); section != m_groups.end() && section->nLines < start; - section++) + section++) { start -= section->nLines; + } m_mesh.seekg(section->seekPosition + (start / ELEMENTS_PER_LINE_GROUP) * section->lineSize + (start % ELEMENTS_PER_LINE_GROUP) * section->elementSize); - char* buf = new char[section->elementSize]; + std::vector buf(section->elementSize); - for (unsigned int i = 0; i < count; i++) { - m_mesh.read(buf, section->elementSize); + for (std::size_t i = 0; i < count; i++) { + m_mesh.read(buf.data(), section->elementSize); - std::istringstream ss(std::string(buf, section->elementSize)); + std::istringstream ss(std::string(buf.begin(), buf.end())); ss >> groups[i].element; groups[i].element--; groups[i].group = section->group; @@ -387,28 +383,25 @@ class GambitReader : public MeshReader { m_mesh.seekg(section->lineSize - section->elementSize * ELEMENTS_PER_LINE_GROUP, std::fstream::cur); } - - delete[] buf; } /** * Reads all groups numbers. - * In contrast to readGroups(unsigned int, unsigned int, ElementGroup*) it + * In contrast to readGroups(size_t, size_t, ElementGroup*) it * returns the group numbers sorted according to the elements. */ void readGroups(int* groups) { logInfo() << "Reading group information"; - ElementGroup* map = new ElementGroup[nElements()]; - readGroups(0, nElements(), map); + std::vector map(nElements()); + readGroups(0, nElements(), map.data()); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (unsigned int i = 0; i < nElements(); i++) + for (std::size_t i = 0; i < nElements(); i++) { groups[map[i].element] = map[i].group; - - delete[] map; + } } /** @@ -418,7 +411,7 @@ class GambitReader : public MeshReader { * @param elements Buffer to store boundaries. Each boundary consists of the * element, the face and the boundary type. */ - void readBoundaries(unsigned int start, unsigned int count, GambitBoundaryFace* boundaries) { + void readBoundaries(std::size_t start, std::size_t count, GambitBoundaryFace* boundaries) { m_mesh.clear(); // Get the boundary, were we should start reading @@ -427,21 +420,21 @@ class GambitReader : public MeshReader { section++) start -= section->nLines; - char* buf; + std::vector buf; if (section->variableLineLength) { m_mesh.seekg(section->seekPosition); for (size_t i = 0; i < start; i++) m_mesh.ignore(std::numeric_limits::max(), '\n'); - buf = 0L; + buf.resize(0); } else { m_mesh.seekg(section->seekPosition + start * section->lineSize); - buf = new char[section->elementSize + section->elementTypeSize + section->faceSize]; + buf.resize(section->elementSize + section->elementTypeSize + section->faceSize); } - for (unsigned int i = 0; i < count; i++) { + for (std::size_t i = 0; i < count; i++) { if (start >= section->nLines) { // Are we starting a new section in this iteration? 
start = 0; @@ -449,11 +442,12 @@ class GambitReader : public MeshReader { m_mesh.seekg(section->seekPosition); - delete[] buf; - if (section->variableLineLength) - buf = 0L; - else - buf = new char[section->elementSize + section->elementTypeSize + section->faceSize]; + if (section->variableLineLength) { + buf.resize(0); + } + else { + buf.resize(section->elementSize + section->elementTypeSize + section->faceSize); + } } if (section->variableLineLength) { @@ -462,13 +456,13 @@ class GambitReader : public MeshReader { m_mesh >> elementType; m_mesh >> boundaries[i].face; } else { - m_mesh.read(buf, section->elementSize + section->elementTypeSize + section->faceSize); + m_mesh.read(buf.data(), section->elementSize + section->elementTypeSize + section->faceSize); - std::istringstream ssE(std::string(buf, section->elementSize)); + std::istringstream ssE(std::string(buf.begin(), buf.begin() + section->elementSize)); ssE >> boundaries[i].element; std::istringstream ssF( - std::string(&buf[section->elementSize + section->elementTypeSize], section->faceSize)); + std::string(buf.begin() + section->elementSize + section->elementTypeSize, buf.begin() + section->elementSize + section->elementTypeSize + section->faceSize)); ssF >> boundaries[i].face; // Seek to next position @@ -481,10 +475,8 @@ class GambitReader : public MeshReader { boundaries[i].face--; boundaries[i].type = section->type; - start++; // Line in the current section + ++start; // Line in the current section } - - delete[] buf; } /** @@ -499,14 +491,13 @@ class GambitReader : public MeshReader { void readBoundaries(int* boundaries) { logInfo() << "Reading boundary conditions"; - unsigned int nBnds = nBoundaries(); - GambitBoundaryFace* faces = new GambitBoundaryFace[nBoundaries()]; - readBoundaries(0, nBnds, faces); + std::size_t nBnds = nBoundaries(); + std::vector faces(nBnds); + readBoundaries(0, nBnds, faces.data()); - for (unsigned int i = 0; i < nBnds; i++) + for (std::size_t i = 0; i < nBnds; i++) { boundaries[faces[i].element * 4 + faces[i].face] = faces[i].type; - - delete[] faces; + } } private: diff --git a/src/meshreader/MeshReader.h b/src/meshreader/MeshReader.h index ee7d968..6d97832 100644 --- a/src/meshreader/MeshReader.h +++ b/src/meshreader/MeshReader.h @@ -23,7 +23,7 @@ class MeshReader { /** Start of the section */ size_t seekPosition; /** Number of elements (lines) in the section */ - unsigned int nLines; + std::size_t nLines; /** Line size */ size_t lineSize; }; @@ -45,12 +45,12 @@ class MeshReader { logError() << "Could not open mesh file" << meshFile; } - virtual unsigned int nVertices() const = 0; - virtual unsigned int nElements() const = 0; + virtual std::size_t nVertices() const = 0; + virtual std::size_t nElements() const = 0; /** * @return Number of boundary faces */ - virtual unsigned int nBoundaries() const = 0; + virtual std::size_t nBoundaries() const = 0; /** * Reads vertices from start tp start+count from the file and stores them in @@ -60,12 +60,12 @@ class MeshReader { * responsible for allocating the buffer. 
The size of the buffer must be * count*dimensions */ - virtual void readVertices(unsigned int start, unsigned int count, double* vertices) = 0; + virtual void readVertices(std::size_t start, std::size_t count, double* vertices) = 0; /** * Read all vertices * - * @see readVertices(unsigned int, unsigned int, double*) + * @see readVertices(size_t, size_t, double*) */ void readVertices(double* vertices) { logInfo() << "Reading vertices"; @@ -80,14 +80,14 @@ class MeshReader { * responsible for allocating the buffer. The Size of the buffer must be * count*vertices_per_element. */ - virtual void readElements(unsigned int start, unsigned int count, int* elements) = 0; + virtual void readElements(std::size_t start, std::size_t count, std::size_t* elements) = 0; /** * Reads all elements * - * @see readElements(unsigned int, unsigned int, unsinged int*) + * @see readElements(size_t, size_t, size_t*) */ - void readElements(int* elements) { + void readElements(std::size_t* elements) { logInfo() << "Reading elements"; readElements(0, nElements(), elements); } diff --git a/src/meshreader/ParallelFidapReader.h b/src/meshreader/ParallelFidapReader.h index 5f563f6..23122bc 100644 --- a/src/meshreader/ParallelFidapReader.h +++ b/src/meshreader/ParallelFidapReader.h @@ -19,6 +19,7 @@ #include "FidapReader.h" #include "ParallelMeshReader.h" +#include "third_party/MPITraits.h" class ParallelFidapReader : public ParallelMeshReader { private: @@ -31,30 +32,32 @@ class ParallelFidapReader : public ParallelMeshReader { ParallelFidapReader(const char* meshFile, MPI_Comm comm = MPI_COMM_WORLD) : ParallelMeshReader(meshFile, comm) {} - void readElements(int* elements) { + void readElements(std::size_t* elements) { ParallelMeshReader::readElements(elements); // Create the face map - unsigned int chunkSize = (nElements() + m_nProcs - 1) / m_nProcs; - unsigned int maxChunkSize = chunkSize; - if (m_rank == m_nProcs - 1) + std::size_t chunkSize = (nElements() + m_nProcs - 1) / m_nProcs; + const std::size_t maxChunkSize = chunkSize; + if (m_rank == m_nProcs - 1) { chunkSize = nElements() - (m_nProcs - 1) * chunkSize; + } FidapReader::createFaceMap(elements, chunkSize, m_faceMap); - for (std::map::iterator i = m_faceMap.begin(); i != m_faceMap.end(); - i++) - i->second.element += m_rank * maxChunkSize; + for (auto& face : m_faceMap) { + face.second.element += m_rank * maxChunkSize; + } } /** * @copydoc ParallelGambitReader::readGroups */ void readGroups(int* groups) { - unsigned int chunkSize = (nElements() + m_nProcs - 1) / m_nProcs; + std::size_t chunkSize = (nElements() + m_nProcs - 1) / m_nProcs; if (m_rank == 0) { // Allocate second buffer so we can read and send in parallel - int* groups2 = new int[chunkSize]; + std::vector tempGroups(chunkSize); + int* groups2 = tempGroups.data(); if (m_nProcs % 2 == 0) // swap once so we have the correct buffer at the end swap(groups, groups2); @@ -71,7 +74,7 @@ class ParallelFidapReader : public ParallelMeshReader { if (m_nProcs > 1) { // Read last one - unsigned int lastChunkSize = nElements() - (m_nProcs - 1) * chunkSize; + const std::size_t lastChunkSize = nElements() - (m_nProcs - 1) * chunkSize; logInfo() << "Reading group information part" << (m_nProcs - 1) << "of" << m_nProcs; m_serialReader.readGroups((m_nProcs - 1) * chunkSize, lastChunkSize, groups); MPI_Wait(&request, MPI_STATUS_IGNORE); @@ -83,8 +86,6 @@ class ParallelFidapReader : public ParallelMeshReader { logInfo() << "Reading group information part" << m_nProcs << "of" << m_nProcs; m_serialReader.readGroups(0, 
chunkSize, groups); MPI_Wait(&request, MPI_STATUS_IGNORE); - - delete[] groups2; } else { if (m_rank == m_nProcs - 1) chunkSize = nElements() - (m_nProcs - 1) * chunkSize; @@ -123,36 +124,34 @@ class ParallelFidapReader : public ParallelMeshReader { std::vector facePos; std::vector faceType; - unsigned int chunkSize = (nElements() + m_nProcs - 1) / m_nProcs; - unsigned int maxChunkSize = chunkSize; + std::size_t chunkSize = (nElements() + m_nProcs - 1) / m_nProcs; + const std::size_t maxChunkSize = chunkSize; if (m_rank == 0) { - FidapBoundaryFace* faces = new FidapBoundaryFace[maxChunkSize]; + std::vector faces(maxChunkSize); - std::vector* aggregator = new std::vector[m_nProcs - 1]; - unsigned int* sizes = new unsigned int[m_nProcs - 1]; - MPI_Request* requests = new MPI_Request[(m_nProcs - 1) * 2]; - for (int i = 0; i < m_nProcs - 1; i++) { - requests[i * 2] = MPI_REQUEST_NULL; - requests[i * 2 + 1] = MPI_REQUEST_NULL; - } + std::vector> aggregator(m_nProcs - 1); + std::vector sizes(m_nProcs - 1); + std::vector requests((m_nProcs - 1) * 2, MPI_REQUEST_NULL); - unsigned int nChunks = (nBoundaries() + chunkSize - 1) / chunkSize; - for (unsigned int i = 0; i < nChunks; i++) { + const std::size_t nChunks = (nBoundaries() + chunkSize - 1) / chunkSize; + for (std::size_t i = 0; i < nChunks; i++) { logInfo() << "Reading boundary conditions part" << (i + 1) << "of" << nChunks; - if (i == nChunks - 1) + if (i == nChunks - 1) { chunkSize = nBoundaries() - (nChunks - 1) * chunkSize; + } - m_serialReader.readBoundaries(i * maxChunkSize, chunkSize, faces); + m_serialReader.readBoundaries(i * maxChunkSize, chunkSize, faces.data()); // Wait for all sending from last iteration - MPI_Waitall((m_nProcs - 1) * 2, requests, MPI_STATUSES_IGNORE); - for (int j = 0; j < m_nProcs - 1; j++) + MPI_Waitall((m_nProcs - 1) * 2, requests.data(), MPI_STATUSES_IGNORE); + for (int j = 0; j < m_nProcs - 1; j++) { aggregator[j].clear(); + } // Sort boundary conditions into the corresponding aggregator - for (unsigned int j = 0; j < chunkSize; j++) { + for (std::size_t j = 0; j < chunkSize; j++) { FaceVertex v(faces[j].vertices); if (v.vertices[1] < vertexChunk) { @@ -161,8 +160,8 @@ class ParallelFidapReader : public ParallelMeshReader { } else { // Face for another processor // Serials the struct to make sending easier - unsigned int proc = v.vertices[1] / vertexChunk; - aggregator[proc - 1].insert(aggregator[proc - 1].end(), v.vertices, v.vertices + 3); + int proc = v.vertices[1] / vertexChunk; + aggregator[proc - 1].insert(aggregator[proc - 1].end(), v.vertices.begin(), v.vertices.end()); aggregator[proc - 1].push_back(faces[j].type); } } @@ -173,54 +172,47 @@ class ParallelFidapReader : public ParallelMeshReader { continue; sizes[j] = aggregator[j].size() / 4; // 3 vertices + face type - MPI_Isend(&sizes[j], 1, MPI_UNSIGNED, j + 1, 0, m_comm, &requests[j * 2]); + MPI_Isend(&sizes[j], 1, tndm::mpi_type_t(), j + 1, 0, m_comm, &requests[j * 2]); MPI_Isend(&aggregator[j][0], aggregator[j].size(), MPI_INT, j + 1, 0, m_comm, &requests[j * 2 + 1]); } } - MPI_Waitall((m_nProcs - 1) * 2, requests, MPI_STATUSES_IGNORE); - - delete[] faces; - delete[] aggregator; + MPI_Waitall((m_nProcs - 1) * 2, requests.data(), MPI_STATUSES_IGNORE); // Send finish signal to all other processes - memset(sizes, 0, (m_nProcs - 1) * sizeof(unsigned int)); - for (int i = 0; i < m_nProcs - 1; i++) - MPI_Isend(&sizes[i], 1, MPI_UNSIGNED, i + 1, 0, m_comm, &requests[i]); - MPI_Waitall(m_nProcs - 1, requests, MPI_STATUSES_IGNORE); - - delete[] sizes; 
- delete[] requests; + std::fill(sizes.begin(), sizes.end(), 0); + for (int i = 0; i < m_nProcs - 1; i++) { + MPI_Isend(&sizes[i], 1, tndm::mpi_type_t(), i + 1, 0, m_comm, &requests[i]); + } + MPI_Waitall(m_nProcs - 1, requests.data(), MPI_STATUSES_IGNORE); } else { if (m_rank == m_nProcs - 1) chunkSize = nElements() - (m_nProcs - 1) * chunkSize; // Allocate enough memory - int* buf = new int[chunkSize * 4]; + std::vector buf(chunkSize * 4); while (true) { - unsigned int size; - MPI_Recv(&size, 1, MPI_UNSIGNED, 0, 0, m_comm, MPI_STATUS_IGNORE); + std::size_t size; + MPI_Recv(&size, 1, tndm::mpi_type_t(), 0, 0, m_comm, MPI_STATUS_IGNORE); if (size == 0) // Finished break; - MPI_Recv(buf, size * 4, MPI_INT, 0, 0, m_comm, MPI_STATUS_IGNORE); + MPI_Recv(buf.data(), size * 4, tndm::mpi_type_t(), 0, 0, m_comm, MPI_STATUS_IGNORE); - for (unsigned int i = 0; i < size * 4; i += 4) { - FaceVertex v(&buf[i]); + for (std::size_t i = 0; i < size * 4; i += 4) { + FaceVertex v(buf[i], buf[i+1], buf[i+2]); facePos.push_back(m_faceMap.at(v)); faceType.push_back(buf[i + 3]); } } - - delete[] buf; } // Distribute the faces to the final ranks PCU_Comm_Begin(); - for (unsigned int i = 0; i < facePos.size(); i++) { + for (std::size_t i = 0; i < facePos.size(); i++) { if (facePos[i].element / maxChunkSize == static_cast(m_rank)) boundaries[(facePos[i].element % maxChunkSize) * 4 + facePos[i].side] = faceType[i]; else { diff --git a/src/meshreader/ParallelGMSHReader.h b/src/meshreader/ParallelGMSHReader.h index 4d9426b..66b2902 100644 --- a/src/meshreader/ParallelGMSHReader.h +++ b/src/meshreader/ParallelGMSHReader.h @@ -43,10 +43,10 @@ template class ParallelGMSHReader { MPI_Bcast(&nElements_, 1, tndm::mpi_type_t(), 0, comm_); } - [[nodiscard]] unsigned int nVertices() const { return nVertices_; } - [[nodiscard]] unsigned int nElements() const { return nElements_; } - void readElements(int* elements) const { - static_assert(sizeof(GMSHBuilder::element_t) == (Dim + 1) * sizeof(int)); + [[nodiscard]] std::size_t nVertices() const { return nVertices_; } + [[nodiscard]] std::size_t nElements() const { return nElements_; } + void readElements(std::size_t* elements) const { + static_assert(sizeof(GMSHBuilder::element_t) == (Dim + 1) * sizeof(std::size_t)); scatter(builder_.elements.data()->data(), elements, nElements(), Dim + 1); } void readVertices(double* vertices) const { @@ -88,18 +88,18 @@ template class ParallelGMSHReader { std::sort(v2f.begin(), v2f.end()); } - for (unsigned elNo = 0; elNo < nElements; ++elNo) { - for (unsigned localFctNo = 0; localFctNo < nFacetsPerTet; ++localFctNo) { - unsigned nodes[nVertsPerTri]; - for (unsigned localNo = 0; localNo < nVertsPerTri; ++localNo) { + for (std::size_t elNo = 0; elNo < nElements; ++elNo) { + for (int localFctNo = 0; localFctNo < nFacetsPerTet; ++localFctNo) { + std::array nodes; + for (int localNo = 0; localNo < nVertsPerTri; ++localNo) { const auto localNoElement = Facet2Nodes[localFctNo][localNo]; nodes[localNo] = builder_.elements[elNo][localNoElement]; } - std::vector intersect[nVertsPerTri - 1]; + std::array, nVertsPerTri - 1> intersect; std::set_intersection(vertex2facets[nodes[0]].begin(), vertex2facets[nodes[0]].end(), vertex2facets[nodes[1]].begin(), vertex2facets[nodes[1]].end(), std::back_inserter(intersect[0])); - for (unsigned node = 2; node < nVertsPerTri; ++node) { + for (int node = 2; node < nVertsPerTri; ++node) { std::set_intersection(intersect[node - 2].begin(), intersect[node - 2].end(), vertex2facets[nodes[node]].begin(), 
vertex2facets[nodes[node]].end(), @@ -158,8 +158,8 @@ template class ParallelGMSHReader { MPI_Comm comm_; GMSHBuilder builder_; std::vector bcs_; - unsigned int nVertices_ = 0; - unsigned int nElements_ = 0; + std::size_t nVertices_ = 0; + std::size_t nElements_ = 0; }; } // namespace puml diff --git a/src/meshreader/ParallelGambitReader.h b/src/meshreader/ParallelGambitReader.h index 08bd5ed..06f2075 100644 --- a/src/meshreader/ParallelGambitReader.h +++ b/src/meshreader/ParallelGambitReader.h @@ -18,6 +18,8 @@ #include "GambitReader.h" #include "ParallelMeshReader.h" +#include "third_party/MPITraits.h" + namespace puml { class ParallelGambitReader : public ParallelMeshReader { @@ -35,35 +37,33 @@ class ParallelGambitReader : public ParallelMeshReader { * This is a collective operation. */ void readGroups(int* groups) { - unsigned int chunkSize = (nElements() + m_nProcs - 1) / m_nProcs; + std::size_t chunkSize = (nElements() + m_nProcs - 1) / m_nProcs; if (m_rank == 0) { - unsigned int maxChunkSize = chunkSize; - ElementGroup* map = new ElementGroup[maxChunkSize]; + std::size_t maxChunkSize = chunkSize; + std::vector map (maxChunkSize); - std::vector* aggregator = new std::vector[m_nProcs - 1]; - unsigned int* sizes = new unsigned int[m_nProcs - 1]; - MPI_Request* requests = new MPI_Request[(m_nProcs - 1) * 2]; - for (int i = 0; i < m_nProcs - 1; i++) { - requests[i * 2] = MPI_REQUEST_NULL; - requests[i * 2 + 1] = MPI_REQUEST_NULL; - } + std::vector> aggregator (m_nProcs - 1); + std::vector sizes (m_nProcs - 1); + std::vector requests ((m_nProcs - 1) * 2, MPI_REQUEST_NULL); for (int i = 0; i < m_nProcs; i++) { logInfo() << "Reading group information part" << (i + 1) << "of" << m_nProcs; - if (i == m_nProcs - 1) + if (i == m_nProcs - 1) { chunkSize = nElements() - (m_nProcs - 1) * chunkSize; + } - m_serialReader.readGroups(i * maxChunkSize, chunkSize, map); + m_serialReader.readGroups(i * maxChunkSize, chunkSize, map.data()); // Wait for all sending from last iteration - MPI_Waitall((m_nProcs - 1) * 2, requests, MPI_STATUSES_IGNORE); - for (int j = 0; j < m_nProcs - 1; j++) + MPI_Waitall((m_nProcs - 1) * 2, requests.data(), MPI_STATUSES_IGNORE); + for (int j = 0; j < m_nProcs - 1; j++) { aggregator[j].clear(); + } // Sort group numbers into the corresponding aggregator - for (unsigned int j = 0; j < chunkSize; j++) { + for (std::size_t j = 0; j < chunkSize; j++) { if (map[j].element < maxChunkSize) // Local element groups[map[j].element] = map[j].group; @@ -78,43 +78,39 @@ class ParallelGambitReader : public ParallelMeshReader { // Send send aggregated mapping for (int j = 0; j < m_nProcs - 1; j++) { - if (aggregator[j].empty()) + if (aggregator[j].empty()) { continue; + } sizes[j] = aggregator[j].size() / 2; // element id + group number - MPI_Isend(&sizes[j], 1, MPI_UNSIGNED, j + 1, 0, m_comm, &requests[j * 2]); + MPI_Isend(&sizes[j], 1, tndm::mpi_type_t(), j + 1, 0, m_comm, &requests[j * 2]); MPI_Isend(&aggregator[j][0], aggregator[j].size(), MPI_INT, j + 1, 0, m_comm, &requests[j * 2 + 1]); } } - MPI_Waitall((m_nProcs - 1) * 2, requests, MPI_STATUSES_IGNORE); - - delete[] map; - delete[] aggregator; - delete[] sizes; - delete[] requests; + MPI_Waitall((m_nProcs - 1) * 2, requests.data(), MPI_STATUSES_IGNORE); } else { - if (m_rank == m_nProcs - 1) + if (m_rank == m_nProcs - 1) { chunkSize = nElements() - (m_nProcs - 1) * chunkSize; + } // Allocate enough memory - unsigned int* buf = new unsigned int[chunkSize * 2]; + std::vector buf(chunkSize * 2); unsigned int recieved = 0; while 
(recieved < chunkSize) { - unsigned int size; - MPI_Recv(&size, 1, MPI_UNSIGNED, 0, 0, m_comm, MPI_STATUS_IGNORE); - MPI_Recv(buf, size * 2, MPI_INT, 0, 0, m_comm, MPI_STATUS_IGNORE); + std::size_t size; + MPI_Recv(&size, 1, tndm::mpi_type_t(), 0, 0, m_comm, MPI_STATUS_IGNORE); + MPI_Recv(buf.data(), size * 2, MPI_INT, 0, 0, m_comm, MPI_STATUS_IGNORE); - for (unsigned int i = 0; i < size * 2; i += 2) + for (std::size_t i = 0; i < size * 2; i += 2) { groups[buf[i]] = buf[i + 1]; + } recieved += size; } - - delete[] buf; } } @@ -129,36 +125,33 @@ class ParallelGambitReader : public ParallelMeshReader { * @todo Only tetrahedral meshes are supported */ void readBoundaries(int* boundaries) { - unsigned int chunkSize = (nElements() + m_nProcs - 1) / m_nProcs; + std::size_t chunkSize = (nElements() + m_nProcs - 1) / m_nProcs; if (m_rank == 0) { - unsigned int maxChunkSize = chunkSize; - GambitBoundaryFace* faces = new GambitBoundaryFace[maxChunkSize]; + std::size_t maxChunkSize = chunkSize; + std::vector faces (maxChunkSize); - std::vector* aggregator = new std::vector[m_nProcs - 1]; - unsigned int* sizes = new unsigned int[m_nProcs - 1]; - MPI_Request* requests = new MPI_Request[(m_nProcs - 1) * 2]; - for (int i = 0; i < m_nProcs - 1; i++) { - requests[i * 2] = MPI_REQUEST_NULL; - requests[i * 2 + 1] = MPI_REQUEST_NULL; - } + std::vector> aggregator (m_nProcs - 1); + std::vector sizes (m_nProcs - 1); + std::vector requests ((m_nProcs - 1) * 2, MPI_REQUEST_NULL); - unsigned int nChunks = (nBoundaries() + chunkSize - 1) / chunkSize; - for (unsigned int i = 0; i < nChunks; i++) { + std::size_t nChunks = (nBoundaries() + chunkSize - 1) / chunkSize; + for (std::size_t i = 0; i < nChunks; i++) { logInfo() << "Reading boundary conditions part" << (i + 1) << "of" << nChunks; - if (i == nChunks - 1) + if (i == nChunks - 1) { chunkSize = nBoundaries() - (nChunks - 1) * chunkSize; + } - m_serialReader.readBoundaries(i * maxChunkSize, chunkSize, faces); + m_serialReader.readBoundaries(i * maxChunkSize, chunkSize, faces.data()); // Wait for all sending from last iteration - MPI_Waitall((m_nProcs - 1) * 2, requests, MPI_STATUSES_IGNORE); + MPI_Waitall((m_nProcs - 1) * 2, requests.data(), MPI_STATUSES_IGNORE); for (int j = 0; j < m_nProcs - 1; j++) aggregator[j].clear(); // Sort boundary conditions into the corresponding aggregator - for (unsigned int j = 0; j < chunkSize; j++) { + for (std::size_t j = 0; j < chunkSize; j++) { if (faces[j].element < maxChunkSize) // Local element boundaries[faces[j].element * 4 + faces[j].face] = faces[j].type; @@ -173,50 +166,47 @@ class ParallelGambitReader : public ParallelMeshReader { // Send send aggregated values for (int j = 0; j < m_nProcs - 1; j++) { - if (aggregator[j].empty()) + if (aggregator[j].empty()) { continue; + } sizes[j] = aggregator[j].size() / 2; // element id + face type - MPI_Isend(&sizes[j], 1, MPI_UNSIGNED, j + 1, 0, m_comm, &requests[j * 2]); + MPI_Isend(&sizes[j], 1, tndm::mpi_type_t(), j + 1, 0, m_comm, &requests[j * 2]); MPI_Isend(&aggregator[j][0], aggregator[j].size(), MPI_INT, j + 1, 0, m_comm, &requests[j * 2 + 1]); } } - MPI_Waitall((m_nProcs - 1) * 2, requests, MPI_STATUSES_IGNORE); - - delete[] faces; - delete[] aggregator; + MPI_Waitall((m_nProcs - 1) * 2, requests.data(), MPI_STATUSES_IGNORE); // Send finish signal to all other processes - memset(sizes, 0, (m_nProcs - 1) * sizeof(unsigned int)); - for (int i = 0; i < m_nProcs - 1; i++) - MPI_Isend(&sizes[i], 1, MPI_UNSIGNED, i + 1, 0, m_comm, &requests[i]); - MPI_Waitall(m_nProcs - 1, 
requests, MPI_STATUSES_IGNORE); - - delete[] sizes; - delete[] requests; + std::fill(sizes.begin(), sizes.end(), 0); + for (int i = 0; i < m_nProcs - 1; i++) { + MPI_Isend(&sizes[i], 1, tndm::mpi_type_t(), i + 1, 0, m_comm, &requests[i]); + } + MPI_Waitall(m_nProcs - 1, requests.data(), MPI_STATUSES_IGNORE); } else { - if (m_rank == m_nProcs - 1) + if (m_rank == m_nProcs - 1) { chunkSize = nElements() - (m_nProcs - 1) * chunkSize; + } // Allocate enough memory - int* buf = new int[chunkSize * 2]; + std::vector buf (chunkSize * 2); while (true) { - unsigned int size; - MPI_Recv(&size, 1, MPI_UNSIGNED, 0, 0, m_comm, MPI_STATUS_IGNORE); - if (size == 0) + std::size_t size; + MPI_Recv(&size, 1, tndm::mpi_type_t(), 0, 0, m_comm, MPI_STATUS_IGNORE); + if (size == 0) { // Finished break; + } - MPI_Recv(buf, size * 2, MPI_INT, 0, 0, m_comm, MPI_STATUS_IGNORE); + MPI_Recv(buf.data(), size * 2, MPI_INT, 0, 0, m_comm, MPI_STATUS_IGNORE); - for (unsigned int i = 0; i < size * 2; i += 2) + for (std::size_t i = 0; i < size * 2; i += 2) { boundaries[buf[i]] = buf[i + 1]; + } } - - delete[] buf; } } }; diff --git a/src/meshreader/ParallelMeshReader.h b/src/meshreader/ParallelMeshReader.h index b851c9e..73069b9 100644 --- a/src/meshreader/ParallelMeshReader.h +++ b/src/meshreader/ParallelMeshReader.h @@ -15,14 +15,17 @@ #include +#include #include +#include "third_party/MPITraits.h" + template class ParallelMeshReader { private: // Some variables that are required by all processes - unsigned int m_nVertices; - unsigned int m_nElements; - unsigned int m_nBoundaries; + std::size_t m_nVertices; + std::size_t m_nElements; + std::size_t m_nBoundaries; protected: const MPI_Comm m_comm; @@ -50,7 +53,7 @@ template class ParallelMeshReader { virtual ~ParallelMeshReader() {} void open(const char* meshFile) { - unsigned int vars[3]; + std::array vars; if (m_rank == 0) { m_serialReader.open(meshFile); @@ -60,21 +63,21 @@ template class ParallelMeshReader { vars[2] = m_serialReader.nBoundaries(); } - MPI_Bcast(vars, 3, MPI_UNSIGNED, 0, m_comm); + MPI_Bcast(vars.data(), 3, tndm::mpi_type_t(), 0, m_comm); m_nVertices = vars[0]; m_nElements = vars[1]; m_nBoundaries = vars[2]; } - unsigned int nVertices() const { return m_nVertices; } + std::size_t nVertices() const { return m_nVertices; } - unsigned int nElements() const { return m_nElements; } + std::size_t nElements() const { return m_nElements; } /** * @return Number of boundary faces */ - unsigned int nBoundaries() const { return m_nBoundaries; } + std::size_t nBoundaries() const { return m_nBoundaries; } /** * Reads all vertices @@ -86,11 +89,12 @@ template class ParallelMeshReader { * @todo Only 3 dimensional meshes are supported */ void readVertices(double* vertices) { - unsigned int chunkSize = (m_nVertices + m_nProcs - 1) / m_nProcs; + std::size_t chunkSize = (m_nVertices + m_nProcs - 1) / m_nProcs; if (m_rank == 0) { // Allocate second buffer so we can read and send in parallel - double* vertices2 = new double[chunkSize * 3]; + std::vector tempVertices(chunkSize * 3); + double* vertices2 = tempVertices.data(); if (m_nProcs % 2 == 0) // swap once so we have the correct buffer at the end swap(vertices, vertices2); @@ -106,7 +110,7 @@ template class ParallelMeshReader { if (m_nProcs > 1) { // Read last one - unsigned int lastChunkSize = m_nVertices - (m_nProcs - 1) * chunkSize; + const std::size_t lastChunkSize = m_nVertices - (m_nProcs - 1) * chunkSize; logInfo() << "Reading vertices part" << (m_nProcs - 1) << "of" << m_nProcs; 
m_serialReader.readVertices((m_nProcs - 1) * chunkSize, lastChunkSize, vertices); MPI_Wait(&request, MPI_STATUS_IGNORE); @@ -118,8 +122,6 @@ template class ParallelMeshReader { logInfo() << "Reading vertices part" << m_nProcs << "of" << m_nProcs; m_serialReader.readVertices(0, chunkSize, vertices); MPI_Wait(&request, MPI_STATUS_IGNORE); - - delete[] vertices2; } else { if (m_rank == m_nProcs - 1) chunkSize = m_nVertices - (m_nProcs - 1) * chunkSize; @@ -137,12 +139,13 @@ template class ParallelMeshReader { * * @todo Only tetrahedral meshes are supported */ - virtual void readElements(int* elements) { - unsigned int chunkSize = (m_nElements + m_nProcs - 1) / m_nProcs; + virtual void readElements(std::size_t* elements) { + std::size_t chunkSize = (m_nElements + m_nProcs - 1) / m_nProcs; if (m_rank == 0) { // Allocate second buffer so we can read and send in parallel - int* elements2 = new int[chunkSize * 4]; + std::vector tempElements(chunkSize * 4); + std::size_t* elements2 = tempElements.data(); if (m_nProcs % 2 == 0) // swap once so we have the correct buffer at the end swap(elements, elements2); @@ -153,17 +156,17 @@ template class ParallelMeshReader { logInfo() << "Reading elements part" << i << "of" << m_nProcs; m_serialReader.readElements(i * chunkSize, chunkSize, elements); MPI_Wait(&request, MPI_STATUS_IGNORE); - MPI_Isend(elements, chunkSize * 4, MPI_INT, i, 0, m_comm, &request); + MPI_Isend(elements, chunkSize * 4, tndm::mpi_type_t(), i, 0, m_comm, &request); swap(elements, elements2); } if (m_nProcs > 1) { // Read last one - unsigned int lastChunkSize = m_nElements - (m_nProcs - 1) * chunkSize; + const std::size_t lastChunkSize = m_nElements - (m_nProcs - 1) * chunkSize; logInfo() << "Reading elements part" << (m_nProcs - 1) << "of" << m_nProcs; m_serialReader.readElements((m_nProcs - 1) * chunkSize, lastChunkSize, elements); MPI_Wait(&request, MPI_STATUS_IGNORE); - MPI_Isend(elements, lastChunkSize * 4, MPI_INT, m_nProcs - 1, 0, m_comm, &request); + MPI_Isend(elements, lastChunkSize * 4, tndm::mpi_type_t(), m_nProcs - 1, 0, m_comm, &request); swap(elements, elements2); } @@ -171,13 +174,11 @@ template class ParallelMeshReader { logInfo() << "Reading elements part" << m_nProcs << "of" << m_nProcs; m_serialReader.readElements(0, chunkSize, elements); MPI_Wait(&request, MPI_STATUS_IGNORE); - - delete[] elements2; } else { if (m_rank == m_nProcs - 1) chunkSize = m_nElements - (m_nProcs - 1) * chunkSize; - MPI_Recv(elements, chunkSize * 4, MPI_INT, 0, 0, m_comm, MPI_STATUS_IGNORE); + MPI_Recv(elements, chunkSize * 4, tndm::mpi_type_t(), 0, 0, m_comm, MPI_STATUS_IGNORE); } } From 7748c255f8c69fe7ded75d9a0eee1ed71586cba2 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 26 Sep 2023 14:14:54 +0200 Subject: [PATCH 3/8] clang-format and compile bugs --- src/input/ParallelVertexFilter.h | 50 +++++++-------- src/meshreader/FidapReader.h | 2 +- src/meshreader/GMSHBuilder.h | 16 +++-- src/meshreader/GambitReader.h | 13 ++-- src/meshreader/ParallelFidapReader.h | 11 ++-- src/meshreader/ParallelGambitReader.h | 24 +++---- src/meshreader/ParallelMeshReader.h | 6 +- src/pumgen.cpp | 90 +++++++++++++++------------ 8 files changed, 117 insertions(+), 95 deletions(-) diff --git a/src/input/ParallelVertexFilter.h b/src/input/ParallelVertexFilter.h index fdacf8b..96b1136 100644 --- a/src/input/ParallelVertexFilter.h +++ b/src/input/ParallelVertexFilter.h @@ -19,12 +19,12 @@ #include #include +#include #include #include #include #include #include -#include #include "utils/logger.h" @@ -76,8 
+76,7 @@ class ParallelVertexFilter { std::vector m_localVertices; public: - ParallelVertexFilter(MPI_Comm comm = MPI_COMM_WORLD) - : m_comm(comm), m_numLocalVertices(0) { + ParallelVertexFilter(MPI_Comm comm = MPI_COMM_WORLD) : m_comm(comm), m_numLocalVertices(0) { MPI_Comm_rank(comm, &m_rank); MPI_Comm_size(comm, &m_numProcs); @@ -87,8 +86,7 @@ class ParallelVertexFilter { } } - virtual ~ParallelVertexFilter() { - } + virtual ~ParallelVertexFilter() {} /** * @param vertices Vertices that should be filtered, must have the size @@ -96,7 +94,7 @@ class ParallelVertexFilter { */ void filter(std::size_t numVertices, const std::vector& vertices) { // Chop the last 4 bits to avoid numerical errors - std::vector roundVertices (numVertices * 3); + std::vector roundVertices(numVertices * 3); removeRoundError(vertices, roundVertices); // Create indices and sort them locally @@ -126,8 +124,8 @@ class ParallelVertexFilter { allSplitters.resize(m_numProcs * (BUCKETS_PER_RANK - 1)); } - MPI_Gather(localSplitters.data(), BUCKETS_PER_RANK - 1, MPI_DOUBLE, allSplitters.data(), BUCKETS_PER_RANK - 1, - MPI_DOUBLE, 0, m_comm); + MPI_Gather(localSplitters.data(), BUCKETS_PER_RANK - 1, MPI_DOUBLE, allSplitters.data(), + BUCKETS_PER_RANK - 1, MPI_DOUBLE, 0, m_comm); // Sort splitter elements if (m_rank == 0) { @@ -135,7 +133,7 @@ class ParallelVertexFilter { } // Distribute splitter to all processes - std::vector splitters (m_numProcs - 1); + std::vector splitters(m_numProcs - 1); if (m_rank == 0) { #ifdef _OPENMP @@ -152,7 +150,7 @@ class ParallelVertexFilter { MPI_Bcast(splitters.data(), m_numProcs - 1, MPI_DOUBLE, 0, m_comm); // Determine the bucket for each vertex - std::vector bucket (numVertices); + std::vector bucket(numVertices); #ifdef _OPENMP #pragma omp parallel for schedule(static) @@ -164,7 +162,7 @@ class ParallelVertexFilter { } // Determine the (local and total) bucket size - std::vector bucketSize (m_numProcs); + std::vector bucketSize(m_numProcs); for (std::size_t i = 0; i < numVertices; i++) { ++bucketSize[bucket[i]]; } @@ -183,7 +181,7 @@ class ParallelVertexFilter { } // Create sorted send buffer - std::vector sendVertices (3 * numVertices); + std::vector sendVertices(3 * numVertices); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif @@ -192,29 +190,29 @@ class ParallelVertexFilter { } // Allocate buffer for the vertices and exchange them - std::vector sortVertices (3 * numSortVertices); + std::vector sortVertices(3 * numSortVertices); - std::vector sDispls (m_numProcs); - std::vector rDispls (m_numProcs); + std::vector sDispls(m_numProcs); + std::vector rDispls(m_numProcs); sDispls[0] = 0; rDispls[0] = 0; for (int i = 1; i < m_numProcs; i++) { sDispls[i] = sDispls[i - 1] + bucketSize[i - 1]; rDispls[i] = rDispls[i - 1] + recvSize[i - 1]; } - MPI_Alltoallv(sendVertices.data(), bucketSize.data(), sDispls.data(), vertexType, sortVertices.data(), recvSize.data(), rDispls.data(), - vertexType, m_comm); + MPI_Alltoallv(sendVertices.data(), bucketSize.data(), sDispls.data(), vertexType, + sortVertices.data(), recvSize.data(), rDispls.data(), vertexType, m_comm); // Chop the last 4 bits to avoid numerical errors roundVertices.resize(numSortVertices * 3); removeRoundError(sortVertices, roundVertices); // Create indices and sort them (such that the vertices are sorted) - std::vector sortSortIndices (numSortVertices); + std::vector sortSortIndices(numSortVertices); createSortedIndices(roundVertices, sortSortIndices); // Initialize the global ids we send back to the other processors - 
std::vector gids (numSortVertices); + std::vector gids(numSortVertices); if (numSortVertices > 0) { gids[sortSortIndices[0]] = 0; @@ -222,8 +220,7 @@ class ParallelVertexFilter { if (equals(&sortVertices[sortSortIndices[i - 1] * 3], &sortVertices[sortSortIndices[i] * 3])) { gids[sortSortIndices[i]] = gids[sortSortIndices[i - 1]]; - } - else { + } else { gids[sortSortIndices[i]] = gids[sortSortIndices[i - 1]] + 1; } } @@ -232,8 +229,7 @@ class ParallelVertexFilter { // Create the local vertices list if (numSortVertices > 0) { m_numLocalVertices = gids[sortSortIndices[numSortVertices - 1]] + 1; - } - else { + } else { m_numLocalVertices = 0; } @@ -256,8 +252,9 @@ class ParallelVertexFilter { } // Send result back - std::vector globalIds (numVertices); - MPI_Alltoallv(gids.data(), recvSize.data(), rDispls.data(), tndm::mpi_type_t(), globalIds.data(), bucketSize.data(), sDispls.data(), + std::vector globalIds(numVertices); + MPI_Alltoallv(gids.data(), recvSize.data(), rDispls.data(), tndm::mpi_type_t(), + globalIds.data(), bucketSize.data(), sDispls.data(), tndm::mpi_type_t(), m_comm); // Assign the global ids to the correct vertices @@ -318,7 +315,8 @@ class ParallelVertexFilter { * @param[out] roundValues The list of rounded values * (the caller is responsible for allocating the memory) */ - static void removeRoundError(const std::vector& values, std::vector& roundValues) { + static void removeRoundError(const std::vector& values, + std::vector& roundValues) { assert(values.size() == roundValues.size()); #ifdef _OPENMP diff --git a/src/meshreader/FidapReader.h b/src/meshreader/FidapReader.h index 4fa99c8..cbc39cc 100644 --- a/src/meshreader/FidapReader.h +++ b/src/meshreader/FidapReader.h @@ -14,11 +14,11 @@ #define FIDAP_READER_H #include +#include #include #include #include #include -#include #include "utils/stringutils.h" diff --git a/src/meshreader/GMSHBuilder.h b/src/meshreader/GMSHBuilder.h index c4ce0f7..1224920 100644 --- a/src/meshreader/GMSHBuilder.h +++ b/src/meshreader/GMSHBuilder.h @@ -10,10 +10,18 @@ namespace puml { template struct GMSHSimplexType {}; -template <> struct GMSHSimplexType<0u> { static constexpr long type = 15; }; // 15 -> point -template <> struct GMSHSimplexType<1u> { static constexpr long type = 1; }; // 1 -> line -template <> struct GMSHSimplexType<2u> { static constexpr long type = 2; }; // 2 -> triangle -template <> struct GMSHSimplexType<3u> { static constexpr long type = 4; }; // 4 -> tetrahedron +template <> struct GMSHSimplexType<0u> { + static constexpr long type = 15; +}; // 15 -> point +template <> struct GMSHSimplexType<1u> { + static constexpr long type = 1; +}; // 1 -> line +template <> struct GMSHSimplexType<2u> { + static constexpr long type = 2; +}; // 2 -> triangle +template <> struct GMSHSimplexType<3u> { + static constexpr long type = 4; +}; // 4 -> tetrahedron template class GMSHBuilder : public tndm::GMSHMeshBuilder { public: diff --git a/src/meshreader/GambitReader.h b/src/meshreader/GambitReader.h index 057aa1e..8410087 100644 --- a/src/meshreader/GambitReader.h +++ b/src/meshreader/GambitReader.h @@ -302,7 +302,7 @@ class GambitReader : public MeshReader { m_mesh.seekg(m_vertices.seekPosition + start * m_vertices.lineSize + m_vertices.lineSize - 3 * COORDINATE_SIZE - 1); - std::vector buf(3* COORDINATE_SIZE); + std::vector buf(3 * COORDINATE_SIZE); for (std::size_t i = 0; i < count; i++) { m_mesh.read(buf.data(), 3 * COORDINATE_SIZE); @@ -444,8 +444,7 @@ class GambitReader : public MeshReader { if (section->variableLineLength) { 
buf.resize(0); - } - else { + } else { buf.resize(section->elementSize + section->elementTypeSize + section->faceSize); } } @@ -456,13 +455,15 @@ class GambitReader : public MeshReader { m_mesh >> elementType; m_mesh >> boundaries[i].face; } else { - m_mesh.read(buf.data(), section->elementSize + section->elementTypeSize + section->faceSize); + m_mesh.read(buf.data(), + section->elementSize + section->elementTypeSize + section->faceSize); std::istringstream ssE(std::string(buf.begin(), buf.begin() + section->elementSize)); ssE >> boundaries[i].element; - std::istringstream ssF( - std::string(buf.begin() + section->elementSize + section->elementTypeSize, buf.begin() + section->elementSize + section->elementTypeSize + section->faceSize)); + std::istringstream ssF(std::string( + buf.begin() + section->elementSize + section->elementTypeSize, + buf.begin() + section->elementSize + section->elementTypeSize + section->faceSize)); ssF >> boundaries[i].face; // Seek to next position diff --git a/src/meshreader/ParallelFidapReader.h b/src/meshreader/ParallelFidapReader.h index 23122bc..dea856a 100644 --- a/src/meshreader/ParallelFidapReader.h +++ b/src/meshreader/ParallelFidapReader.h @@ -161,7 +161,8 @@ class ParallelFidapReader : public ParallelMeshReader { // Face for another processor // Serials the struct to make sending easier int proc = v.vertices[1] / vertexChunk; - aggregator[proc - 1].insert(aggregator[proc - 1].end(), v.vertices.begin(), v.vertices.end()); + aggregator[proc - 1].insert(aggregator[proc - 1].end(), v.vertices.begin(), + v.vertices.end()); aggregator[proc - 1].push_back(faces[j].type); } } @@ -172,7 +173,8 @@ class ParallelFidapReader : public ParallelMeshReader { continue; sizes[j] = aggregator[j].size() / 4; // 3 vertices + face type - MPI_Isend(&sizes[j], 1, tndm::mpi_type_t(), j + 1, 0, m_comm, &requests[j * 2]); + MPI_Isend(&sizes[j], 1, tndm::mpi_type_t(), j + 1, 0, m_comm, + &requests[j * 2]); MPI_Isend(&aggregator[j][0], aggregator[j].size(), MPI_INT, j + 1, 0, m_comm, &requests[j * 2 + 1]); } @@ -200,10 +202,11 @@ class ParallelFidapReader : public ParallelMeshReader { // Finished break; - MPI_Recv(buf.data(), size * 4, tndm::mpi_type_t(), 0, 0, m_comm, MPI_STATUS_IGNORE); + MPI_Recv(buf.data(), size * 4, tndm::mpi_type_t(), 0, 0, m_comm, + MPI_STATUS_IGNORE); for (std::size_t i = 0; i < size * 4; i += 4) { - FaceVertex v(buf[i], buf[i+1], buf[i+2]); + FaceVertex v(buf[i], buf[i + 1], buf[i + 2]); facePos.push_back(m_faceMap.at(v)); faceType.push_back(buf[i + 3]); } diff --git a/src/meshreader/ParallelGambitReader.h b/src/meshreader/ParallelGambitReader.h index 06f2075..2bfc970 100644 --- a/src/meshreader/ParallelGambitReader.h +++ b/src/meshreader/ParallelGambitReader.h @@ -41,11 +41,11 @@ class ParallelGambitReader : public ParallelMeshReader { if (m_rank == 0) { std::size_t maxChunkSize = chunkSize; - std::vector map (maxChunkSize); + std::vector map(maxChunkSize); - std::vector> aggregator (m_nProcs - 1); - std::vector sizes (m_nProcs - 1); - std::vector requests ((m_nProcs - 1) * 2, MPI_REQUEST_NULL); + std::vector> aggregator(m_nProcs - 1); + std::vector sizes(m_nProcs - 1); + std::vector requests((m_nProcs - 1) * 2, MPI_REQUEST_NULL); for (int i = 0; i < m_nProcs; i++) { logInfo() << "Reading group information part" << (i + 1) << "of" << m_nProcs; @@ -83,7 +83,8 @@ class ParallelGambitReader : public ParallelMeshReader { } sizes[j] = aggregator[j].size() / 2; // element id + group number - MPI_Isend(&sizes[j], 1, tndm::mpi_type_t(), j + 1, 0, m_comm, 
&requests[j * 2]); + MPI_Isend(&sizes[j], 1, tndm::mpi_type_t(), j + 1, 0, m_comm, + &requests[j * 2]); MPI_Isend(&aggregator[j][0], aggregator[j].size(), MPI_INT, j + 1, 0, m_comm, &requests[j * 2 + 1]); } @@ -129,11 +130,11 @@ class ParallelGambitReader : public ParallelMeshReader { if (m_rank == 0) { std::size_t maxChunkSize = chunkSize; - std::vector faces (maxChunkSize); + std::vector faces(maxChunkSize); - std::vector> aggregator (m_nProcs - 1); - std::vector sizes (m_nProcs - 1); - std::vector requests ((m_nProcs - 1) * 2, MPI_REQUEST_NULL); + std::vector> aggregator(m_nProcs - 1); + std::vector sizes(m_nProcs - 1); + std::vector requests((m_nProcs - 1) * 2, MPI_REQUEST_NULL); std::size_t nChunks = (nBoundaries() + chunkSize - 1) / chunkSize; for (std::size_t i = 0; i < nChunks; i++) { @@ -171,7 +172,8 @@ class ParallelGambitReader : public ParallelMeshReader { } sizes[j] = aggregator[j].size() / 2; // element id + face type - MPI_Isend(&sizes[j], 1, tndm::mpi_type_t(), j + 1, 0, m_comm, &requests[j * 2]); + MPI_Isend(&sizes[j], 1, tndm::mpi_type_t(), j + 1, 0, m_comm, + &requests[j * 2]); MPI_Isend(&aggregator[j][0], aggregator[j].size(), MPI_INT, j + 1, 0, m_comm, &requests[j * 2 + 1]); } @@ -191,7 +193,7 @@ class ParallelGambitReader : public ParallelMeshReader { } // Allocate enough memory - std::vector buf (chunkSize * 2); + std::vector buf(chunkSize * 2); while (true) { std::size_t size; diff --git a/src/meshreader/ParallelMeshReader.h b/src/meshreader/ParallelMeshReader.h index 73069b9..e76429f 100644 --- a/src/meshreader/ParallelMeshReader.h +++ b/src/meshreader/ParallelMeshReader.h @@ -166,7 +166,8 @@ template class ParallelMeshReader { logInfo() << "Reading elements part" << (m_nProcs - 1) << "of" << m_nProcs; m_serialReader.readElements((m_nProcs - 1) * chunkSize, lastChunkSize, elements); MPI_Wait(&request, MPI_STATUS_IGNORE); - MPI_Isend(elements, lastChunkSize * 4, tndm::mpi_type_t(), m_nProcs - 1, 0, m_comm, &request); + MPI_Isend(elements, lastChunkSize * 4, tndm::mpi_type_t(), m_nProcs - 1, 0, + m_comm, &request); swap(elements, elements2); } @@ -178,7 +179,8 @@ template class ParallelMeshReader { if (m_rank == m_nProcs - 1) chunkSize = m_nElements - (m_nProcs - 1) * chunkSize; - MPI_Recv(elements, chunkSize * 4, tndm::mpi_type_t(), 0, 0, m_comm, MPI_STATUS_IGNORE); + MPI_Recv(elements, chunkSize * 4, tndm::mpi_type_t(), 0, 0, m_comm, + MPI_STATUS_IGNORE); } } diff --git a/src/pumgen.cpp b/src/pumgen.cpp index 0fd84e5..da506b5 100644 --- a/src/pumgen.cpp +++ b/src/pumgen.cpp @@ -18,10 +18,10 @@ #include #include #include +#include #include -#include #include -#include +#include #include @@ -57,7 +57,7 @@ template static TT _checkH5Err(TT&& status, const char* file, int #define checkH5Err(...) 
_checkH5Err(__VA_ARGS__, __FILE__, __LINE__) -static int ilog(std::size_t value, int expbase=1) { +static int ilog(std::size_t value, int expbase = 1) { int count = 0; while (value > 0) { value >>= expbase; @@ -66,8 +66,7 @@ static int ilog(std::size_t value, int expbase=1) { return count; } -template -static void iterate(apf::Mesh* mesh, int dim, F&& function) { +template static void iterate(apf::Mesh* mesh, int dim, F&& function) { apf::MeshIterator* it = mesh->begin(dim); while (apf::MeshEntity* element = mesh->iterate(it)) { std::invoke(function, element); @@ -94,10 +93,16 @@ int main(int argc, char* argv[]) { args.addOption("model", 0, "Dump/Load a specific model file", utils::Args::Required, false); args.addOption("vtk", 0, "Dump mesh to VTK files", utils::Args::Required, false); - const char* filters[] = { "none", "scaleoffset", "deflate1", "deflate2", "deflate3", "deflate4", "deflate5", "deflate6", "deflate7", "deflate8", "deflate9" }; - args.addOption("compactify-datatypes", 0, "Compress index and group data types to minimum byte size (no HDF5 filters)", utils::Args::Required, false); - args.addEnumOption("filter-enable", filters, 0, "Apply HDF5 filters (i.e. compression). Disabled by default.", false); - args.addOption("filter-chunksize", 0, "Chunksize for filters (default=4096).", utils::Args::Required, false); + const char* filters[] = {"none", "scaleoffset", "deflate1", "deflate2", + "deflate3", "deflate4", "deflate5", "deflate6", + "deflate7", "deflate8", "deflate9"}; + args.addOption("compactify-datatypes", 0, + "Compress index and group data types to minimum byte size (no HDF5 filters)", + utils::Args::Required, false); + args.addEnumOption("filter-enable", filters, 0, + "Apply HDF5 filters (i.e. compression). Disabled by default.", false); + args.addOption("filter-chunksize", 0, "Chunksize for filters (default=4096).", + utils::Args::Required, false); args.addOption("license", 'l', "License file (only used by SimModSuite)", utils::Args::Required, false); @@ -141,21 +146,21 @@ int main(int argc, char* argv[]) { bool reduceInts = args.isSet("compactify-datatypes"); int filterEnable = args.getArgument("filter-enable", 0); - std::size_t filterChunksize = args.getArgument("filter-chunksize", 4096); + hsize_t filterChunksize = args.getArgument("filter-chunksize", 4096); bool applyFilters = filterEnable > 0; if (reduceInts) { logInfo(rank) << "Using compact integer types."; } if (filterEnable == 0) { logInfo(rank) << "No filtering enabled (contiguous storage)"; - } - else { + } else { logInfo(rank) << "Using filtering. 
Chunksize:" << filterChunksize; if (filterEnable == 1) { logInfo(rank) << "Compression: scale-offset compression for integers (disabled for floats)"; - } - else if (filterEnable < 11) { - logInfo(rank) << "Compression: deflate, strength" << filterEnable - 1 << "(note: this requires HDF5 to be compiled with GZIP support; this applies to SeisSol as well)"; + } else if (filterEnable < 11) { + logInfo(rank) << "Compression: deflate, strength" << filterEnable - 1 + << "(note: this requires HDF5 to be compiled with GZIP support; this applies " + "to SeisSol as well)"; } } @@ -165,8 +170,7 @@ int main(int argc, char* argv[]) { } if (utils::StringUtils::endsWith(outputFile, ".h5")) { utils::StringUtils::replaceLast(xdmfFile, ".h5", ".xdmf"); - } - else { + } else { xdmfFile.append(".xdmf"); } @@ -267,7 +271,8 @@ int main(int argc, char* argv[]) { // Get local/global size std::size_t localSize[2] = {countOwnedLong(3), countOwnedLong(0)}; std::size_t globalSize[2] = {localSize[0], localSize[1]}; - MPI_Allreduce(MPI_IN_PLACE, globalSize, 2, tndm::mpi_type_t(), MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, globalSize, 2, tndm::mpi_type_t(), MPI_SUM, + MPI_COMM_WORLD); logInfo(rank) << "Mesh size:" << globalSize[0]; @@ -323,7 +328,8 @@ int main(int argc, char* argv[]) { std::size_t unsignedSize = (bits + 7) / 8; checkH5Err(connectType); checkH5Err(H5Tset_size(connectType, unsignedSize)); - checkH5Err(H5Tcommit(h5file, "/connectType", connectType, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT)); + checkH5Err( + H5Tcommit(h5file, "/connectType", connectType, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT)); } hid_t connectFilter = H5P_DEFAULT; @@ -333,15 +339,14 @@ int main(int argc, char* argv[]) { checkH5Err(H5Pset_chunk(connectFilter, 2, chunk)); if (filterEnable == 1) { checkH5Err(H5Pset_scaleoffset(connectFilter, H5Z_SO_INT, H5Z_SO_INT_MINBITS_DEFAULT)); - } - else if (filterEnable < 11) { + } else if (filterEnable < 11) { int deflateStrength = filterEnable - 1; checkH5Err(H5Pset_deflate(connectFilter, deflateStrength)); } } - hid_t h5connect = - H5Dcreate(h5file, "/connect", connectType, h5space, H5P_DEFAULT, connectFilter, H5P_DEFAULT); + hid_t h5connect = H5Dcreate(h5file, "/connect", connectType, h5space, H5P_DEFAULT, + connectFilter, H5P_DEFAULT); checkH5Err(h5connect); hsize_t start[2] = {offsets[0], 0}; @@ -367,7 +372,8 @@ int main(int argc, char* argv[]) { } mesh->end(it); - checkH5Err(H5Dwrite(h5connect, H5T_NATIVE_ULONG, h5memspace, h5space, h5dxlist, connect.data())); + checkH5Err( + H5Dwrite(h5connect, H5T_NATIVE_ULONG, h5memspace, h5space, h5dxlist, connect.data())); if (applyFilters) { checkH5Err(H5Pclose(connectFilter)); @@ -383,9 +389,9 @@ int main(int argc, char* argv[]) { // Vertices logInfo(rank) << "Writing vertices"; { - hsize_t sizes[2] = {0,0}; - hsize_t start[2] = {0,0}; - hsize_t count[2] = {0,0}; + hsize_t sizes[2] = {0, 0}; + hsize_t start[2] = {0, 0}; + hsize_t count[2] = {0, 0}; sizes[0] = globalSize[1]; sizes[1] = 3; hid_t h5space = H5Screate_simple(2, sizes, 0L); @@ -398,16 +404,16 @@ int main(int argc, char* argv[]) { checkH5Err(H5Pset_chunk(geometryFilter, 2, chunk)); if (filterEnable == 1) { // float compression disabled at the moment (would be lossy) - // checkH5Err(H5Pset_scaleoffset(geometryFilter, H5Z_SO_FLOAT_DSCALE, H5Z_SO_INT_MINBITS_DEFAULT)); - } - else if (filterEnable < 11) { + // checkH5Err(H5Pset_scaleoffset(geometryFilter, H5Z_SO_FLOAT_DSCALE, + // H5Z_SO_INT_MINBITS_DEFAULT)); + } else if (filterEnable < 11) { int deflateStrength = filterEnable - 1; 
checkH5Err(H5Pset_deflate(geometryFilter, deflateStrength)); } } hid_t h5geometry = H5Dcreate(h5file, "/geometry", H5T_IEEE_F64LE, h5space, H5P_DEFAULT, - geometryFilter, H5P_DEFAULT); + geometryFilter, H5P_DEFAULT); checkH5Err(h5geometry); start[0] = offsets[1]; @@ -441,7 +447,8 @@ int main(int argc, char* argv[]) { } mesh->end(it); - checkH5Err(H5Dwrite(h5geometry, H5T_NATIVE_DOUBLE, h5memspace, h5space, h5dxlist, geometry.data())); + checkH5Err( + H5Dwrite(h5geometry, H5T_NATIVE_DOUBLE, h5memspace, h5space, h5dxlist, geometry.data())); if (applyFilters && filterEnable > 1) { checkH5Err(H5Pclose(geometryFilter)); @@ -498,8 +505,7 @@ int main(int argc, char* argv[]) { checkH5Err(H5Pset_chunk(groupFilter, 1, chunk)); if (filterEnable == 1) { checkH5Err(H5Pset_scaleoffset(groupFilter, H5Z_SO_INT, H5Z_SO_INT_MINBITS_DEFAULT)); - } - else if (filterEnable < 11) { + } else if (filterEnable < 11) { int deflateStrength = filterEnable - 1; checkH5Err(H5Pset_deflate(groupFilter, deflateStrength)); } @@ -552,15 +558,14 @@ int main(int argc, char* argv[]) { checkH5Err(H5Pset_chunk(boundaryFilter, 1, chunk)); if (filterEnable == 1) { checkH5Err(H5Pset_scaleoffset(boundaryFilter, H5Z_SO_INT, H5Z_SO_INT_MINBITS_DEFAULT)); - } - else if (filterEnable < 11) { + } else if (filterEnable < 11) { int deflateStrength = filterEnable - 1; checkH5Err(H5Pset_deflate(boundaryFilter, deflateStrength)); } } - hid_t h5boundary = - H5Dcreate(h5file, "/boundary", H5T_STD_I32LE, h5space, H5P_DEFAULT, boundaryFilter, H5P_DEFAULT); + hid_t h5boundary = H5Dcreate(h5file, "/boundary", H5T_STD_I32LE, h5space, H5P_DEFAULT, + boundaryFilter, H5P_DEFAULT); checkH5Err(h5boundary); start[0] = offsets[0]; @@ -595,7 +600,8 @@ int main(int argc, char* argv[]) { } mesh->end(it); - checkH5Err(H5Dwrite(h5boundary, H5T_NATIVE_INT, h5memspace, h5space, h5dxlist, boundary.data())); + checkH5Err( + H5Dwrite(h5boundary, H5T_NATIVE_INT, h5memspace, h5space, h5dxlist, boundary.data())); if (boundaryFilter) { checkH5Err(H5Pclose(boundaryFilter)); @@ -629,7 +635,8 @@ int main(int argc, char* argv[]) { << std::endl // This should be UInt but for some reason this does not work with // binary data - << " " << basename << ":/connect" << std::endl << " " << std::endl @@ -640,7 +647,8 @@ int main(int argc, char* argv[]) { << ":/geometry" << std::endl << " " << std::endl << " " << std::endl - << " " << basename << ":/group" << std::endl << " " << std::endl From df43bd8df5c7d772fe24c317dd0303dd9e9138f8 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Wed, 18 Oct 2023 10:43:57 +0200 Subject: [PATCH 4/8] Chunked writing --- src/pumgen.cpp | 438 ++++++++++++++++++------------------------------- 1 file changed, 164 insertions(+), 274 deletions(-) diff --git a/src/pumgen.cpp b/src/pumgen.cpp index da506b5..5f3ef34 100644 --- a/src/pumgen.cpp +++ b/src/pumgen.cpp @@ -29,6 +29,7 @@ #include // #include #include +#include #include "third_party/MPITraits.h" #include "utils/args.h" @@ -74,6 +75,112 @@ template static void iterate(apf::Mesh* mesh, int dim, F&& function mesh->end(it); } +constexpr std::size_t NoSecondDim = 0; + +template +static void writeH5Data(F&& handler, hid_t h5file, const std::string& name, apf::Mesh* mesh, int meshdim, hid_t h5memtype, hid_t h5outtype, std::size_t chunk, std::size_t localSize, std::size_t globalSize, bool reduceInts, int filterEnable, std::size_t filterChunksize) { + constexpr std::size_t SecondSize = std::max(SecondDim, static_cast(1)); + constexpr std::size_t Dimensions = SecondDim == 0 ? 
1 : 2; + const std::size_t chunkSize = chunk / SecondSize / sizeof(T); + const std::size_t bufferSize = std::min(localSize, chunkSize); + std::vector data(SecondSize * bufferSize); + + std::size_t rounds = (localSize + chunk - 1) / chunk; + + MPI_Allreduce(MPI_IN_PLACE, &rounds, 1, tndm::mpi_type_t(), MPI_MAX, MPI_COMM_WORLD); + + hsize_t globalSizes[2] = {globalSize, SecondDim}; + hid_t h5space = H5Screate_simple(Dimensions, globalSizes, nullptr); + + hsize_t sizes[2] = {bufferSize, SecondDim}; + hid_t h5memspace = H5Screate_simple(Dimensions, sizes, nullptr); + + std::size_t offset = localSize; + + MPI_Scan(MPI_IN_PLACE, &offset, 1, tndm::mpi_type_t(), MPI_SUM, MPI_COMM_WORLD); + + offset -= localSize; + + hsize_t start[2] = {offset, 0}; + hsize_t count[2] = {bufferSize, SecondDim}; + + hid_t h5dxlist = H5Pcreate(H5P_DATASET_XFER); + checkH5Err(h5dxlist); + checkH5Err(H5Pset_dxpl_mpio(h5dxlist, H5FD_MPIO_COLLECTIVE)); + + hid_t h5type = h5outtype; + if (reduceInts && std::is_integral_v) { + h5type = H5Tcopy(h5outtype); + std::size_t bits = ilog(globalSize); + std::size_t unsignedSize = (bits + 7) / 8; + checkH5Err(h5type); + checkH5Err(H5Tset_size(h5type, unsignedSize)); + checkH5Err( + H5Tcommit(h5file, (std::string("/") + name + std::string("Type")).c_str(), h5type, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT)); + } + + hid_t h5filter = H5P_DEFAULT; + if (filterEnable > 0) { + h5filter = checkH5Err(H5Pcreate(H5P_DATASET_CREATE)); + hsize_t chunk[2] = {std::min(filterChunksize, bufferSize), SecondDim}; + checkH5Err(H5Pset_chunk(h5filter, 2, chunk)); + if (filterEnable == 1 && std::is_integral_v) { + checkH5Err(H5Pset_scaleoffset(h5filter, H5Z_SO_INT, H5Z_SO_INT_MINBITS_DEFAULT)); + } else if (filterEnable < 11) { + int deflateStrength = filterEnable - 1; + checkH5Err(H5Pset_deflate(h5filter, deflateStrength)); + } + } + + hid_t h5data = H5Dcreate(h5file, (std::string("/") + name).c_str(), h5type, h5space, H5P_DEFAULT, + h5filter, H5P_DEFAULT); + + std::size_t written = 0; + std::size_t index = 0; + + apf::MeshIterator* it = mesh->begin(meshdim); + + hsize_t nullstart[2] = {0,0}; + + for (std::size_t i = 0; i < rounds; ++i) { + index = 0; + start[0] = offset + written; + count[0] = std::min(localSize - written, bufferSize); + + while (apf::MeshEntity* element = mesh->iterate(it)) { + if (handler(element, data.begin() + index * SecondSize)) { + ++index; + } + + if (index >= count[0]) { + break; + } + } + + checkH5Err(H5Sselect_hyperslab(h5memspace, H5S_SELECT_SET, nullstart, nullptr, count, nullptr)); + + checkH5Err(H5Sselect_hyperslab(h5space, H5S_SELECT_SET, start, nullptr, count, nullptr)); + + checkH5Err( + H5Dwrite(h5data, h5memtype, h5memspace, h5space, h5dxlist, data.data())); + + written += count[0]; + } + + mesh->end(it); + + if (filterEnable > 0) { + checkH5Err(H5Pclose(h5filter)); + } + if (reduceInts) { + checkH5Err(H5Tclose(h5type)); + } + checkH5Err(H5Sclose(h5space)); + checkH5Err(H5Sclose(h5memspace)); + checkH5Err(H5Dclose(h5data)); + checkH5Err(H5Pclose(h5dxlist)); +} + int main(int argc, char* argv[]) { int rank = 0; int processes = 1; @@ -103,6 +210,8 @@ int main(int argc, char* argv[]) { "Apply HDF5 filters (i.e. compression). 
Disabled by default.", false); args.addOption("filter-chunksize", 0, "Chunksize for filters (default=4096).", utils::Args::Required, false); + args.addOption("chunksize", 0, "Chunksize for writing (default=1 GiB).", + utils::Args::Required, false); args.addOption("license", 'l', "License file (only used by SimModSuite)", utils::Args::Required, false); @@ -139,11 +248,14 @@ int main(int argc, char* argv[]) { // Compute default output filename outputFile = inputFile; size_t dotPos = outputFile.find_last_of('.'); - if (dotPos != std::string::npos) + if (dotPos != std::string::npos) { outputFile.erase(dotPos); + } outputFile.append(".puml.h5"); } + hsize_t chunksize = args.getArgument("filter-chunksize", static_cast(1)<<30); + bool reduceInts = args.isSet("compactify-datatypes"); int filterEnable = args.getArgument("filter-enable", 0); hsize_t filterChunksize = args.getArgument("filter-chunksize", 4096); @@ -226,6 +338,8 @@ int main(int argc, char* argv[]) { mesh = meshInput->getMesh(); + logInfo(rank) << "Parsed mesh successfully, writing output..."; + // Check mesh if (alignMdsMatches(mesh)) logWarning() << "Fixed misaligned matches"; @@ -309,309 +423,85 @@ int main(int argc, char* argv[]) { checkH5Err(h5file); checkH5Err(H5Pclose(h5falist)); - hid_t h5dxlist = H5Pcreate(H5P_DATASET_XFER); - checkH5Err(h5dxlist); - checkH5Err(H5Pset_dxpl_mpio(h5dxlist, H5FD_MPIO_COLLECTIVE)); - // Write cells std::size_t connectSize = 8; logInfo(rank) << "Writing cells"; - { - hsize_t sizes[2] = {globalSize[0], 4}; - hid_t h5space = H5Screate_simple(2, sizes, 0L); - checkH5Err(h5space); - - hid_t connectType = H5T_STD_U64LE; - if (reduceInts) { - connectType = H5Tcopy(H5T_STD_U64LE); - std::size_t bits = ilog(globalSize[0]); - std::size_t unsignedSize = (bits + 7) / 8; - checkH5Err(connectType); - checkH5Err(H5Tset_size(connectType, unsignedSize)); - checkH5Err( - H5Tcommit(h5file, "/connectType", connectType, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT)); - } + writeH5Data([&](auto& element, auto&& data) { + apf::NewArray vn; + apf::getElementNumbers(vertexNum, element, vn); - hid_t connectFilter = H5P_DEFAULT; - if (applyFilters) { - connectFilter = checkH5Err(H5Pcreate(H5P_DATASET_CREATE)); - hsize_t chunk[2] = {std::min(filterChunksize, sizes[0]), 4}; - checkH5Err(H5Pset_chunk(connectFilter, 2, chunk)); - if (filterEnable == 1) { - checkH5Err(H5Pset_scaleoffset(connectFilter, H5Z_SO_INT, H5Z_SO_INT_MINBITS_DEFAULT)); - } else if (filterEnable < 11) { - int deflateStrength = filterEnable - 1; - checkH5Err(H5Pset_deflate(connectFilter, deflateStrength)); - } + for (int i = 0; i < 4; i++) { + *(data + i) = vn[i]; } - - hid_t h5connect = H5Dcreate(h5file, "/connect", connectType, h5space, H5P_DEFAULT, - connectFilter, H5P_DEFAULT); - checkH5Err(h5connect); - - hsize_t start[2] = {offsets[0], 0}; - hsize_t count[2] = {localSize[0], 4}; - checkH5Err(H5Sselect_hyperslab(h5space, H5S_SELECT_SET, start, 0L, count, 0L)); - - sizes[0] = localSize[0]; - hid_t h5memspace = H5Screate_simple(2, sizes, 0L); - checkH5Err(h5memspace); - - std::vector connect(localSize[0] * 4); - it = mesh->begin(3); - std::size_t index = 0; - while (apf::MeshEntity* element = mesh->iterate(it)) { - apf::NewArray vn; - apf::getElementNumbers(vertexNum, element, vn); - - for (int i = 0; i < 4; i++) { - connect[index * 4 + i] = vn[i]; - } - - index++; - } - mesh->end(it); - - checkH5Err( - H5Dwrite(h5connect, H5T_NATIVE_ULONG, h5memspace, h5space, h5dxlist, connect.data())); - - if (applyFilters) { - checkH5Err(H5Pclose(connectFilter)); - } - if 
(reduceInts) { - checkH5Err(H5Tclose(connectType)); - } - checkH5Err(H5Sclose(h5space)); - checkH5Err(H5Sclose(h5memspace)); - checkH5Err(H5Dclose(h5connect)); - } + return true; + }, h5file, "connect", mesh, 3, H5T_NATIVE_ULONG, H5T_STD_U64LE, chunksize, localSize[0], globalSize[0], reduceInts, filterEnable, filterChunksize); // Vertices logInfo(rank) << "Writing vertices"; - { - hsize_t sizes[2] = {0, 0}; - hsize_t start[2] = {0, 0}; - hsize_t count[2] = {0, 0}; - sizes[0] = globalSize[1]; - sizes[1] = 3; - hid_t h5space = H5Screate_simple(2, sizes, 0L); - checkH5Err(h5space); - - hid_t geometryFilter = H5P_DEFAULT; - if (applyFilters && filterEnable > 1) { - geometryFilter = checkH5Err(H5Pcreate(H5P_DATASET_CREATE)); - hsize_t chunk[2] = {std::min(filterChunksize, sizes[0]), 3}; - checkH5Err(H5Pset_chunk(geometryFilter, 2, chunk)); - if (filterEnable == 1) { - // float compression disabled at the moment (would be lossy) - // checkH5Err(H5Pset_scaleoffset(geometryFilter, H5Z_SO_FLOAT_DSCALE, - // H5Z_SO_INT_MINBITS_DEFAULT)); - } else if (filterEnable < 11) { - int deflateStrength = filterEnable - 1; - checkH5Err(H5Pset_deflate(geometryFilter, deflateStrength)); - } + writeH5Data([&](auto& element, auto&& data) { + if (!sharing->isOwned(element)) { + return false; } - hid_t h5geometry = H5Dcreate(h5file, "/geometry", H5T_IEEE_F64LE, h5space, H5P_DEFAULT, - geometryFilter, H5P_DEFAULT); - checkH5Err(h5geometry); + /*long gid = apf::getNumber(vertexNum, apf::Node(element, 0)); - start[0] = offsets[1]; - count[0] = localSize[1]; - count[1] = 3; - checkH5Err(H5Sselect_hyperslab(h5space, H5S_SELECT_SET, start, 0L, count, 0L)); + if (gid != static_cast(offsets[1] + index)) { + logError() << "Global vertex numbering is incorrect"; + }*/ - sizes[0] = localSize[1]; - hid_t h5memspace = H5Screate_simple(2, sizes, 0L); - checkH5Err(h5memspace); + apf::Vector3 point; + mesh->getPoint(element, 0, point); + double geometry[3]; + point.toArray(geometry); - std::vector geometry(localSize[1] * 3); - it = mesh->begin(0); - std::size_t index = 0; - while (apf::MeshEntity* element = mesh->iterate(it)) { - if (!sharing->isOwned(element)) { - continue; - } - - long gid = apf::getNumber(vertexNum, apf::Node(element, 0)); - - if (gid != static_cast(offsets[1] + index)) { - logError() << "Global vertex numbering is incorrect"; - } - - apf::Vector3 point; - mesh->getPoint(element, 0, point); - point.toArray(&geometry[index * 3]); + *(data + 0) = geometry[0]; + *(data + 1) = geometry[1]; + *(data + 2) = geometry[2]; - index++; - } - mesh->end(it); - - checkH5Err( - H5Dwrite(h5geometry, H5T_NATIVE_DOUBLE, h5memspace, h5space, h5dxlist, geometry.data())); - - if (applyFilters && filterEnable > 1) { - checkH5Err(H5Pclose(geometryFilter)); - } - - checkH5Err(H5Sclose(h5space)); - checkH5Err(H5Sclose(h5memspace)); - checkH5Err(H5Dclose(h5geometry)); - } + return true; + }, h5file, "geometry", mesh, 0, H5T_IEEE_F64LE, H5T_IEEE_F64LE, chunksize, localSize[1], globalSize[1], reduceInts, filterEnable, filterChunksize); // Group information apf::MeshTag* groupTag = mesh->findTag("group"); + std::size_t groupSize = 4; if (groupTag) { logInfo(rank) << "Writing group information"; - - hsize_t sizes[1] = {0}; - hsize_t start[1] = {0}; - hsize_t count[1] = {0}; - - sizes[0] = globalSize[0]; - hid_t h5space = H5Screate_simple(1, sizes, 0L); - checkH5Err(h5space); - - std::vector group(localSize[0]); - it = mesh->begin(3); - std::size_t index = 0; - while (apf::MeshEntity* element = mesh->iterate(it)) { - 
assert(mesh->hasTag(element, groupTag)); - - mesh->getIntTag(element, groupTag, &group[index]); - index++; - } - mesh->end(it); - - hid_t groupType = H5T_STD_I32LE; - if (reduceInts) { - groupType = H5Tcopy(H5T_STD_I32LE); - uint32_t minvalue = std::max(-(*std::min_element(group.begin(), group.end()) + 1), 0); - uint32_t maxvalue = *std::max_element(group.begin(), group.end()); - uint32_t totalmax = std::max(minvalue, maxvalue); - MPI_Allreduce(MPI_IN_PLACE, &totalmax, 1, MPI_UNSIGNED, MPI_MAX, MPI_COMM_WORLD); - std::size_t bits = ilog(totalmax); - std::size_t signedSize = (bits + 7 + 1) / 8; - checkH5Err(groupType); - checkH5Err(H5Tset_size(groupType, signedSize)); - checkH5Err(H5Tcommit(h5file, "/groupType", groupType, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT)); - } - - hid_t groupFilter = H5P_DEFAULT; - if (applyFilters) { - groupFilter = checkH5Err(H5Pcreate(H5P_DATASET_CREATE)); - hsize_t chunk[1] = {std::min(filterChunksize, sizes[0])}; - checkH5Err(H5Pset_chunk(groupFilter, 1, chunk)); - if (filterEnable == 1) { - checkH5Err(H5Pset_scaleoffset(groupFilter, H5Z_SO_INT, H5Z_SO_INT_MINBITS_DEFAULT)); - } else if (filterEnable < 11) { - int deflateStrength = filterEnable - 1; - checkH5Err(H5Pset_deflate(groupFilter, deflateStrength)); - } - } - - hid_t h5group = - H5Dcreate(h5file, "/group", H5T_STD_I32LE, h5space, H5P_DEFAULT, groupFilter, H5P_DEFAULT); - checkH5Err(h5group); - - start[0] = offsets[0]; - count[0] = localSize[0]; - checkH5Err(H5Sselect_hyperslab(h5space, H5S_SELECT_SET, start, 0L, count, 0L)); - - sizes[0] = localSize[0]; - hid_t h5memspace = H5Screate_simple(1, sizes, 0L); - checkH5Err(h5memspace); - - checkH5Err(H5Dwrite(h5group, H5T_NATIVE_INT, h5memspace, h5space, h5dxlist, group.data())); - - if (applyFilters) { - checkH5Err(H5Pclose(groupFilter)); - } - if (reduceInts) { - checkH5Err(H5Tclose(groupType)); - } - checkH5Err(H5Sclose(h5space)); - checkH5Err(H5Sclose(h5memspace)); - checkH5Err(H5Dclose(h5group)); + writeH5Data([&](auto& element, auto&& data) { + int group; + mesh->getIntTag(element, groupTag, &group); + *data = group; + return true; + }, h5file, "group", mesh, 3, H5T_NATIVE_INT, H5T_STD_I32LE, chunksize, localSize[0], globalSize[0], reduceInts, filterEnable, filterChunksize); } else { logInfo() << "No group information found in mesh"; } // Write boundary condition logInfo(rank) << "Writing boundary condition"; - { - apf::MeshTag* boundaryTag = mesh->findTag("boundary condition"); - assert(boundaryTag); - - hsize_t sizes[1] = {0}; - hsize_t start[1] = {0}; - hsize_t count[1] = {0}; - sizes[0] = globalSize[0]; - hid_t h5space = H5Screate_simple(1, sizes, 0L); - checkH5Err(h5space); - - hid_t boundaryFilter = H5P_DEFAULT; - if (applyFilters) { - boundaryFilter = checkH5Err(H5Pcreate(H5P_DATASET_CREATE)); - hsize_t chunk[1] = {std::min(filterChunksize, sizes[0])}; - checkH5Err(H5Pset_chunk(boundaryFilter, 1, chunk)); - if (filterEnable == 1) { - checkH5Err(H5Pset_scaleoffset(boundaryFilter, H5Z_SO_INT, H5Z_SO_INT_MINBITS_DEFAULT)); - } else if (filterEnable < 11) { - int deflateStrength = filterEnable - 1; - checkH5Err(H5Pset_deflate(boundaryFilter, deflateStrength)); - } - } - - hid_t h5boundary = H5Dcreate(h5file, "/boundary", H5T_STD_I32LE, h5space, H5P_DEFAULT, - boundaryFilter, H5P_DEFAULT); - checkH5Err(h5boundary); - - start[0] = offsets[0]; - count[0] = localSize[0]; - checkH5Err(H5Sselect_hyperslab(h5space, H5S_SELECT_SET, start, 0L, count, 0L)); - - sizes[0] = localSize[0]; - hid_t h5memspace = H5Screate_simple(1, sizes, 0L); - 
checkH5Err(h5memspace); - - std::vector boundary(localSize[0]); - - it = mesh->begin(3); - std::size_t index = 0; - while (apf::MeshEntity* element = mesh->iterate(it)) { - apf::Downward faces; - mesh->getDownward(element, 2, faces); - - for (int i = 0; i < 4; i++) { - if (mesh->hasTag(faces[i], boundaryTag)) { - int b; - mesh->getIntTag(faces[i], boundaryTag, &b); - - if (b <= 0 || b > std::numeric_limits::max()) - logError() << "Cannot handle boundary condition" << b; - - boundary[index] += b << (i * 8); + apf::MeshTag* boundaryTag = mesh->findTag("boundary condition"); + assert(boundaryTag); + writeH5Data([&](auto& element, auto&& data) { + int boundary = 0; + apf::Downward faces; + mesh->getDownward(element, 2, faces); + + for (int i = 0; i < 4; i++) { + if (mesh->hasTag(faces[i], boundaryTag)) { + int b; + mesh->getIntTag(faces[i], boundaryTag, &b); + + if (b <= 0 || b > std::numeric_limits::max()) { + logError() << "Cannot handle boundary condition" << b; } - } - index++; - } - mesh->end(it); - - checkH5Err( - H5Dwrite(h5boundary, H5T_NATIVE_INT, h5memspace, h5space, h5dxlist, boundary.data())); - - if (boundaryFilter) { - checkH5Err(H5Pclose(boundaryFilter)); + boundary |= b << (i * 8); + } } - checkH5Err(H5Sclose(h5space)); - checkH5Err(H5Sclose(h5memspace)); - checkH5Err(H5Dclose(h5boundary)); - } - checkH5Err(H5Pclose(h5dxlist)); + *data = boundary; + return true; + }, h5file, "boundary", mesh, 3, H5T_NATIVE_INT, H5T_STD_I32LE, chunksize, localSize[0], globalSize[0], reduceInts, filterEnable, filterChunksize); // Writing XDMF file if (rank == 0) { From 2067a615712ac2674c8f976afced5befa22a3c0b Mon Sep 17 00:00:00 2001 From: David Schneller Date: Wed, 18 Oct 2023 17:44:46 +0200 Subject: [PATCH 5/8] Formatting and 64-bit boundary support --- src/pumgen.cpp | 162 ++++++++++++++++++++++++++++++------------------- 1 file changed, 99 insertions(+), 63 deletions(-) diff --git a/src/pumgen.cpp b/src/pumgen.cpp index 5f3ef34..8fee46c 100644 --- a/src/pumgen.cpp +++ b/src/pumgen.cpp @@ -78,7 +78,10 @@ template static void iterate(apf::Mesh* mesh, int dim, F&& function constexpr std::size_t NoSecondDim = 0; template -static void writeH5Data(F&& handler, hid_t h5file, const std::string& name, apf::Mesh* mesh, int meshdim, hid_t h5memtype, hid_t h5outtype, std::size_t chunk, std::size_t localSize, std::size_t globalSize, bool reduceInts, int filterEnable, std::size_t filterChunksize) { +static void writeH5Data(F&& handler, hid_t h5file, const std::string& name, apf::Mesh* mesh, + int meshdim, hid_t h5memtype, hid_t h5outtype, std::size_t chunk, + std::size_t localSize, std::size_t globalSize, bool reduceInts, + int filterEnable, std::size_t filterChunksize) { constexpr std::size_t SecondSize = std::max(SecondDim, static_cast(1)); constexpr std::size_t Dimensions = SecondDim == 0 ? 
1 : 2; const std::size_t chunkSize = chunk / SecondSize / sizeof(T); @@ -87,7 +90,8 @@ static void writeH5Data(F&& handler, hid_t h5file, const std::string& name, apf: std::size_t rounds = (localSize + chunk - 1) / chunk; - MPI_Allreduce(MPI_IN_PLACE, &rounds, 1, tndm::mpi_type_t<std::size_t>(), MPI_MAX, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, &rounds, 1, tndm::mpi_type_t<std::size_t>(), MPI_MAX, + MPI_COMM_WORLD); hsize_t globalSizes[2] = {globalSize, SecondDim}; hid_t h5space = H5Screate_simple(Dimensions, globalSizes, nullptr); @@ -115,8 +119,8 @@ static void writeH5Data(F&& handler, hid_t h5file, const std::string& name, apf: std::size_t unsignedSize = (bits + 7) / 8; checkH5Err(h5type); checkH5Err(H5Tset_size(h5type, unsignedSize)); - checkH5Err( - H5Tcommit(h5file, (std::string("/") + name + std::string("Type")).c_str(), h5type, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT)); + checkH5Err(H5Tcommit(h5file, (std::string("/") + name + std::string("Type")).c_str(), h5type, + H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT)); } hid_t h5filter = H5P_DEFAULT; @@ -133,14 +137,14 @@ static void writeH5Data(F&& handler, hid_t h5file, const std::string& name, apf: } hid_t h5data = H5Dcreate(h5file, (std::string("/") + name).c_str(), h5type, h5space, H5P_DEFAULT, - h5filter, H5P_DEFAULT); + h5filter, H5P_DEFAULT); std::size_t written = 0; std::size_t index = 0; apf::MeshIterator* it = mesh->begin(meshdim); - hsize_t nullstart[2] = {0,0}; + hsize_t nullstart[2] = {0, 0}; for (std::size_t i = 0; i < rounds; ++i) { index = 0; @@ -161,12 +165,11 @@ static void writeH5Data(F&& handler, hid_t h5file, const std::string& name, apf: checkH5Err(H5Sselect_hyperslab(h5space, H5S_SELECT_SET, start, nullptr, count, nullptr)); - checkH5Err( - H5Dwrite(h5data, h5memtype, h5memspace, h5space, h5dxlist, data.data())); - + checkH5Err(H5Dwrite(h5data, h5memtype, h5memspace, h5space, h5dxlist, data.data())); + written += count[0]; } - + mesh->end(it); if (filterEnable > 0) { @@ -210,8 +213,11 @@ int main(int argc, char* argv[]) { "Apply HDF5 filters (i.e. compression). 
Disabled by default.", false); args.addOption("filter-chunksize", 0, "Chunksize for filters (default=4096).", utils::Args::Required, false); - args.addOption("chunksize", 0, "Chunksize for writing (default=1 GiB).", - utils::Args::Required, false); + args.addOption("chunksize", 0, "Chunksize for writing (default=1 GiB).", utils::Args::Required, + false); + const char* boundarytypes[] = {"int32", "int64"}; + args.addEnumOption("boundarytype", boundarytypes, 0, + "Type for writing out boundary data (default: int32).", false); args.addOption("license", 'l', "License file (only used by SimModSuite)", utils::Args::Required, false); @@ -254,7 +260,7 @@ int main(int argc, char* argv[]) { outputFile.append(".puml.h5"); } - hsize_t chunksize = args.getArgument("filter-chunksize", static_cast(1)<<30); + hsize_t chunksize = args.getArgument("filter-chunksize", static_cast(1) << 30); bool reduceInts = args.isSet("compactify-datatypes"); int filterEnable = args.getArgument("filter-enable", 0); @@ -276,6 +282,19 @@ int main(int argc, char* argv[]) { } } + int boundaryType = args.getArgument("boundarytype", 0); + hid_t boundaryDatatype; + int faceOffset; + if (boundaryType == 0) { + boundaryDatatype = H5T_STD_I32LE; + faceOffset = 8; + logInfo(rank) << "Using 32-bit integer boundary type conditions."; + } else if (boundaryType == 1) { + boundaryDatatype = H5T_STD_I64LE; + faceOffset = 16; + logInfo(rank) << "Using 64-bit integer boundary type conditions."; + } + std::string xdmfFile = outputFile; if (utils::StringUtils::endsWith(outputFile, ".puml.h5")) { utils::StringUtils::replaceLast(xdmfFile, ".puml.h5", ".xdmf"); @@ -426,40 +445,46 @@ int main(int argc, char* argv[]) { // Write cells std::size_t connectSize = 8; logInfo(rank) << "Writing cells"; - writeH5Data([&](auto& element, auto&& data) { - apf::NewArray vn; - apf::getElementNumbers(vertexNum, element, vn); + writeH5Data( + [&](auto& element, auto&& data) { + apf::NewArray vn; + apf::getElementNumbers(vertexNum, element, vn); - for (int i = 0; i < 4; i++) { - *(data + i) = vn[i]; - } - return true; - }, h5file, "connect", mesh, 3, H5T_NATIVE_ULONG, H5T_STD_U64LE, chunksize, localSize[0], globalSize[0], reduceInts, filterEnable, filterChunksize); + for (int i = 0; i < 4; i++) { + *(data + i) = vn[i]; + } + return true; + }, + h5file, "connect", mesh, 3, H5T_NATIVE_ULONG, H5T_STD_U64LE, chunksize, localSize[0], + globalSize[0], reduceInts, filterEnable, filterChunksize); // Vertices logInfo(rank) << "Writing vertices"; - writeH5Data([&](auto& element, auto&& data) { - if (!sharing->isOwned(element)) { - return false; - } + writeH5Data( + [&](auto& element, auto&& data) { + if (!sharing->isOwned(element)) { + return false; + } - /*long gid = apf::getNumber(vertexNum, apf::Node(element, 0)); + /*long gid = apf::getNumber(vertexNum, apf::Node(element, 0)); - if (gid != static_cast(offsets[1] + index)) { - logError() << "Global vertex numbering is incorrect"; - }*/ + if (gid != static_cast(offsets[1] + index)) { + logError() << "Global vertex numbering is incorrect"; + }*/ - apf::Vector3 point; - mesh->getPoint(element, 0, point); - double geometry[3]; - point.toArray(geometry); + apf::Vector3 point; + mesh->getPoint(element, 0, point); + double geometry[3]; + point.toArray(geometry); - *(data + 0) = geometry[0]; - *(data + 1) = geometry[1]; - *(data + 2) = geometry[2]; + *(data + 0) = geometry[0]; + *(data + 1) = geometry[1]; + *(data + 2) = geometry[2]; - return true; - }, h5file, "geometry", mesh, 0, H5T_IEEE_F64LE, H5T_IEEE_F64LE, chunksize, 
localSize[1], globalSize[1], reduceInts, filterEnable, filterChunksize); + return true; + }, + h5file, "geometry", mesh, 0, H5T_IEEE_F64LE, H5T_IEEE_F64LE, chunksize, localSize[1], + globalSize[1], reduceInts, filterEnable, filterChunksize); // Group information apf::MeshTag* groupTag = mesh->findTag("group"); @@ -467,12 +492,15 @@ int main(int argc, char* argv[]) { std::size_t groupSize = 4; if (groupTag) { logInfo(rank) << "Writing group information"; - writeH5Data([&](auto& element, auto&& data) { - int group; - mesh->getIntTag(element, groupTag, &group); - *data = group; - return true; - }, h5file, "group", mesh, 3, H5T_NATIVE_INT, H5T_STD_I32LE, chunksize, localSize[0], globalSize[0], reduceInts, filterEnable, filterChunksize); + writeH5Data( + [&](auto& element, auto&& data) { + int group; + mesh->getIntTag(element, groupTag, &group); + *data = group; + return true; + }, + h5file, "group", mesh, 3, H5T_NATIVE_INT, H5T_STD_I32LE, chunksize, localSize[0], + globalSize[0], reduceInts, filterEnable, filterChunksize); } else { logInfo() << "No group information found in mesh"; } @@ -481,27 +509,35 @@ int main(int argc, char* argv[]) { logInfo(rank) << "Writing boundary condition"; apf::MeshTag* boundaryTag = mesh->findTag("boundary condition"); assert(boundaryTag); - writeH5Data([&](auto& element, auto&& data) { - int boundary = 0; - apf::Downward faces; - mesh->getDownward(element, 2, faces); - - for (int i = 0; i < 4; i++) { - if (mesh->hasTag(faces[i], boundaryTag)) { - int b; - mesh->getIntTag(faces[i], boundaryTag, &b); - - if (b <= 0 || b > std::numeric_limits::max()) { - logError() << "Cannot handle boundary condition" << b; + auto i32limit = std::numeric_limits::max(); + auto i64limit = std::numeric_limits::max(); + writeH5Data( + [&](auto& element, auto&& data) { + long boundary = 0; + apf::Downward faces; + mesh->getDownward(element, 2, faces); + + for (int i = 0; i < 4; i++) { + if (mesh->hasTag(faces[i], boundaryTag)) { + int b; + mesh->getIntTag(faces[i], boundaryTag, &b); + + if (b <= 0 || (b > i32limit && boundaryType == 0) || + (b > i64limit && boundaryType == 1)) { + logError() << "Cannot handle boundary condition" << b; + } + + // NOTE: this fails for boundary values per face larger than 2**31 - 1 due to sign + // extension + boundary |= static_cast(b) << (i * faceOffset); + } } - boundary |= b << (i * 8); - } - } - - *data = boundary; - return true; - }, h5file, "boundary", mesh, 3, H5T_NATIVE_INT, H5T_STD_I32LE, chunksize, localSize[0], globalSize[0], reduceInts, filterEnable, filterChunksize); + *data = boundary; + return true; + }, + h5file, "boundary", mesh, 3, H5T_NATIVE_LONG, boundaryDatatype, chunksize, localSize[0], + globalSize[0], reduceInts, filterEnable, filterChunksize); // Writing XDMF file if (rank == 0) { From e3dac916facbcb422e8af368f24fa579ccb9d07a Mon Sep 17 00:00:00 2001 From: David Schneller Date: Wed, 18 Oct 2023 22:24:57 +0200 Subject: [PATCH 6/8] Fix Xdmf file --- src/pumgen.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pumgen.cpp b/src/pumgen.cpp index 8fee46c..45dd8ba 100644 --- a/src/pumgen.cpp +++ b/src/pumgen.cpp @@ -579,9 +579,9 @@ int main(int argc, char* argv[]) { << globalSize[0] << "\">" << basename << ":/group" << std::endl << " " << std::endl << " " << std::endl - << " " << basename << ":/boundary" << std::endl + << " " << basename + << ":/boundary" << std::endl << " " << std::endl << " " << std::endl << " " << std::endl From 57b86612c401ac8abf8d4997f711c5b8400e70c0 Mon Sep 17 00:00:00 2001 From: 
David Schneller Date: Tue, 26 Dec 2023 21:13:22 +0100 Subject: [PATCH 7/8] Make clang-format happy --- src/meshreader/GMSHBuilder.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/meshreader/GMSHBuilder.h b/src/meshreader/GMSHBuilder.h index 1224920..3c547ac 100644 --- a/src/meshreader/GMSHBuilder.h +++ b/src/meshreader/GMSHBuilder.h @@ -10,6 +10,9 @@ namespace puml { template struct GMSHSimplexType {}; + +// clang format seems to mess up the following lines from time to time, hence we disable it +// clang-format off template <> struct GMSHSimplexType<0u> { static constexpr long type = 15; }; // 15 -> point @@ -22,6 +25,7 @@ template <> struct GMSHSimplexType<2u> { template <> struct GMSHSimplexType<3u> { static constexpr long type = 4; }; // 4 -> tetrahedron +// clang-format on template class GMSHBuilder : public tndm::GMSHMeshBuilder { public: From c3d722202147d25003983bf3bd7e629afcb25510 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Wed, 27 Dec 2023 20:09:05 +0100 Subject: [PATCH 8/8] Rename fields --- src/pumgen.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pumgen.cpp b/src/pumgen.cpp index 45dd8ba..b16fec5 100644 --- a/src/pumgen.cpp +++ b/src/pumgen.cpp @@ -443,7 +443,7 @@ int main(int argc, char* argv[]) { checkH5Err(H5Pclose(h5falist)); // Write cells - std::size_t connectSize = 8; + std::size_t connectBytesPerData = 8; logInfo(rank) << "Writing cells"; writeH5Data( [&](auto& element, auto&& data) { @@ -489,7 +489,7 @@ int main(int argc, char* argv[]) { // Group information apf::MeshTag* groupTag = mesh->findTag("group"); - std::size_t groupSize = 4; + std::size_t groupBytesPerData = 4; if (groupTag) { logInfo(rank) << "Writing group information"; writeH5Data( @@ -561,7 +561,7 @@ int main(int argc, char* argv[]) { << std::endl // This should be UInt but for some reason this does not work with // binary data - << " " << basename << ":/connect" << std::endl @@ -573,7 +573,7 @@ int main(int argc, char* argv[]) { << ":/geometry" << std::endl << " " << std::endl << " " << std::endl - << " " << basename << ":/group" << std::endl
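For reference, the boundary handling introduced in PATCH 5/8 packs the four per-face boundary conditions of each tetrahedron into a single integer: 8 bits per face for the default int32 boundary type and 16 bits per face when the int64 boundary type is selected. The following standalone sketch is not taken from the patches; names and values are illustrative, and it only shows the packing for the 64-bit case.

#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical boundary conditions on the four faces of one tetrahedron.
  const int faceConditions[4] = {1, 3, 5, 0};
  // Bits per face: 8 for the int32 layout, 16 for the int64 layout (as in PATCH 5/8).
  const int faceOffset = 16;

  std::int64_t boundary = 0;
  for (int i = 0; i < 4; i++) {
    // Cast to 64 bits before shifting so the packed value cannot overflow an int.
    boundary |= static_cast<std::int64_t>(faceConditions[i]) << (i * faceOffset);
  }

  // Prints 500030001: face 0 occupies bits 0-15, face 1 bits 16-31, face 2 bits 32-47.
  std::cout << std::hex << boundary << std::endl;
  return 0;
}

The wider 64-bit layout exists because the 8-bit per-face fields of the 32-bit layout can only hold small boundary condition values; doubling the field width allows larger per-face values at the cost of a larger boundary dataset.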