From 999a913041a1434a324dd052bab1fa5e6f15bbf5 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Fri, 22 Nov 2024 18:11:21 -0500 Subject: [PATCH 01/16] Add code to build and write the tree from the data scanners --- .../checksum/ContainerMerkleTree.java | 19 +++++++++------ .../keyvalue/KeyValueContainerCheck.java | 24 +++++++++++++------ .../container/keyvalue/KeyValueHandler.java | 5 ++-- .../BackgroundContainerDataScanner.java | 7 +++--- .../OnDemandContainerDataScanner.java | 23 ++++++++++-------- .../container/ozoneimpl/OzoneContainer.java | 2 +- .../ContainerMerkleTreeTestUtils.java | 5 ++-- .../checksum/TestContainerMerkleTree.java | 18 +++++++------- 8 files changed, 60 insertions(+), 43 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTree.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTree.java index 7dba5b4309c..56578dc9f1e 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTree.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTree.java @@ -56,10 +56,12 @@ public ContainerMerkleTree() { * If the block entry already exists, the chunks will be added to the existing chunks for that block. * * @param blockID The ID of the block that these chunks belong to. + * @param healthy True if there were no errors detected with these chunks. False indicates that all the chunks + * being added had errors. * @param chunks A list of chunks to add to this block. The chunks will be sorted internally by their offset. */ - public void addChunks(long blockID, Collection chunks) { - id2Block.computeIfAbsent(blockID, BlockMerkleTree::new).addChunks(chunks); + public void addChunks(long blockID, boolean healthy, ContainerProtos.ChunkInfo... 
chunks) { + id2Block.computeIfAbsent(blockID, BlockMerkleTree::new).addChunks(healthy, chunks); } /** @@ -106,11 +108,13 @@ private static class BlockMerkleTree { * Adds the specified chunks to this block. The offset value of the chunk must be unique within the block, * otherwise it will overwrite the previous value at that offset. * + * @param healthy True if there were no errors detected with these chunks. False indicates that all the chunks + * being added had errors. * @param chunks A list of chunks to add to this block. */ - public void addChunks(Collection chunks) { + public void addChunks(boolean healthy, ContainerProtos.ChunkInfo... chunks) { for (ContainerProtos.ChunkInfo chunk: chunks) { - offset2Chunk.put(chunk.getOffset(), new ChunkMerkleTree(chunk)); + offset2Chunk.put(chunk.getOffset(), new ChunkMerkleTree(chunk, healthy)); } } @@ -151,11 +155,12 @@ public ContainerProtos.BlockMerkleTree toProto() { * This class computes one checksum for the whole chunk by aggregating these. 
*/ private static class ChunkMerkleTree { - private ContainerProtos.ChunkInfo chunk; - private boolean isHealthy = true; + private final ContainerProtos.ChunkInfo chunk; + private final boolean isHealthy; - ChunkMerkleTree(ContainerProtos.ChunkInfo chunk) { + ChunkMerkleTree(ContainerProtos.ChunkInfo chunk, boolean healthy) { this.chunk = chunk; + this.isHealthy = healthy; } /** diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java index 080e4611dd9..5a9f1123ada 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java @@ -192,7 +192,6 @@ public DataScanResult fullCheck(DataTransferThrottler throttler, Canceler cancel LOG.debug("Running data checks for container {}", containerID); try { - // TODO HDDS-10374 this tree will get updated with the container's contents as it is scanned. ContainerMerkleTree dataTree = new ContainerMerkleTree(); List dataErrors = scanData(dataTree, throttler, canceler); if (containerIsDeleted()) { @@ -422,6 +421,11 @@ private static List verifyChecksum(BlockData block, List scanErrors = new ArrayList<>(); + // Information used to populate the merkle tree. 
+ ContainerProtos.ChunkInfo.Builder observedChunkBuilder = chunk.toBuilder(); + ContainerProtos.ChecksumData.Builder observedChecksums = chunk.getChecksumData().toBuilder(); + boolean chunkHealthy = true; + ChecksumData checksumData = ChecksumData.getFromProtoBuf(chunk.getChecksumData()); int checksumCount = checksumData.getChecksums().size(); @@ -434,10 +438,7 @@ private static List verifyChecksum(BlockData block, if (layout == ContainerLayoutVersion.FILE_PER_BLOCK) { channel.position(chunk.getOffset()); } - // Only report one error per chunk. Reporting corruption at every "bytes per checksum" interval will lead to a - // large amount of errors when a full chunk is corrupted. - boolean chunkHealthy = true; - for (int i = 0; i < checksumCount && chunkHealthy; i++) { + for (int i = 0; i < checksumCount; i++) { // limit last read for FILE_PER_BLOCK, to avoid reading next chunk if (layout == ContainerLayoutVersion.FILE_PER_BLOCK && i == checksumCount - 1 && @@ -457,7 +458,11 @@ private static List verifyChecksum(BlockData block, ByteString expected = checksumData.getChecksums().get(i); ByteString actual = cal.computeChecksum(buffer) .getChecksums().get(0); - if (!expected.equals(actual)) { + observedChecksums.addChecksums(actual); + // Only report one error per chunk. Reporting corruption at every "bytes per checksum" interval will lead to a + // large amount of errors when a full chunk is corrupted. + // Continue scanning the chunk even after the first error so the full merkle tree can be built. 
+ if (chunkHealthy && !expected.equals(actual)) { String message = String .format("Inconsistent read for chunk=%s" + " checksum item %d" + @@ -469,26 +474,31 @@ private static List verifyChecksum(BlockData block, StringUtils.bytes2Hex(expected.asReadOnlyByteBuffer()), StringUtils.bytes2Hex(actual.asReadOnlyByteBuffer()), block.getBlockID()); + chunkHealthy = false; scanErrors.add(new ContainerScanError(FailureType.CORRUPT_CHUNK, chunkFile, new OzoneChecksumException(message))); - chunkHealthy = false; } } // If all the checksums match, also check that the length stored in the metadata matches the number of bytes // seen on the disk. + observedChunkBuilder.setLen(bytesRead); if (chunkHealthy && bytesRead != chunk.getLen()) { String message = String .format("Inconsistent read for chunk=%s expected length=%d" + " actual length=%d for block %s", chunk.getChunkName(), chunk.getLen(), bytesRead, block.getBlockID()); + chunkHealthy = false; scanErrors.add(new ContainerScanError(FailureType.INCONSISTENT_CHUNK_LENGTH, chunkFile, new IOException(message))); } } catch (IOException ex) { + chunkHealthy = false; scanErrors.add(new ContainerScanError(FailureType.MISSING_CHUNK_FILE, chunkFile, ex)); } + observedChunkBuilder.setChecksumData(observedChecksums); + currentTree.addChunks(block.getBlockID().getLocalID(), chunkHealthy, observedChunkBuilder.build()); return scanErrors; } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java index d587748e6f8..4babdd144f8 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java @@ -560,8 +560,9 @@ private void createContainerMerkleTree(Container container) { 
getBlockIterator(containerData.getContainerID())) { while (blockIterator.hasNext()) { BlockData blockData = blockIterator.nextBlock(); - List chunkInfos = blockData.getChunks(); - merkleTree.addChunks(blockData.getLocalID(), chunkInfos); + // All chunks are assumed to be healthy until the scanner inspects them to determine otherwise. + merkleTree.addChunks(blockData.getLocalID(), true, + blockData.getChunks().toArray(new ContainerProtos.ChunkInfo[0])); } } checksumManager.writeContainerDataTree(containerData, merkleTree); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerDataScanner.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerDataScanner.java index 1a4f0bf6460..bf4d88626d8 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerDataScanner.java @@ -103,17 +103,16 @@ public void scanContainer(Container c) metrics.incNumUnHealthyContainers(); } } - checksumManager.writeContainerDataTree(containerData, result.getDataTree()); metrics.incNumContainersScanned(); + checksumManager.writeContainerDataTree(containerData, result.getDataTree()); } - // Even if the container was deleted, mark the scan as completed since we already logged it as starting. Instant now = Instant.now(); - logScanCompleted(containerData, now); - if (!result.isDeleted()) { controller.updateDataScanTimestamp(containerId, now); } + // Even if the container was deleted, mark the scan as completed since we already logged it as starting. 
+ logScanCompleted(containerData, now); } @Override diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OnDemandContainerDataScanner.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OnDemandContainerDataScanner.java index eb0f3eedb03..427104d9a73 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OnDemandContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OnDemandContainerDataScanner.java @@ -20,10 +20,10 @@ import com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.hdfs.util.Canceler; import org.apache.hadoop.hdfs.util.DataTransferThrottler; +import org.apache.hadoop.ozone.container.checksum.ContainerChecksumTreeManager; import org.apache.hadoop.ozone.container.common.helpers.ContainerUtils; import org.apache.hadoop.ozone.container.common.impl.ContainerData; import org.apache.hadoop.ozone.container.common.interfaces.Container; -import org.apache.hadoop.ozone.container.common.interfaces.ScanResult; import org.apache.hadoop.ozone.container.common.volume.HddsVolume; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -56,9 +56,11 @@ public final class OnDemandContainerDataScanner { .KeySetView containerRescheduleCheckSet; private final OnDemandScannerMetrics metrics; private final long minScanGap; + private final ContainerChecksumTreeManager checksumManager; private OnDemandContainerDataScanner( - ContainerScannerConfiguration conf, ContainerController controller) { + ContainerScannerConfiguration conf, ContainerController controller, + ContainerChecksumTreeManager checksumManager) { containerController = controller; throttler = new DataTransferThrottler( conf.getOnDemandBandwidthPerVolume()); @@ -67,16 +69,17 @@ private OnDemandContainerDataScanner( scanExecutor = Executors.newSingleThreadExecutor(); containerRescheduleCheckSet 
= ConcurrentHashMap.newKeySet(); minScanGap = conf.getContainerScanMinGap(); + this.checksumManager = checksumManager; } - public static synchronized void init( - ContainerScannerConfiguration conf, ContainerController controller) { + public static synchronized void init(ContainerScannerConfiguration conf, ContainerController controller, + ContainerChecksumTreeManager checksumManager) { if (instance != null) { LOG.warn("Trying to initialize on demand scanner" + " a second time on a datanode."); return; } - instance = new OnDemandContainerDataScanner(conf, controller); + instance = new OnDemandContainerDataScanner(conf, controller, checksumManager); } private static boolean shouldScan(Container container) { @@ -133,7 +136,7 @@ private static void performOnDemandScan(Container container) { ContainerData containerData = container.getContainerData(); logScanStart(containerData); - ScanResult result = container.scanData(instance.throttler, instance.canceler); + DataScanResult result = container.scanData(instance.throttler, instance.canceler); // Metrics for skipped containers should not be updated. if (result.isDeleted()) { LOG.debug("Container [{}] has been deleted during the data scan.", containerId); @@ -146,17 +149,17 @@ private static void performOnDemandScan(Container container) { instance.metrics.incNumUnHealthyContainers(); } } - // TODO HDDS-10374 will need to update the merkle tree here as well. instance.metrics.incNumContainersScanned(); + instance.checksumManager.writeContainerDataTree(containerData, result.getDataTree()); } - // Even if the container was deleted, mark the scan as completed since we already logged it as starting. Instant now = Instant.now(); - logScanCompleted(containerData, now); - if (!result.isDeleted()) { instance.containerController.updateDataScanTimestamp(containerId, now); } + + // Even if the container was deleted, mark the scan as completed since we already logged it as starting. 
+ logScanCompleted(containerData, now); } catch (IOException e) { LOG.warn("Unexpected exception while scanning container " + containerId, e); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java index 8ae838a7e53..66f3c5f39ca 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java @@ -407,7 +407,7 @@ private void initOnDemandContainerScanner(ContainerScannerConfiguration c) { "so the on-demand container data scanner will not start."); return; } - OnDemandContainerDataScanner.init(c, controller); + OnDemandContainerDataScanner.init(c, controller, checksumTreeManager); } /** diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeTestUtils.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeTestUtils.java index db2a8c319b6..95aff8ab1e0 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeTestUtils.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeTestUtils.java @@ -149,11 +149,10 @@ public static ContainerMerkleTree buildTestTree(ConfigurationSource conf, int nu ContainerMerkleTree tree = new ContainerMerkleTree(); byte byteValue = 1; for (int blockIndex = 1; blockIndex <= numBlocks; blockIndex++) { - List chunks = new ArrayList<>(); for (int chunkIndex = 0; chunkIndex < 4; chunkIndex++) { - chunks.add(buildChunk(conf, chunkIndex, ByteBuffer.wrap(new byte[]{byteValue++, byteValue++, byteValue++}))); + tree.addChunks(blockIndex, true, + buildChunk(conf, chunkIndex, 
ByteBuffer.wrap(new byte[]{byteValue++, byteValue++, byteValue++}))); } - tree.addChunks(blockIndex, chunks); } return tree; } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerMerkleTree.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerMerkleTree.java index 2f370c15b9f..b449a5003f9 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerMerkleTree.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerMerkleTree.java @@ -71,7 +71,7 @@ public void testBuildOneChunkTree() { // Use the ContainerMerkleTree to build the same tree. ContainerMerkleTree actualTree = new ContainerMerkleTree(); - actualTree.addChunks(blockID, Collections.singletonList(chunk)); + actualTree.addChunks(blockID, true, chunk); // Ensure the trees match. ContainerProtos.ContainerMerkleTree actualTreeProto = actualTree.toProto(); @@ -107,7 +107,7 @@ public void testBuildTreeWithMissingChunks() { // Use the ContainerMerkleTree to build the same tree. ContainerMerkleTree actualTree = new ContainerMerkleTree(); - actualTree.addChunks(blockID, Arrays.asList(chunk1, chunk3)); + actualTree.addChunks(blockID, true, chunk1, chunk3); // Ensure the trees match. ContainerProtos.ContainerMerkleTree actualTreeProto = actualTree.toProto(); @@ -138,8 +138,8 @@ public void testBuildTreeWithNonContiguousBlockIDs() { // Use the ContainerMerkleTree to build the same tree. // Add blocks and chunks out of order to test sorting. ContainerMerkleTree actualTree = new ContainerMerkleTree(); - actualTree.addChunks(blockID3, Arrays.asList(b3c2, b3c1)); - actualTree.addChunks(blockID1, Arrays.asList(b1c1, b1c2)); + actualTree.addChunks(blockID3, true, b3c2, b3c1); + actualTree.addChunks(blockID1, true, b1c1, b1c2); // Ensure the trees match. 
ContainerProtos.ContainerMerkleTree actualTreeProto = actualTree.toProto(); @@ -174,13 +174,13 @@ public void testAppendToBlocksWhileBuilding() throws Exception { // Test building by adding chunks to the blocks individually and out of order. ContainerMerkleTree actualTree = new ContainerMerkleTree(); // Add all of block 2 first. - actualTree.addChunks(blockID2, Arrays.asList(b2c1, b2c2)); + actualTree.addChunks(blockID2, true, b2c1, b2c2); // Then add block 1 in multiple steps wth chunks out of order. - actualTree.addChunks(blockID1, Collections.singletonList(b1c2)); - actualTree.addChunks(blockID1, Arrays.asList(b1c3, b1c1)); + actualTree.addChunks(blockID1, true, b1c2); + actualTree.addChunks(blockID1, true, b1c3, b1c1); // Add a duplicate chunk to block 3. It should overwrite the existing one. - actualTree.addChunks(blockID3, Arrays.asList(b3c1, b3c2)); - actualTree.addChunks(blockID3, Collections.singletonList(b3c2)); + actualTree.addChunks(blockID3, true, b3c1, b3c2); + actualTree.addChunks(blockID3, true, b3c2); // Ensure the trees match. ContainerProtos.ContainerMerkleTree actualTreeProto = actualTree.toProto(); From b0d1ba9b111d35c3612f2882d8c04d01bc630efa Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 25 Nov 2024 15:33:12 -0500 Subject: [PATCH 02/16] Update todo in acceptance test --- .../dist/src/main/smoketest/admincli/container.robot | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot index fae08991781..ec77b1a8566 100644 --- a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot +++ b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot @@ -121,8 +121,7 @@ Cannot reconcile open container # At this point we should have an open Ratis Three container. 
${container} = Execute ozone admin container list --state OPEN | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -n1 Execute and check rc ozone admin container reconcile "${container}" 255 - # The container should not yet have any replica checksums. - # TODO When the scanner is computing checksums automatically, this test may need to be updated. + # The container should not yet have any replica checksums since it is still open. ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 # 0 is the hex value of an empty checksum. Should Be Equal As Strings 0 ${data_checksum} @@ -137,9 +136,8 @@ Close container Wait until keyword succeeds 1min 10sec Container is closed ${container} Reconcile closed container - # Check that info does not show replica checksums, since manual reconciliation has not yet been triggered. - # TODO When the scanner is computing checksums automatically, this test may need to be updated. ${container} = Execute ozone admin container list --state CLOSED | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -1 + # TODO wait for container close to populate the checksum. ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 # 0 is the hex value of an empty checksum. 
Should Be Equal As Strings 0 ${data_checksum} From 382bce29afb32ddcdf8b938e8d2ae5bd49bca22e Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 25 Nov 2024 15:34:10 -0500 Subject: [PATCH 03/16] Add unit tests for tree generation by scanners based on container state --- .../TestBackgroundContainerDataScanner.java | 26 +++++++++-- .../TestOnDemandContainerDataScanner.java | 43 +++++++++++++++---- 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java index 681c5efc1af..576a0477be4 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java @@ -24,6 +24,7 @@ import org.apache.hadoop.hdfs.util.DataTransferThrottler; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.ozone.container.checksum.ContainerChecksumTreeManager; +import org.apache.hadoop.ozone.container.common.impl.ContainerData; import org.apache.hadoop.ozone.container.common.interfaces.Container; import org.apache.ozone.test.GenericTestUtils; import org.junit.jupiter.api.Test; @@ -32,6 +33,7 @@ import org.mockito.quality.Strictness; import java.time.Duration; +import java.util.Arrays; import java.util.Optional; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; @@ -48,7 +50,9 @@ import static org.mockito.Mockito.eq; import static org.mockito.Mockito.atLeastOnce; import static org.mockito.Mockito.atMostOnce; +import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; 
import static org.mockito.Mockito.when; @@ -60,12 +64,13 @@ public class TestBackgroundContainerDataScanner extends TestContainerScannersAbstract { private BackgroundContainerDataScanner scanner; + private ContainerChecksumTreeManager mockChecksumManager; @BeforeEach public void setup() { super.setup(); - scanner = new BackgroundContainerDataScanner(conf, controller, vol, - new ContainerChecksumTreeManager(new OzoneConfiguration())); + mockChecksumManager = mock(ContainerChecksumTreeManager.class); + scanner = new BackgroundContainerDataScanner(conf, controller, vol, mockChecksumManager); } @Test @@ -224,7 +229,6 @@ public void testWithVolumeFailure() throws Exception { verify(openCorruptMetadata, never()).scanData(any(), any()); } - @Test @Override public void testShutdownDuringScan() throws Exception { @@ -244,6 +248,22 @@ public void testShutdownDuringScan() throws Exception { scanner.shutdown(); // The container should remain healthy. verifyContainerMarkedUnhealthy(healthy, never()); + } + + @Test + public void testMerkleTreeWritten() throws Exception { + scanner.runIteration(); + + // Merkle trees should not be written for open or deleted containers + for (Container container : Arrays.asList(openContainer, openCorruptMetadata, deletedContainer)) { + verify(mockChecksumManager, times(0)) + .writeContainerDataTree(eq(container.getContainerData()), any()); + } + // Merkle trees should be written for all other containers. 
+ for (Container container : Arrays.asList(healthy, corruptData)) { + verify(mockChecksumManager, times(1)) + .writeContainerDataTree(eq(container.getContainerData()), any()); + } } } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java index 6f4b2efc137..502465c372d 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java @@ -23,6 +23,8 @@ import org.apache.hadoop.hdfs.util.Canceler; import org.apache.hadoop.hdfs.util.DataTransferThrottler; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.ozone.container.checksum.ContainerChecksumTreeManager; +import org.apache.hadoop.ozone.container.common.impl.ContainerData; import org.apache.hadoop.ozone.container.common.interfaces.Container; import org.apache.hadoop.ozone.container.common.interfaces.ScanResult; import org.junit.jupiter.api.AfterEach; @@ -34,6 +36,7 @@ import java.time.Duration; import java.util.ArrayList; +import java.util.Arrays; import java.util.Optional; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; @@ -52,7 +55,9 @@ import static org.mockito.Mockito.eq; import static org.mockito.Mockito.atLeastOnce; import static org.mockito.Mockito.atMostOnce; +import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.verifyNoInteractions; import static org.mockito.Mockito.when; @@ -64,9 +69,12 @@ public class TestOnDemandContainerDataScanner extends TestContainerScannersAbstract { 
+ private ContainerChecksumTreeManager mockChecksumManager; + @BeforeEach public void setup() { super.setup(); + mockChecksumManager = mock(ContainerChecksumTreeManager .class); } @Test @@ -104,7 +112,7 @@ public void tearDown() { @Test public void testScanTimestampUpdated() throws Exception { - OnDemandContainerDataScanner.init(conf, controller); + OnDemandContainerDataScanner.init(conf, controller, mockChecksumManager); Optional> scanFuture = OnDemandContainerDataScanner.scanContainer(healthy); assertTrue(scanFuture.isPresent()); @@ -125,8 +133,8 @@ public void testScanTimestampUpdated() throws Exception { @Test public void testContainerScannerMultipleInitsAndShutdowns() throws Exception { - OnDemandContainerDataScanner.init(conf, controller); - OnDemandContainerDataScanner.init(conf, controller); + OnDemandContainerDataScanner.init(conf, controller, mockChecksumManager); + OnDemandContainerDataScanner.init(conf, controller, mockChecksumManager); OnDemandContainerDataScanner.shutdown(); OnDemandContainerDataScanner.shutdown(); //There shouldn't be an interaction after shutdown: @@ -136,7 +144,7 @@ public void testContainerScannerMultipleInitsAndShutdowns() throws Exception { @Test public void testSameContainerQueuedMultipleTimes() throws Exception { - OnDemandContainerDataScanner.init(conf, controller); + OnDemandContainerDataScanner.init(conf, controller, mockChecksumManager); //Given a container that has not finished scanning CountDownLatch latch = new CountDownLatch(1); when(corruptData.scanData( @@ -164,7 +172,7 @@ public void testSameContainerQueuedMultipleTimes() throws Exception { @Test @Override public void testScannerMetrics() throws Exception { - OnDemandContainerDataScanner.init(conf, controller); + OnDemandContainerDataScanner.init(conf, controller, mockChecksumManager); ArrayList>> resultFutureList = Lists.newArrayList(); resultFutureList.add(OnDemandContainerDataScanner.scanContainer( corruptData)); @@ -186,7 +194,7 @@ public void 
testScannerMetrics() throws Exception { @Test @Override public void testScannerMetricsUnregisters() { - OnDemandContainerDataScanner.init(conf, controller); + OnDemandContainerDataScanner.init(conf, controller, mockChecksumManager); String metricsName = OnDemandContainerDataScanner.getMetrics().getName(); assertNotNull(DefaultMetricsSystem.instance().getSource(metricsName)); OnDemandContainerDataScanner.shutdown(); @@ -226,7 +234,7 @@ public void testUnhealthyContainersDetected() throws Exception { public void testWithVolumeFailure() throws Exception { when(vol.isFailed()).thenReturn(true); - OnDemandContainerDataScanner.init(conf, controller); + OnDemandContainerDataScanner.init(conf, controller, mockChecksumManager); OnDemandScannerMetrics metrics = OnDemandContainerDataScanner.getMetrics(); scanContainer(healthy); @@ -251,7 +259,7 @@ public void testShutdownDuringScan() throws Exception { }); // Start the blocking scan. - OnDemandContainerDataScanner.init(conf, controller); + OnDemandContainerDataScanner.init(conf, controller, mockChecksumManager); OnDemandContainerDataScanner.scanContainer(healthy); // Shut down the on demand scanner. This will interrupt the blocked scan // on the healthy container. @@ -298,8 +306,25 @@ public void testUnhealthyContainerRescanned() throws Exception { assertEquals(1, metrics.getNumUnHealthyContainers()); } + @Test + public void testMerkleTreeWritten() throws Exception { + // Merkle trees should not be written for open or deleted containers + for (Container container : Arrays.asList(openContainer, openCorruptMetadata, deletedContainer)) { + scanContainer(container); + verify(mockChecksumManager, times(0)) + .writeContainerDataTree(eq(container.getContainerData()), any()); + } + + // Merkle trees should be written for all other containers. 
+    for (Container container : Arrays.asList(healthy, corruptData)) { +      scanContainer(container); +      verify(mockChecksumManager, times(1)) +          .writeContainerDataTree(eq(container.getContainerData()), any()); +    } +  } + private void scanContainer(Container container) throws Exception { -    OnDemandContainerDataScanner.init(conf, controller); +    OnDemandContainerDataScanner.init(conf, controller, mockChecksumManager); Optional> scanFuture = OnDemandContainerDataScanner.scanContainer(container); if (scanFuture.isPresent()) { From 28b18896665a3193f8d32cac9c7774d562fd5481 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 26 Nov 2024 11:46:20 -0500 Subject: [PATCH 04/16] Add initial (failing) unit test for KeyValueContainerCheck Missing chunks are incorrectly reported as corrupted --- .../keyvalue/KeyValueContainerCheck.java | 10 +++-- .../keyvalue/TestKeyValueContainerCheck.java | 45 ++++++++++++++++--- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java index 5a9f1123ada..6e8225eecf9 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java @@ -421,9 +421,11 @@ private static List verifyChecksum(BlockData block, List scanErrors = new ArrayList<>(); -    // Information used to populate the merkle tree. +    // Information used to populate the merkle tree. Chunk metadata will be the same, but we must fill in the +    // checksums with what we actually observe.
ContainerProtos.ChunkInfo.Builder observedChunkBuilder = chunk.toBuilder(); - ContainerProtos.ChecksumData.Builder observedChecksums = chunk.getChecksumData().toBuilder(); + ContainerProtos.ChecksumData.Builder observedChecksumData = chunk.getChecksumData().toBuilder(); + observedChecksumData.clearChecksums(); boolean chunkHealthy = true; ChecksumData checksumData = @@ -458,7 +460,7 @@ private static List verifyChecksum(BlockData block, ByteString expected = checksumData.getChecksums().get(i); ByteString actual = cal.computeChecksum(buffer) .getChecksums().get(0); - observedChecksums.addChecksums(actual); + observedChecksumData.addChecksums(actual); // Only report one error per chunk. Reporting corruption at every "bytes per checksum" interval will lead to a // large amount of errors when a full chunk is corrupted. // Continue scanning the chunk even after the first error so the full merkle tree can be built. @@ -497,7 +499,7 @@ private static List verifyChecksum(BlockData block, scanErrors.add(new ContainerScanError(FailureType.MISSING_CHUNK_FILE, chunkFile, ex)); } - observedChunkBuilder.setChecksumData(observedChecksums); + observedChunkBuilder.setChecksumData(observedChecksumData); currentTree.addChunks(block.getBlockID().getLocalID(), chunkHealthy, observedChunkBuilder.build()); return scanErrors; } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainerCheck.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainerCheck.java index b6da73b7ea6..c51994fe30c 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainerCheck.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainerCheck.java @@ -24,6 +24,9 @@ import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; import 
org.apache.hadoop.hdfs.util.Canceler; import org.apache.hadoop.hdfs.util.DataTransferThrottler; +import org.apache.hadoop.ozone.container.checksum.ContainerChecksumTreeManager; +import org.apache.hadoop.ozone.container.checksum.ContainerDiffReport; +import org.apache.hadoop.ozone.container.checksum.ContainerMerkleTree; import org.apache.hadoop.ozone.container.common.helpers.BlockData; import org.apache.hadoop.ozone.container.common.impl.ContainerLayoutVersion; import org.apache.hadoop.ozone.container.common.interfaces.BlockIterator; @@ -46,6 +49,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -156,13 +160,20 @@ public void testAllDataErrorsCollected(ContainerTestVersionInfo versionInfo) thr DataScanResult result = kvCheck.fullCheck(throttler, null); assertTrue(result.isHealthy()); + ContainerProtos.ContainerChecksumInfo healthyChecksumInfo = ContainerProtos.ContainerChecksumInfo.newBuilder() + .setContainerID(containerID) + .setContainerMerkleTree(result.getDataTree().toProto()) + .build(); // Put different types of block failures in the middle of the container. - CORRUPT_BLOCK.applyTo(container, 1); - MISSING_BLOCK.applyTo(container, 2); - TRUNCATED_BLOCK.applyTo(container, 4); + long corruptBlockID = 1; + long missingBlockID = 2; + long truncatedBlockID = 4; + CORRUPT_BLOCK.applyTo(container, corruptBlockID); + MISSING_BLOCK.applyTo(container, missingBlockID); + TRUNCATED_BLOCK.applyTo(container, truncatedBlockID); List expectedErrors = new ArrayList<>(); - // Corruption is applied to two different chunks within the block. + // Corruption is applied to two different chunks within the block, so the error will be raised twice. 
expectedErrors.add(CORRUPT_BLOCK.getExpectedResult()); expectedErrors.add(CORRUPT_BLOCK.getExpectedResult()); expectedErrors.add(MISSING_BLOCK.getExpectedResult()); @@ -177,12 +188,36 @@ public void testAllDataErrorsCollected(ContainerTestVersionInfo versionInfo) thr assertFalse(result.isHealthy()); // Check that all data errors were detected in order. - // TODO HDDS-10374 Use merkle tree to check the actual content affected by the errors. assertEquals(expectedErrors.size(), result.getErrors().size()); List actualErrors = result.getErrors().stream() .map(ContainerScanError::getFailureType) .collect(Collectors.toList()); assertEquals(expectedErrors, actualErrors); + + // Write the new tree into the container, as the scanner would do. + ContainerChecksumTreeManager checksumManager = new ContainerChecksumTreeManager(conf); + checksumManager.writeContainerDataTree(container.getContainerData(), result.getDataTree()); + // This will read the corrupted tree from the disk, which represents the current state of the container, and + // compare it against the original healthy tree. The diff we get back should match the failures we injected. + ContainerDiffReport diffReport = checksumManager.diff(container.getContainerData(), healthyChecksumInfo); + + // Check that the new tree identified all the expected errors by checking the diff. + Map> corruptChunks = diffReport.getCorruptChunks(); + // One block had corrupted chunks. + assertEquals(1, corruptChunks.size()); + List corruptChunksInBlock = corruptChunks.get(corruptBlockID); + assertEquals(2, corruptChunksInBlock.size()); + + // Check missing block was correctly identified in the tree diff. + List missingBlocks = diffReport.getMissingBlocks(); + assertEquals(1, missingBlocks.size()); + assertEquals(missingBlockID, missingBlocks.get(0).getBlockID()); + + // One block was truncated which resulted in all of its chunks being reported as missing. 
+ Map> missingChunks = diffReport.getMissingChunks(); + assertEquals(1, missingChunks.size()); + List missingChunksInBlock = missingChunks.get(truncatedBlockID); + assertEquals(CHUNKS_PER_BLOCK, missingChunksInBlock.size()); } /** From dc182e85ce7bdf90186ca635ecd1f20ce872cd60 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Tue, 26 Nov 2024 12:45:20 -0500 Subject: [PATCH 05/16] Update container data checksum when building the tree --- .../checksum/ContainerChecksumTreeManager.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java index 2c10313c2fc..9a739668eaa 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java @@ -82,6 +82,8 @@ public void stop() { * The data merkle tree within the file is replaced with the {@code tree} parameter, but all other content of the * file remains unchanged. * Concurrent writes to the same file are coordinated internally. + * This method also updates the container's data checksum in the {@code data} parameter, which will be seen by SCM + * on container reports. 
*/ public void writeContainerDataTree(ContainerData data, ContainerMerkleTree tree) throws IOException { long containerID = data.getContainerID(); @@ -99,11 +101,15 @@ public void writeContainerDataTree(ContainerData data, ContainerMerkleTree tree) checksumInfoBuilder = ContainerProtos.ContainerChecksumInfo.newBuilder(); } + ContainerProtos.ContainerMerkleTree treeProto = captureLatencyNs(metrics.getCreateMerkleTreeLatencyNS(), + tree::toProto); + long dataChecksum = treeProto.getDataChecksum(); + data.setDataChecksum(dataChecksum); checksumInfoBuilder .setContainerID(containerID) - .setContainerMerkleTree(captureLatencyNs(metrics.getCreateMerkleTreeLatencyNS(), tree::toProto)); + .setContainerMerkleTree(treeProto); write(data, checksumInfoBuilder.build()); - LOG.debug("Data merkle tree for container {} updated", containerID); + LOG.debug("Data merkle tree for container {} updated with container checksum {}", containerID, dataChecksum); } finally { writeLock.unlock(); } From a3401a93835e59193feedf0f3528dd9cd398c1a5 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 6 Jan 2025 19:41:29 -0500 Subject: [PATCH 06/16] Fix handling of fully truncated block of 0 size --- .../checksum/ContainerMerkleTree.java | 10 +++++ .../keyvalue/KeyValueContainerCheck.java | 45 +++++++++++++------ .../ozoneimpl/ContainerScanError.java | 3 +- .../keyvalue/TestContainerCorruptions.java | 4 +- .../keyvalue/TestKeyValueContainerCheck.java | 15 ++++--- 5 files changed, 55 insertions(+), 22 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTree.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTree.java index 56578dc9f1e..651f6fbb0a7 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTree.java +++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTree.java @@ -64,6 +64,16 @@ public void addChunks(long blockID, boolean healthy, ContainerProtos.ChunkInfo.. id2Block.computeIfAbsent(blockID, BlockMerkleTree::new).addChunks(healthy, chunks); } + /** + * Adds an empty block to the tree. This method is not a pre-requisite to {@code addChunks}. + * If the block entry already exists, it will not be modified. + * + * @param blockID The ID of the empty block to add to the tree + */ + public void addBlock(long blockID) { + addChunks(blockID, true); + } + /** * Uses chunk hashes to compute all remaining hashes in the tree, and returns it as a protobuf object. No checksum * computation for the tree happens outside of this method. diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java index 6e8225eecf9..3b4e507e198 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainerCheck.java @@ -378,12 +378,13 @@ private List scanBlock(DBHandle db, File dbFile, BlockData b // So, we need to make sure, chunk length > 0, before declaring // the missing chunk file. 
if (!block.getChunks().isEmpty() && block.getChunks().get(0).getLen() > 0) { - ContainerScanError error = new ContainerScanError(FailureType.MISSING_CHUNK_FILE, + ContainerScanError error = new ContainerScanError(FailureType.MISSING_DATA_FILE, new File(containerDataFromDisk.getChunksPath()), new IOException("Missing chunk file " + chunkFile.getAbsolutePath())); blockErrors.add(error); } } else if (chunk.getChecksumData().getType() != ContainerProtos.ChecksumType.NONE) { + currentTree.addBlock(block.getBlockID().getLocalID()); int bytesPerChecksum = chunk.getChecksumData().getBytesPerChecksum(); ByteBuffer buffer = BUFFER_POOL.getBuffer(bytesPerChecksum); // Keep scanning the block even if there are errors with individual chunks. @@ -427,6 +428,7 @@ private static List verifyChecksum(BlockData block, ContainerProtos.ChecksumData.Builder observedChecksumData = chunk.getChecksumData().toBuilder(); observedChecksumData.clearChecksums(); boolean chunkHealthy = true; + boolean chunkMissing = false; ChecksumData checksumData = ChecksumData.getFromProtoBuf(chunk.getChecksumData()); @@ -481,26 +483,41 @@ private static List verifyChecksum(BlockData block, new OzoneChecksumException(message))); } } - // If all the checksums match, also check that the length stored in the metadata matches the number of bytes - // seen on the disk. + observedChunkBuilder.setLen(bytesRead); + // If we haven't seen any errors after scanning the whole chunk, verify that the length stored in the metadata + // matches the number of bytes seen on the disk. 
if (chunkHealthy && bytesRead != chunk.getLen()) { - String message = String - .format("Inconsistent read for chunk=%s expected length=%d" - + " actual length=%d for block %s", - chunk.getChunkName(), - chunk.getLen(), bytesRead, block.getBlockID()); - chunkHealthy = false; - scanErrors.add(new ContainerScanError(FailureType.INCONSISTENT_CHUNK_LENGTH, chunkFile, - new IOException(message))); + if (bytesRead == 0) { + // If we could not find any data for the chunk, report it as missing. + chunkMissing = true; + chunkHealthy = false; + String message = String.format("Missing chunk=%s with expected length=%d for block %s", + chunk.getChunkName(), chunk.getLen(), block.getBlockID()); + scanErrors.add(new ContainerScanError(FailureType.MISSING_CHUNK, chunkFile, new IOException(message))); + } else { + // We found data for the chunk, but it was shorter than expected. + String message = String + .format("Inconsistent read for chunk=%s expected length=%d" + + " actual length=%d for block %s", + chunk.getChunkName(), + chunk.getLen(), bytesRead, block.getBlockID()); + chunkHealthy = false; + scanErrors.add(new ContainerScanError(FailureType.INCONSISTENT_CHUNK_LENGTH, chunkFile, + new IOException(message))); + } } } catch (IOException ex) { + // An unknown error occurred trying to access the chunk. Report it as corrupted. chunkHealthy = false; - scanErrors.add(new ContainerScanError(FailureType.MISSING_CHUNK_FILE, chunkFile, ex)); + scanErrors.add(new ContainerScanError(FailureType.CORRUPT_CHUNK, chunkFile, ex)); } - observedChunkBuilder.setChecksumData(observedChecksumData); - currentTree.addChunks(block.getBlockID().getLocalID(), chunkHealthy, observedChunkBuilder.build()); + // Missing chunks should not be added to the merkle tree. 
+ if (!chunkMissing) { + observedChunkBuilder.setChecksumData(observedChecksumData); + currentTree.addChunks(block.getBlockID().getLocalID(), chunkHealthy, observedChunkBuilder.build()); + } return scanErrors; } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerScanError.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerScanError.java index 8fbd8f6887e..2c4648e22fd 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerScanError.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerScanError.java @@ -32,9 +32,10 @@ public enum FailureType { MISSING_METADATA_DIR, MISSING_CONTAINER_FILE, MISSING_CHUNKS_DIR, - MISSING_CHUNK_FILE, + MISSING_DATA_FILE, CORRUPT_CONTAINER_FILE, CORRUPT_CHUNK, + MISSING_CHUNK, INCONSISTENT_CHUNK_LENGTH, INACCESSIBLE_DB, WRITE_FAILURE, diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestContainerCorruptions.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestContainerCorruptions.java index 470197e1f82..c61f8906b92 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestContainerCorruptions.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestContainerCorruptions.java @@ -93,7 +93,7 @@ public enum TestContainerCorruptions { MISSING_BLOCK((container, blockID) -> { File blockFile = getBlock(container, blockID); assertTrue(blockFile.delete()); - }, ContainerScanError.FailureType.MISSING_CHUNK_FILE), + }, ContainerScanError.FailureType.MISSING_DATA_FILE), CORRUPT_CONTAINER_FILE((container, blockID) -> { File containerFile = container.getContainerFile(); @@ -113,7 +113,7 @@ public enum TestContainerCorruptions { 
TRUNCATED_BLOCK((container, blockID) -> { File blockFile = getBlock(container, blockID); truncateFile(blockFile); - }, ContainerScanError.FailureType.INCONSISTENT_CHUNK_LENGTH); + }, ContainerScanError.FailureType.MISSING_CHUNK); private final BiConsumer, Long> corruption; private final ContainerScanError.FailureType expectedResult; diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainerCheck.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainerCheck.java index c51994fe30c..e4d0206bd5d 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainerCheck.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainerCheck.java @@ -160,6 +160,8 @@ public void testAllDataErrorsCollected(ContainerTestVersionInfo versionInfo) thr DataScanResult result = kvCheck.fullCheck(throttler, null); assertTrue(result.isHealthy()); + // The scanner would write the checksum file to disk. `KeyValueContainerCheck` does not, so we will create the + // result here. ContainerProtos.ContainerChecksumInfo healthyChecksumInfo = ContainerProtos.ContainerChecksumInfo.newBuilder() .setContainerID(containerID) .setContainerMerkleTree(result.getDataTree().toProto()) @@ -201,6 +203,8 @@ public void testAllDataErrorsCollected(ContainerTestVersionInfo versionInfo) thr // compare it against the original healthy tree. The diff we get back should match the failures we injected. ContainerDiffReport diffReport = checksumManager.diff(container.getContainerData(), healthyChecksumInfo); + LOG.info("Diff of healthy container with actual container {}", diffReport); + // Check that the new tree identified all the expected errors by checking the diff. Map> corruptChunks = diffReport.getCorruptChunks(); // One block had corrupted chunks. 
@@ -208,16 +212,17 @@ public void testAllDataErrorsCollected(ContainerTestVersionInfo versionInfo) thr List corruptChunksInBlock = corruptChunks.get(corruptBlockID); assertEquals(2, corruptChunksInBlock.size()); - // Check missing block was correctly identified in the tree diff. - List missingBlocks = diffReport.getMissingBlocks(); - assertEquals(1, missingBlocks.size()); - assertEquals(missingBlockID, missingBlocks.get(0).getBlockID()); - // One block was truncated which resulted in all of its chunks being reported as missing. Map> missingChunks = diffReport.getMissingChunks(); assertEquals(1, missingChunks.size()); List missingChunksInBlock = missingChunks.get(truncatedBlockID); assertEquals(CHUNKS_PER_BLOCK, missingChunksInBlock.size()); + + // Check missing block was correctly identified in the tree diff. + List missingBlocks = diffReport.getMissingBlocks(); + assertEquals(1, missingBlocks.size()); + assertEquals(missingBlockID, missingBlocks.get(0).getBlockID()); + } /** From a25d44d221781b7e3cee30fa31f464a1e9c4d793 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 6 Jan 2025 20:03:25 -0500 Subject: [PATCH 07/16] Add unit tests for new addBlock method in tree --- .../checksum/TestContainerMerkleTree.java | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerMerkleTree.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerMerkleTree.java index b449a5003f9..2905ea7d648 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerMerkleTree.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/TestContainerMerkleTree.java @@ -114,6 +114,42 @@ public void testBuildTreeWithMissingChunks() { assertTreesSortedAndMatch(expectedTree, actualTreeProto); } + @Test + public void testBuildTreeWithEmptyBlock() 
{ + final long blockID = 1; + ContainerProtos.BlockMerkleTree blockTree = buildExpectedBlockTree(blockID, Collections.emptyList()); + ContainerProtos.ContainerMerkleTree expectedTree = buildExpectedContainerTree(Collections.singletonList(blockTree)); + + // Use the ContainerMerkleTree to build the same tree. + ContainerMerkleTree actualTree = new ContainerMerkleTree(); + actualTree.addBlock(blockID); + + // Ensure the trees match. + ContainerProtos.ContainerMerkleTree actualTreeProto = actualTree.toProto(); + assertTreesSortedAndMatch(expectedTree, actualTreeProto); + } + + @Test + public void testAddBlockIdempotent() { + final long blockID = 1; + // Build the expected proto. + ContainerProtos.ChunkInfo chunk1 = buildChunk(config, 0, ByteBuffer.wrap(new byte[]{1, 2, 3})); + ContainerProtos.BlockMerkleTree blockTree = buildExpectedBlockTree(blockID, + Collections.singletonList(buildExpectedChunkTree(chunk1))); + ContainerProtos.ContainerMerkleTree expectedTree = buildExpectedContainerTree(Collections.singletonList(blockTree)); + + // Use the ContainerMerkleTree to build the same tree, calling addBlock in between adding chunks. + ContainerMerkleTree actualTree = new ContainerMerkleTree(); + actualTree.addBlock(blockID); + actualTree.addChunks(blockID, true, chunk1); + // This should not overwrite the chunk already added to the block. + actualTree.addBlock(blockID); + + // Ensure the trees match. + ContainerProtos.ContainerMerkleTree actualTreeProto = actualTree.toProto(); + assertTreesSortedAndMatch(expectedTree, actualTreeProto); + } + /** * A container is a set of blocks. Make sure the tree implementation is not dependent on continuity of block IDs. 
*/ From 7550a3c85d28fd49a3fb746e0962fe67fec9f6be Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 6 Jan 2025 20:22:51 -0500 Subject: [PATCH 08/16] Test that SCM gets a checksum with the container report --- ...stBackgroundContainerDataScannerIntegration.java | 5 +++++ .../TestContainerScannerIntegrationAbstract.java | 13 ++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java index 80dbda64bfc..de6818cfda3 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java @@ -35,6 +35,7 @@ import java.util.concurrent.TimeUnit; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; /** * Integration tests for the background container data scanner. This scanner @@ -95,6 +96,10 @@ void testCorruptionDetected(TestContainerCorruptions corruption) corruption == TestContainerCorruptions.CORRUPT_BLOCK) { // These errors will affect multiple chunks and result in multiple log messages. corruption.assertLogged(containerID, logCapturer); + // Check a corresponding checksum reported to SCM. This would be zero for metadata errors. + // TODO HDDS-11942 This check can be made generic for all faults because every fault provided to the test will + // declare its expected checksum. + assertNotEquals(0, getContainerReplica(containerID).getDataChecksum()); } else { // Other corruption types will only lead to a single error. 
corruption.assertLogged(containerID, 1, logCapturer); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java index 5f6c14bcde7..cfaaa32fbca 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java @@ -121,11 +121,8 @@ static void shutdown() throws IOException { protected void waitForScmToSeeUnhealthyReplica(long containerID) throws Exception { - ContainerManager scmContainerManager = cluster.getStorageContainerManager() - .getContainerManager(); LambdaTestUtils.await(5000, 500, - () -> getContainerReplica(scmContainerManager, containerID) - .getState() == State.UNHEALTHY); + () -> getContainerReplica(containerID).getState() == State.UNHEALTHY); } protected void waitForScmToCloseContainer(long containerID) throws Exception { @@ -188,11 +185,9 @@ protected byte[] getTestData() { .getBytes(UTF_8); } - protected ContainerReplica getContainerReplica( - ContainerManager cm, long containerId) throws ContainerNotFoundException { - Set containerReplicas = cm.getContainerReplicas( - ContainerID.valueOf( - containerId)); + protected ContainerReplica getContainerReplica(long containerId) throws ContainerNotFoundException { + ContainerManager cm = cluster.getStorageContainerManager().getContainerManager(); + Set containerReplicas = cm.getContainerReplicas(ContainerID.valueOf(containerId)); // Only using a single datanode cluster. 
assertEquals(1, containerReplicas.size()); return containerReplicas.iterator().next(); From 847f8d86abd318ea3354bc04d7b6c32b258eb380 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Mon, 6 Jan 2025 21:08:24 -0500 Subject: [PATCH 09/16] Add (failing) tests that SCM sees updated checksums --- .../ContainerChecksumTreeManager.java | 2 +- .../ContainerMerkleTreeTestUtils.java | 5 ++--- .../hadoop/hdds/scm/TestCloseContainer.java | 16 +++++++-------- ...groundContainerDataScannerIntegration.java | 20 +++++++++++++------ ...ndContainerMetadataScannerIntegration.java | 16 +++++++++++++-- ...stContainerScannerIntegrationAbstract.java | 12 +++++++++-- ...DemandContainerDataScannerIntegration.java | 11 +++++++++- 7 files changed, 59 insertions(+), 23 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java index 9a739668eaa..036208efeb2 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java @@ -390,7 +390,7 @@ public ContainerMerkleTreeMetrics getMetrics() { return this.metrics; } - public static boolean checksumFileExist(Container container) { + public static boolean checksumFileExist(Container container) { File checksumFile = getContainerChecksumFile(container.getContainerData()); return checksumFile.exists(); } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeTestUtils.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeTestUtils.java index 95aff8ab1e0..140fff1517b 100644 --- 
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeTestUtils.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeTestUtils.java @@ -333,10 +333,9 @@ private static void assertEqualsChunkMerkleTree(List container = ozoneContainer.getController().getContainer(containerID); return ContainerChecksumTreeManager.checksumFileExist(container); } diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestCloseContainer.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestCloseContainer.java index 35434500391..96a60429a23 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestCloseContainer.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestCloseContainer.java @@ -133,14 +133,14 @@ public void testReplicasAreReportedForClosedContainerAfterRestart() // Checksum file doesn't exist before container close List hddsDatanodes = cluster.getHddsDatanodes(); for (HddsDatanodeService hddsDatanode: hddsDatanodes) { - assertFalse(containerChecksumFileExists(hddsDatanode, container)); + assertFalse(containerChecksumFileExists(hddsDatanode, container.getContainerID())); } OzoneTestUtils.closeContainer(scm, container); // Checksum file exists after container close for (HddsDatanodeService hddsDatanode: hddsDatanodes) { GenericTestUtils.waitFor(() -> checkContainerCloseInDatanode(hddsDatanode, container), 100, 5000); - assertTrue(containerChecksumFileExists(hddsDatanode, container)); + assertTrue(containerChecksumFileExists(hddsDatanode, container.getContainerID())); } long originalSeq = container.getSequenceId(); @@ -187,7 +187,7 @@ public void testCloseClosedContainer() // Checksum file doesn't exist before container close List hddsDatanodes = cluster.getHddsDatanodes(); for (HddsDatanodeService hddsDatanode: hddsDatanodes) { - 
assertFalse(containerChecksumFileExists(hddsDatanode, container)); + assertFalse(containerChecksumFileExists(hddsDatanode, container.getContainerID())); } // Close container OzoneTestUtils.closeContainer(scm, container); @@ -195,7 +195,7 @@ public void testCloseClosedContainer() // Checksum file exists after container close for (HddsDatanodeService hddsDatanode: hddsDatanodes) { GenericTestUtils.waitFor(() -> checkContainerCloseInDatanode(hddsDatanode, container), 100, 5000); - assertTrue(containerChecksumFileExists(hddsDatanode, container)); + assertTrue(containerChecksumFileExists(hddsDatanode, container.getContainerID())); } assertThrows(IOException.class, @@ -215,7 +215,7 @@ public void testContainerChecksumForClosedContainer() throws Exception { // Checksum file doesn't exist before container close List hddsDatanodes = cluster.getHddsDatanodes(); for (HddsDatanodeService hddsDatanode : hddsDatanodes) { - assertFalse(containerChecksumFileExists(hddsDatanode, containerInfo1)); + assertFalse(containerChecksumFileExists(hddsDatanode, containerInfo1.getContainerID())); } // Close container. 
OzoneTestUtils.closeContainer(scm, containerInfo1); @@ -224,7 +224,7 @@ public void testContainerChecksumForClosedContainer() throws Exception { // merkle tree for all the datanodes for (HddsDatanodeService hddsDatanode : hddsDatanodes) { GenericTestUtils.waitFor(() -> checkContainerCloseInDatanode(hddsDatanode, containerInfo1), 100, 5000); - assertTrue(containerChecksumFileExists(hddsDatanode, containerInfo1)); + assertTrue(containerChecksumFileExists(hddsDatanode, containerInfo1.getContainerID())); OzoneContainer ozoneContainer = hddsDatanode.getDatanodeStateMachine().getContainer(); Container container1 = ozoneContainer.getController().getContainer(containerInfo1.getContainerID()); ContainerProtos.ContainerChecksumInfo containerChecksumInfo = ContainerMerkleTreeTestUtils.readChecksumFile( @@ -242,7 +242,7 @@ public void testContainerChecksumForClosedContainer() throws Exception { ReplicationType.RATIS, "this is the different content"); ContainerInfo containerInfo2 = scm.getContainerManager().getContainers().get(1); for (HddsDatanodeService hddsDatanode : hddsDatanodes) { - assertFalse(containerChecksumFileExists(hddsDatanode, containerInfo2)); + assertFalse(containerChecksumFileExists(hddsDatanode, containerInfo2.getContainerID())); } // Close container. 
@@ -252,7 +252,7 @@ public void testContainerChecksumForClosedContainer() throws Exception { // merkle tree for all the datanodes for (HddsDatanodeService hddsDatanode : hddsDatanodes) { GenericTestUtils.waitFor(() -> checkContainerCloseInDatanode(hddsDatanode, containerInfo2), 100, 5000); - assertTrue(containerChecksumFileExists(hddsDatanode, containerInfo2)); + assertTrue(containerChecksumFileExists(hddsDatanode, containerInfo2.getContainerID())); OzoneContainer ozoneContainer = hddsDatanode.getDatanodeStateMachine().getContainer(); Container container2 = ozoneContainer.getController().getContainer(containerInfo2.getContainerID()); ContainerProtos.ContainerChecksumInfo containerChecksumInfo = ContainerMerkleTreeTestUtils.readChecksumFile( diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java index de6818cfda3..b4910278257 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java @@ -20,7 +20,10 @@ package org.apache.hadoop.ozone.dn.scanner; import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; +import org.apache.hadoop.ozone.container.checksum.ContainerMerkleTreeTestUtils; import org.apache.hadoop.ozone.container.common.interfaces.Container; import org.apache.hadoop.ozone.container.common.utils.ContainerLogger; import 
org.apache.hadoop.ozone.container.keyvalue.TestContainerCorruptions; @@ -34,8 +37,11 @@ import java.util.concurrent.TimeUnit; +import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.CLOSED; +import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.UNHEALTHY; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Integration tests for the background container data scanner. This scanner @@ -79,6 +85,10 @@ void testCorruptionDetected(TestContainerCorruptions corruption) // Container corruption has not yet been introduced. Container container = getDnContainer(containerID); assertEquals(State.CLOSED, container.getContainerState()); + assertTrue(containerChecksumFileExists(containerID)); + + waitForScmToSeeReplicaState(containerID, CLOSED); + long initialReportedDataChecksum = getContainerReplica(containerID).getDataChecksum(); corruption.applyTo(container); @@ -89,17 +99,15 @@ void testCorruptionDetected(TestContainerCorruptions corruption) () -> container.getContainerState() == State.UNHEALTHY, 500, 15_000); - // Wait for SCM to get a report of the unhealthy replica. - waitForScmToSeeUnhealthyReplica(containerID); + // Wait for SCM to get a report of the unhealthy replica with a different checksum than before. + waitForScmToSeeReplicaState(containerID, UNHEALTHY); + long newReportedDataChecksum = getContainerReplica(containerID).getDataChecksum(); + assertNotEquals(initialReportedDataChecksum, newReportedDataChecksum); if (corruption == TestContainerCorruptions.TRUNCATED_BLOCK || corruption == TestContainerCorruptions.CORRUPT_BLOCK) { // These errors will affect multiple chunks and result in multiple log messages. 
corruption.assertLogged(containerID, logCapturer); - // Check a corresponding checksum reported to SCM. This would be zero for metadata errors. - // TODO HDDS-11942 This check can be made generic for all faults because every fault provided to the test will - // declare its expected checksum. - assertNotEquals(0, getContainerReplica(containerID).getDataChecksum()); } else { // Other corruption types will only lead to a single error. corruption.assertLogged(containerID, 1, logCapturer); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerMetadataScannerIntegration.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerMetadataScannerIntegration.java index 0678190d47c..9c21aa5836c 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerMetadataScannerIntegration.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerMetadataScannerIntegration.java @@ -36,7 +36,11 @@ import java.util.Collection; import java.util.concurrent.TimeUnit; +import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.CLOSED; +import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.UNHEALTHY; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Integration tests for the background container metadata scanner. 
This @@ -92,10 +96,16 @@ void testCorruptionDetected(TestContainerCorruptions corruption) long closedContainerID = writeDataThenCloseContainer(); Container closedContainer = getDnContainer(closedContainerID); assertEquals(State.CLOSED, closedContainer.getContainerState()); + assertTrue(containerChecksumFileExists(closedContainerID)); + waitForScmToSeeReplicaState(closedContainerID, CLOSED); + long initialReportedClosedChecksum = getContainerReplica(closedContainerID).getDataChecksum(); long openContainerID = writeDataToOpenContainer(); Container openContainer = getDnContainer(openContainerID); assertEquals(State.OPEN, openContainer.getContainerState()); + long initialReportedOpenChecksum = getContainerReplica(openContainerID).getDataChecksum(); + // Open containers should not yet have a checksum generated. + assertEquals(0, initialReportedOpenChecksum); // Corrupt both containers. corruption.applyTo(closedContainer); @@ -110,8 +120,10 @@ void testCorruptionDetected(TestContainerCorruptions corruption) 500, 5000); // Wait for SCM to get reports of the unhealthy replicas. - waitForScmToSeeUnhealthyReplica(closedContainerID); - waitForScmToSeeUnhealthyReplica(openContainerID); + waitForScmToSeeReplicaState(closedContainerID, UNHEALTHY); + assertNotEquals(initialReportedClosedChecksum, getContainerReplica(closedContainerID).getDataChecksum()); + waitForScmToSeeReplicaState(openContainerID, UNHEALTHY); + assertNotEquals(initialReportedOpenChecksum, getContainerReplica(openContainerID).getDataChecksum()); // Once the unhealthy replica is reported, the open container's lifecycle // state in SCM should move to closed. 
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java index cfaaa32fbca..c2befa60d74 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java @@ -38,6 +38,7 @@ import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.container.ContainerTestHelper; import org.apache.hadoop.ozone.container.TestHelper; +import org.apache.hadoop.ozone.container.checksum.ContainerMerkleTreeTestUtils; import org.apache.hadoop.ozone.container.common.interfaces.Container; import org.apache.hadoop.ozone.container.ozoneimpl.ContainerScannerConfiguration; import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer; @@ -59,6 +60,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * This class tests the data scanner functionality. 
@@ -119,10 +121,10 @@ static void shutdown() throws IOException { } } - protected void waitForScmToSeeUnhealthyReplica(long containerID) + protected void waitForScmToSeeReplicaState(long containerID, State state) throws Exception { LambdaTestUtils.await(5000, 500, - () -> getContainerReplica(containerID).getState() == State.UNHEALTHY); + () -> getContainerReplica(containerID).getState() == state); } protected void waitForScmToCloseContainer(long containerID) throws Exception { @@ -143,6 +145,12 @@ protected Container getDnContainer(long containerID) { return getOzoneContainer().getContainerSet().getContainer(containerID); } + protected boolean containerChecksumFileExists(long containerID) { + assertEquals(1, cluster.getHddsDatanodes().size()); + HddsDatanodeService dn = cluster.getHddsDatanodes().get(0); + return ContainerMerkleTreeTestUtils.containerChecksumFileExists(dn, containerID); + } + protected long writeDataThenCloseContainer() throws Exception { return writeDataThenCloseContainer("keyName"); } diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerDataScannerIntegration.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerDataScannerIntegration.java index af94506c827..36a92e87045 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerDataScannerIntegration.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerDataScannerIntegration.java @@ -21,6 +21,7 @@ import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; import org.apache.hadoop.ozone.container.common.interfaces.Container; import org.apache.hadoop.ozone.container.common.utils.ContainerLogger; import 
org.apache.hadoop.ozone.container.keyvalue.TestContainerCorruptions; @@ -33,7 +34,10 @@ import java.util.Collection; +import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.UNHEALTHY; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Integration tests for the on demand container data scanner. This scanner @@ -98,6 +102,9 @@ void testCorruptionDetected(TestContainerCorruptions corruption) // Container corruption has not yet been introduced. Container container = getDnContainer(containerID); assertEquals(State.CLOSED, container.getContainerState()); + long initialReportedDataChecksum = getContainerReplica(containerID).getDataChecksum(); + assertTrue(containerChecksumFileExists(containerID)); + // Corrupt the container. corruption.applyTo(container); // This method will check that reading from the corrupted key returns an @@ -110,7 +117,9 @@ void testCorruptionDetected(TestContainerCorruptions corruption) 500, 5000); // Wait for SCM to get a report of the unhealthy replica. 
- waitForScmToSeeUnhealthyReplica(containerID); + waitForScmToSeeReplicaState(containerID, UNHEALTHY); corruption.assertLogged(containerID, 1, logCapturer); + long newReportedDataChecksum = getContainerReplica(containerID).getDataChecksum(); + assertNotEquals(initialReportedDataChecksum, newReportedDataChecksum); } } From 452c2940559a92c5d01b8760b9603c116630f676 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Wed, 8 Jan 2025 14:16:47 -0500 Subject: [PATCH 10/16] Update acceptance test --- .../main/smoketest/admincli/container.robot | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot index ec77b1a8566..307fb243195 100644 --- a/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot +++ b/hadoop-ozone/dist/src/main/smoketest/admincli/container.robot @@ -34,11 +34,14 @@ Container is closed ${output} = Execute ozone admin container info "${container}" Should contain ${output} CLOSED -Reconciliation complete - [arguments] ${container} - ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 - Should not be empty ${data_checksum} - Should not be equal as strings 0 ${data_checksum} +Container checksums should match + [arguments] ${container} ${expected_checksum} + ${data_checksum1} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[0].dataChecksum' | head -n1 + ${data_checksum2} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[1].dataChecksum' | head -n1 + ${data_checksum3} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[2].dataChecksum' | head -n1 + Should be equal as strings ${data_checksum1} ${expected_checksum} + Should be equal as strings ${data_checksum2} ${expected_checksum} + Should be equal as strings ${data_checksum3} ${expected_checksum} *** Test 
Cases *** Create container @@ -122,9 +125,8 @@ Cannot reconcile open container ${container} = Execute ozone admin container list --state OPEN | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -n1 Execute and check rc ozone admin container reconcile "${container}" 255 # The container should not yet have any replica checksums since it is still open. - ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 # 0 is the hex value of an empty checksum. - Should Be Equal As Strings 0 ${data_checksum} + Container checksums should match ${container} 0 Close container ${container} = Execute ozone admin container list --state OPEN | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -1 @@ -137,10 +139,10 @@ Close container Reconcile closed container ${container} = Execute ozone admin container list --state CLOSED | jq -r 'select(.replicationConfig.replicationFactor == "THREE") | .containerID' | head -1 - # TODO wait for container close to populate the checksum. ${data_checksum} = Execute ozone admin container info "${container}" --json | jq -r '.replicas[].dataChecksum' | head -n1 - # 0 is the hex value of an empty checksum. - Should Be Equal As Strings 0 ${data_checksum} - # When reconciliation finishes, replica checksums should be shown. + # Once the container is closed, the data checksum should be populated + Should Not Be Equal As Strings 0 ${data_checksum} + Container checksums should match ${container} ${data_checksum} + # Check that reconcile CLI returns success. 
Without fault injection, there is no change expected to the + # container's checksums to indicate it made a difference Execute ozone admin container reconcile ${container} - Wait until keyword succeeds 1min 5sec Reconciliation complete ${container} From dc45eca6f871a48f9c24061d7fdbfea8dc09b3a9 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Wed, 8 Jan 2025 14:22:49 -0500 Subject: [PATCH 11/16] Add javadoc for tree generation from metadata --- .../hadoop/ozone/container/keyvalue/KeyValueHandler.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java index 4babdd144f8..45813747737 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java @@ -547,6 +547,15 @@ ContainerCommandResponseProto handleCloseContainer( return getSuccessResponse(request); } + /** + * Write the merkle tree for this container using the existing checksum metadata only. The data is not read or + * validated by this method, so it is expected to run quickly. + * + * If a checksum file already exists on the disk, this method will do nothing. The existing file would have either + * been made from the metadata or data itself so there is no need to recreate it from the metadata. + * + * @param container The container which will have a tree generated.
+ */ private void createContainerMerkleTree(Container container) { if (ContainerChecksumTreeManager.checksumFileExist(container)) { return; From 1cb291f2e0e76e327f0920a018b773e89b49301e Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Wed, 8 Jan 2025 15:49:37 -0500 Subject: [PATCH 12/16] Data integration tests passing --- .../checksum/ContainerChecksumTreeManager.java | 9 +++++++-- .../ozoneimpl/BackgroundContainerDataScanner.java | 8 +++++++- .../ozoneimpl/OnDemandContainerDataScanner.java | 7 ++++++- ...ackgroundContainerMetadataScannerIntegration.java | 12 +++++++----- .../TestOnDemandContainerDataScannerIntegration.java | 5 ++++- 5 files changed, 31 insertions(+), 10 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java index 036208efeb2..298742eda4d 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerChecksumTreeManager.java @@ -89,6 +89,9 @@ public void writeContainerDataTree(ContainerData data, ContainerMerkleTree tree) long containerID = data.getContainerID(); Lock writeLock = getLock(containerID); writeLock.lock(); + // If there is an error generating the tree and we cannot obtain a final checksum, use 0 to indicate a metadata + // failure. 
+ long dataChecksum = 0; try { ContainerProtos.ContainerChecksumInfo.Builder checksumInfoBuilder = null; try { @@ -103,14 +106,16 @@ public void writeContainerDataTree(ContainerData data, ContainerMerkleTree tree) ContainerProtos.ContainerMerkleTree treeProto = captureLatencyNs(metrics.getCreateMerkleTreeLatencyNS(), tree::toProto); - long dataChecksum = treeProto.getDataChecksum(); - data.setDataChecksum(dataChecksum); checksumInfoBuilder .setContainerID(containerID) .setContainerMerkleTree(treeProto); write(data, checksumInfoBuilder.build()); + // If write succeeds, update the checksum in memory. Otherwise 0 will be used to indicate the metadata failure. + dataChecksum = treeProto.getDataChecksum(); LOG.debug("Data merkle tree for container {} updated with container checksum {}", containerID, dataChecksum); } finally { + // Even if persisting the tree fails, we should still update the data checksum in memory to report back to SCM. + data.setDataChecksum(dataChecksum); writeLock.unlock(); } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerDataScanner.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerDataScanner.java index bf4d88626d8..f3d34279ddf 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerDataScanner.java @@ -92,6 +92,13 @@ public void scanContainer(Container c) if (result.isDeleted()) { LOG.debug("Container [{}] has been deleted during the data scan.", containerId); } else { + // Merkle tree write failure should not abort the scanning process. Continue marking the scan as completed. 
+ try { + checksumManager.writeContainerDataTree(containerData, result.getDataTree()); + } catch (IOException ex) { + LOG.error("Failed to write container merkle tree for container {}", containerId, ex); + } + if (!result.isHealthy()) { logUnhealthyScanResult(containerId, result, LOG); @@ -104,7 +111,6 @@ public void scanContainer(Container c) } } metrics.incNumContainersScanned(); - checksumManager.writeContainerDataTree(containerData, result.getDataTree()); } Instant now = Instant.now(); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OnDemandContainerDataScanner.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OnDemandContainerDataScanner.java index 427104d9a73..eb7fe82ed5b 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OnDemandContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OnDemandContainerDataScanner.java @@ -141,6 +141,12 @@ private static void performOnDemandScan(Container container) { if (result.isDeleted()) { LOG.debug("Container [{}] has been deleted during the data scan.", containerId); } else { + // Merkle tree write failure should not abort the scanning process. Continue marking the scan as completed. 
+ try { + instance.checksumManager.writeContainerDataTree(containerData, result.getDataTree()); + } catch (IOException ex) { + LOG.error("Failed to write container merkle tree for container {}", containerId, ex); + } if (!result.isHealthy()) { logUnhealthyScanResult(containerId, result, LOG); boolean containerMarkedUnhealthy = instance.containerController @@ -150,7 +156,6 @@ private static void performOnDemandScan(Container container) { } } instance.metrics.incNumContainersScanned(); - instance.checksumManager.writeContainerDataTree(containerData, result.getDataTree()); } Instant now = Instant.now(); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerMetadataScannerIntegration.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerMetadataScannerIntegration.java index 9c21aa5836c..ed92a61a846 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerMetadataScannerIntegration.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerMetadataScannerIntegration.java @@ -98,14 +98,14 @@ void testCorruptionDetected(TestContainerCorruptions corruption) assertEquals(State.CLOSED, closedContainer.getContainerState()); assertTrue(containerChecksumFileExists(closedContainerID)); waitForScmToSeeReplicaState(closedContainerID, CLOSED); - long initialReportedClosedChecksum = getContainerReplica(closedContainerID).getDataChecksum(); + long initialClosedChecksum = getContainerReplica(closedContainerID).getDataChecksum(); + assertNotEquals(0, initialClosedChecksum); long openContainerID = writeDataToOpenContainer(); Container openContainer = getDnContainer(openContainerID); assertEquals(State.OPEN, openContainer.getContainerState()); - long initialReportedOpenChecksum = getContainerReplica(openContainerID).getDataChecksum(); // Open containers should not 
yet have a checksum generated. - assertEquals(0, initialReportedOpenChecksum); + assertEquals(0, getContainerReplica(openContainerID).getDataChecksum()); // Corrupt both containers. corruption.applyTo(closedContainer); @@ -120,10 +120,12 @@ void testCorruptionDetected(TestContainerCorruptions corruption) 500, 5000); // Wait for SCM to get reports of the unhealthy replicas. + // The metadata scanner does not generate data checksums and the other scanners have been turned off for this + // test, so the data checksums should not change. waitForScmToSeeReplicaState(closedContainerID, UNHEALTHY); - assertNotEquals(initialReportedClosedChecksum, getContainerReplica(closedContainerID).getDataChecksum()); + assertEquals(initialClosedChecksum, getContainerReplica(closedContainerID).getDataChecksum()); waitForScmToSeeReplicaState(openContainerID, UNHEALTHY); - assertNotEquals(initialReportedOpenChecksum, getContainerReplica(openContainerID).getDataChecksum()); + assertEquals(0, getContainerReplica(openContainerID).getDataChecksum()); // Once the unhealthy replica is reported, the open container's lifecycle // state in SCM should move to closed. 
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerDataScannerIntegration.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerDataScannerIntegration.java index 36a92e87045..99e5967f38a 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerDataScannerIntegration.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerDataScannerIntegration.java @@ -34,6 +34,7 @@ import java.util.Collection; +import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.CLOSED; import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.UNHEALTHY; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotEquals; @@ -102,9 +103,11 @@ void testCorruptionDetected(TestContainerCorruptions corruption) // Container corruption has not yet been introduced. Container container = getDnContainer(containerID); assertEquals(State.CLOSED, container.getContainerState()); - long initialReportedDataChecksum = getContainerReplica(containerID).getDataChecksum(); assertTrue(containerChecksumFileExists(containerID)); + waitForScmToSeeReplicaState(containerID, CLOSED); + long initialReportedDataChecksum = getContainerReplica(containerID).getDataChecksum(); + // Corrupt the container. 
corruption.applyTo(container); // This method will check that reading from the corrupted key returns an From d6b21d255eba6228a7cd20f07370371cb60bb0a6 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Wed, 8 Jan 2025 21:50:10 -0500 Subject: [PATCH 13/16] Don't generate tree from metadata for unhealthy container Metadata scanner integration tests are passing --- .../apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java | 1 - 1 file changed, 1 deletion(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java index 45813747737..962501e454d 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java @@ -1358,7 +1358,6 @@ public void markContainerUnhealthy(Container container, ScanResult reason) } finally { container.writeUnlock(); } - createContainerMerkleTree(container); // Even if the container file is corrupted/missing and the unhealthy // update fails, the unhealthy state is kept in memory and sent to // SCM. Write a corresponding entry to the container log as well. 
From 2a2dbbd76631f66e1299c5b449b70d5118900ef0 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Wed, 8 Jan 2025 21:53:06 -0500 Subject: [PATCH 14/16] Checkstyle --- .../hadoop/ozone/container/checksum/ContainerMerkleTree.java | 1 - .../ozone/container/checksum/ContainerMerkleTreeTestUtils.java | 2 -- .../ozone/container/keyvalue/TestKeyValueContainerCheck.java | 1 - .../ozoneimpl/TestBackgroundContainerDataScanner.java | 1 - .../scanner/TestBackgroundContainerDataScannerIntegration.java | 3 --- .../dn/scanner/TestContainerScannerIntegrationAbstract.java | 1 - .../scanner/TestOnDemandContainerDataScannerIntegration.java | 1 - 7 files changed, 10 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTree.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTree.java index 651f6fbb0a7..1a8d78bf793 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTree.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTree.java @@ -22,7 +22,6 @@ import org.apache.ratis.thirdparty.com.google.protobuf.ByteString; import java.nio.ByteBuffer; -import java.util.Collection; import java.util.SortedMap; import java.util.TreeMap; diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeTestUtils.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeTestUtils.java index 140fff1517b..e568e605101 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeTestUtils.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/checksum/ContainerMerkleTreeTestUtils.java @@ -23,7 +23,6 @@ import 
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; import org.apache.hadoop.hdds.scm.OzoneClientConfig; import org.apache.hadoop.hdds.scm.ScmConfigKeys; -import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.ozone.HddsDatanodeService; import org.apache.hadoop.ozone.container.common.impl.ContainerData; import org.apache.hadoop.ozone.container.common.interfaces.Container; @@ -35,7 +34,6 @@ import java.io.FileOutputStream; import java.io.IOException; import java.nio.ByteBuffer; -import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainerCheck.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainerCheck.java index e4d0206bd5d..d88b400c8d0 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainerCheck.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueContainerCheck.java @@ -26,7 +26,6 @@ import org.apache.hadoop.hdfs.util.DataTransferThrottler; import org.apache.hadoop.ozone.container.checksum.ContainerChecksumTreeManager; import org.apache.hadoop.ozone.container.checksum.ContainerDiffReport; -import org.apache.hadoop.ozone.container.checksum.ContainerMerkleTree; import org.apache.hadoop.ozone.container.common.helpers.BlockData; import org.apache.hadoop.ozone.container.common.impl.ContainerLayoutVersion; import org.apache.hadoop.ozone.container.common.interfaces.BlockIterator; diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java index 576a0477be4..a03d32a9681 100644 
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java @@ -19,7 +19,6 @@ */ package org.apache.hadoop.ozone.container.ozoneimpl; -import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdfs.util.Canceler; import org.apache.hadoop.hdfs.util.DataTransferThrottler; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java index b4910278257..0f29f4179e7 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java @@ -20,10 +20,7 @@ package org.apache.hadoop.ozone.dn.scanner; import org.apache.hadoop.hdds.conf.OzoneConfiguration; -import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; -import org.apache.hadoop.ozone.container.checksum.ContainerMerkleTreeTestUtils; import org.apache.hadoop.ozone.container.common.interfaces.Container; import org.apache.hadoop.ozone.container.common.utils.ContainerLogger; import org.apache.hadoop.ozone.container.keyvalue.TestContainerCorruptions; diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java 
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java index c2befa60d74..4a0fec53442 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java @@ -60,7 +60,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; /** * This class tests the data scanner functionality. diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerDataScannerIntegration.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerDataScannerIntegration.java index 99e5967f38a..82fc1b42f32 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerDataScannerIntegration.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerDataScannerIntegration.java @@ -21,7 +21,6 @@ import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; import org.apache.hadoop.ozone.container.common.interfaces.Container; import org.apache.hadoop.ozone.container.common.utils.ContainerLogger; import org.apache.hadoop.ozone.container.keyvalue.TestContainerCorruptions; From c9a077c2d00ecf112018715fe4e0a32b5b086543 Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Wed, 8 Jan 2025 22:35:25 -0500 Subject: [PATCH 15/16] Marking container unhealthy should not write a merkle tree (test fix) --- 
.../keyvalue/TestKeyValueHandlerWithUnhealthyContainer.java | 1 - 1 file changed, 1 deletion(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandlerWithUnhealthyContainer.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandlerWithUnhealthyContainer.java index af0c430c86d..9652e37a448 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandlerWithUnhealthyContainer.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandlerWithUnhealthyContainer.java @@ -255,7 +255,6 @@ public void testMarkContainerUnhealthyInFailedVolume() throws IOException { // unhealthy. hddsVolume.setState(StorageVolume.VolumeState.NORMAL); handler.markContainerUnhealthy(container, ContainerTestUtils.getUnhealthyDataScanResult()); - assertTrue(ContainerChecksumTreeManager.checksumFileExist(container)); verify(mockIcrSender, atMostOnce()).send(any()); } From 0bbbdc59c370515d37a3dd7ba9740f022024ccff Mon Sep 17 00:00:00 2001 From: Ethan Rose Date: Thu, 9 Jan 2025 09:38:33 -0500 Subject: [PATCH 16/16] Checkstyle --- .../keyvalue/TestKeyValueHandlerWithUnhealthyContainer.java | 1 - 1 file changed, 1 deletion(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandlerWithUnhealthyContainer.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandlerWithUnhealthyContainer.java index 9652e37a448..b9911874b16 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandlerWithUnhealthyContainer.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/keyvalue/TestKeyValueHandlerWithUnhealthyContainer.java @@ -64,7 +64,6 @@ import static 
org.apache.hadoop.ozone.container.ContainerTestHelper.getWriteChunkRequest; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.any; import static org.mockito.Mockito.atMostOnce; import static org.mockito.Mockito.mock;