Skip to content

Commit

Permalink
Makes duplicate detector more configurable and adds integration test
Browse files Browse the repository at this point in the history
Signed-off-by: Christopher Grote <cmgrote@users.noreply.github.com>
  • Loading branch information
cmgrote committed Nov 30, 2023
1 parent 40499d0 commit bdd00a3
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import com.atlan.model.assets.IColumn
import com.atlan.model.assets.MaterializedView
import com.atlan.model.assets.Table
import com.atlan.model.assets.View
import com.atlan.model.enums.AtlanIcon
import com.atlan.model.enums.CertificateStatus
import com.atlan.model.search.CompoundQuery
import com.atlan.pkg.Utils
Expand All @@ -22,8 +23,6 @@ import java.util.concurrent.atomic.AtomicLong
object DuplicateDetector {
private val logger = KotlinLogging.logger {}

private const val GLOSSARY_NAME = "Duplicate assets"

data class AssetKey(val typeName: String, val qualifiedName: String, val guid: String)

private val hashToAssetKeys = ConcurrentHashMap<Int, MutableSet<AssetKey>>()
Expand All @@ -34,6 +33,7 @@ object DuplicateDetector {
fun main(args: Array<String>) {
val config = Utils.setPackageOps<DuplicateDetectorCfg>()

val glossaryName = Utils.getOrDefault(config.glossaryName, "Duplicate assets")
val qnPrefix = Utils.getOrDefault(config.qnPrefix, "default")
val types =
Utils.getOrDefault(config.assetTypes, listOf(Table.TYPE_NAME, View.TYPE_NAME, MaterializedView.TYPE_NAME))
Expand All @@ -44,7 +44,7 @@ object DuplicateDetector {
}
findAssets(qnPrefix, types, batchSize)

val glossaryQN = glossaryForDuplicates()
val glossaryQN = glossaryForDuplicates(glossaryName)
termsForDuplicates(glossaryQN, batchSize)
}

Expand Down Expand Up @@ -94,14 +94,16 @@ object DuplicateDetector {
/**
* Idempotently create (or fetch) a glossary to capture the duplicate assets.
*
* @param glossaryName name of the glossary
* @return the qualifiedName of the glossary
*/
fun glossaryForDuplicates(): String {
fun glossaryForDuplicates(glossaryName: String): String {
return try {
Glossary.findByName(GLOSSARY_NAME).qualifiedName
Glossary.findByName(glossaryName).qualifiedName
} catch (e: NotFoundException) {
val glossary = Glossary.creator(GLOSSARY_NAME)
.description("Glossary whose terms represent potential duplicate assets.")
val glossary = Glossary.creator(glossaryName)
.assetIcon(AtlanIcon.COPY)
.userDescription("Each term represents a set of potential duplicate assets, based on assets that have the same set of columns (case-insensitive, in any order). The assets that are potential duplicates of each other are all linked to the same term.")
.build()
logger.info { "Creating glossary to hold duplicates." }
glossary.save().getResult(glossary).qualifiedName
Expand Down Expand Up @@ -140,7 +142,7 @@ object DuplicateDetector {
} catch (e: NotFoundException) {
val toCreate = GlossaryTerm.creator(termName, glossaryQN)
.description(
"Assets with the same set of ${columns?.size} columns:\n" + columns?.joinToString(
"Assets with the same set of ${columns?.size} columns:\n" + columns?.joinToString(
separator = "\n",
) { "- $it" },
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import javax.annotation.processing.Generated
@Generated("com.atlan.pkg.CustomPackage")
@JsonAutoDetect(fieldVisibility = JsonAutoDetect.Visibility.ANY)
data class DuplicateDetectorCfg(
@JsonProperty("glossary_name") val glossaryName: String?,
@JsonProperty("qn_prefix") val qnPrefix: String?,
@JsonProperty("control_config_strategy") val controlConfigStrategy: String?,
@JsonDeserialize(using = WidgetSerde.MultiSelectDeserializer::class)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import com.atlan.Atlan
import com.atlan.pkg.CustomPackage
import com.atlan.pkg.config.model.ui.UIConfig
import com.atlan.pkg.config.model.ui.UIRule
import com.atlan.pkg.config.model.ui.UIStep
import com.atlan.pkg.config.model.workflow.WorkflowOutputs
import com.atlan.pkg.config.widgets.DropDown
Expand All @@ -24,12 +25,19 @@ object PackageConfig : CustomPackage(
title = "Configuration",
description = "Duplicate detection configuration",
inputs = mapOf(
"glossary_name" to TextInput(
label = "Glossary name",
required = true,
help = "Name for the glossary where the duplicate sets of assets will be recorded and tracked.",
placeholder = "Duplicate assets",
grid = 4,
),
"qn_prefix" to TextInput(
label = "Qualified name prefix",
required = false,
help = "Starting value for a qualifiedName that will determine which assets to check for duplicates.",
placeholder = "default",
grid = 6,
grid = 4,
),
"control_config_strategy" to Radio(
label = "Options",
Expand All @@ -43,7 +51,7 @@ object PackageConfig : CustomPackage(
),
"asset_types" to DropDown(
label = "Asset types",
required = true,
required = false,
possibleValues = mapOf(
"Table" to "Table",
"View" to "View",
Expand All @@ -56,6 +64,12 @@ object PackageConfig : CustomPackage(
),
),
),
rules = listOf(
UIRule(
whenInputs = mapOf("control_config_strategy" to "advanced"),
required = listOf("asset_types"),
),
),
),
containerImage = "ghcr.io/atlanhq/csa-duplicate-detector:${Atlan.VERSION}",
containerImagePullPolicy = "Always",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/* SPDX-License-Identifier: Apache-2.0
Copyright 2023 Atlan Pte. Ltd. */
import com.atlan.Atlan
import com.atlan.model.assets.Glossary
import com.atlan.model.assets.GlossaryTerm
import com.atlan.model.assets.MaterializedView
import com.atlan.model.assets.Table
import com.atlan.model.assets.View
import com.atlan.model.enums.AtlanDeleteType
import com.atlan.model.enums.CertificateStatus
import com.atlan.pkg.PackageTest
import org.testng.Assert.assertTrue
import org.testng.ITestContext
import org.testng.annotations.AfterClass
import org.testng.annotations.BeforeClass
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertNotNull

/**
* Test detection of duplicate assets.
*/
class DuplicateDetectorTest : PackageTest() {

private val testId = makeUnique("dupdetect")
private val files = listOf(
"debug.log",
)
private val glossaryName = testId

@BeforeClass
fun beforeClass() {
setup(
DuplicateDetectorCfg(
glossaryName = glossaryName,
qnPrefix = "default/snowflake",
controlConfigStrategy = "default",
assetTypes = listOf(Table.TYPE_NAME, View.TYPE_NAME, MaterializedView.TYPE_NAME),
),
)
DuplicateDetector.main(arrayOf())
}

@Test
fun glossaryCreated() {
val glossary = Glossary.findByName(glossaryName)
assertNotNull(glossary)
assertEquals(testId, glossary.name)
}

@Test
fun termsCreated() {
val glossaryQN = Glossary.findByName(glossaryName).qualifiedName!!
val terms = GlossaryTerm.select()
.where(GlossaryTerm.ANCHOR.eq(glossaryQN))
.includeOnResults(GlossaryTerm.DESCRIPTION)
.includeOnResults(GlossaryTerm.ASSIGNED_ENTITIES)
.includeOnResults(GlossaryTerm.CERTIFICATE_STATUS)
.stream()
.toList()
assertTrue(terms.size > 0)
terms.forEach { term ->
term as GlossaryTerm
assertTrue(term.name.startsWith("Dup. ("))
assertTrue(term.assignedEntities.size > 0)
assertTrue(term.description.startsWith("Assets with the same set of"))
assertEquals(CertificateStatus.DRAFT, term.certificateStatus)
}
}

@Test
fun filesCreated() {
validateFilesExist(files)
}

@Test
fun errorFreeLog() {
validateErrorFreeLog()
}

@AfterClass(alwaysRun = true)
fun afterClass(context: ITestContext) {
val glossary = Glossary.findByName(glossaryName)!!
val terms = GlossaryTerm.select()
.where(GlossaryTerm.ANCHOR.eq(glossary.qualifiedName))
.stream()
.map { it.guid }
.toList()
Atlan.getDefaultClient().assets.delete(terms, AtlanDeleteType.PURGE)
Glossary.purge(Glossary.findByName(glossaryName).guid)
teardown(context.failedTests.size() > 0)
}
}

0 comments on commit bdd00a3

Please sign in to comment.