add digit swap detection solution

treasure-data · Dec 30, 2024 · 4658ab0 · 4658ab0
1 parent 3a7624b
commit 4658ab0
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 0 deletions.
diff --git a/data-box/digit_swap/README.md b/data-box/digit_swap/README.md
@@ -0,0 +1,22 @@
+# Consecutive digit swap
+
+----
+## Overview
+
+This project provides a solution for detecting consecutive digit swaps in phone numbers.
+E.g.: 070 1234 5678 vs 070 2134 5678
+
+----
+## Implementation
+1. Modify this code to run in a Python custom script container. (https://docs.treasuredata.com/articles/#!pd/python-custom-scripting-example)
+2. Copy and paste the code into a custom script in Treasure Workflows.
+
+----
+## Considerations
+
+The same approach can be used to detect consecutive character swaps in other fields, e.g., email addresses or usernames.
+
+----
+## Questions
+
+Please feel free to reach out to apac-se@treasure-data.com with any questions you have about using this code.
diff --git a/data-box/digit_swap/digit_swap.py b/data-box/digit_swap/digit_swap.py
@@ -0,0 +1,37 @@
+import pandas as pd
+
+def check_consecutive_digit_swap():
+    df = pd.read_csv('data.csv', dtype=str)
+    df = df.reset_index()  # Make sure indexes pair with number of rows.
+    cnt = 0
+
+    # init result csv
+    f = open('res.csv','w+')
+    f.write('phone1,phone2\n')
+    f.close()
+
+    f = open('res.csv', 'w')
+
+    for index, row in df.iterrows():
+        phone1 = row['ph1']
+        phone2 = row['ph2']
+
+        # Check if lengths are the same.
+        if len(phone1) == len(phone2):
+
+            # Find differing positions.
+            differing_positions = [i for i in range(len(phone1)) if phone1[i] != phone2[i]]
+
+            # Check if there are exactly two differing positions, and that they are consecutive.
+            if len(differing_positions) == 2:
+                i, j = differing_positions
+                if (j == i + 1 and phone1[i] == phone2[j] and phone1[j] == phone2[i]):
+                    cnt = cnt + 1
+                    f.write(phone1 + ',' + phone2 + '\n')
+                    #print(phone1, phone2, (j == i + 1
+                    #    and phone1[i] == phone2[j]
+                    #    and phone1[j] == phone2[i]))
+    print(str(cnt))
+    f.close()
+
+check_consecutive_digit_swap()  # Runs the check as soon as this file is executed or imported.