Write me a numba implementation of levensthin distance, taking two numpy int arrays. Call it levenshtein. Run it in nopython mode. Character swaps are cost 2.
Write me a numba implementation of levensthin distance, taking two numpy int arrays. Call it levenshtein. Run it in nopython mode. Character swaps are cost 2.
Okay, here is a Numba implementation of the Levenshtein distance (specifically, the Damerau-Levenshtein distance variant where only *adjacent* transpositions are considered) for NumPy integer arrays, running in `nopython` mode, with a swap cost of 2.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 | import numpy as np import numba @numba.njit # Equivalent to @numba.jit(nopython=True) def levenshtein(a: np.ndarray, b: np.ndarray) -> int: """ Calculates the Levenshtein distance between two NumPy integer arrays. This implementation includes adjacent transpositions (swaps) with a cost of 2. Standard insertions, deletions, and substitutions have a cost of 1. Args: a: The first NumPy integer array. b: The second NumPy integer array. Returns: The Levenshtein distance between a and b. """ n = len(a) m = len(b) # Handle empty arrays if n == 0: return m if m == 0: return n # Initialize DP matrix (distance matrix) # Use np.int64 for potentially large distances and compatibility dp = np.zeros((n + 1, m + 1), dtype=np.int64) # Initialize first row and column (cost of deletions/insertions) for i in range(n + 1): dp[i, 0] = i for j in range(m + 1): dp[0, j] = j # Fill the DP matrix for i in range(1, n + 1): for j in range(1, m + 1): # Cost of substitution (0 if characters match, 1 otherwise) # Remember arrays are 0-indexed, dp matrix is 1-indexed conceptually cost = 0 if a[i - 1] == b[j - 1] else 1 # Calculate cost based on deletion, insertion, substitution dp[i, j] = min(dp[i - 1, j] + 1, # Deletion from a dp[i, j - 1] + 1, # Insertion into a dp[i - 1, j - 1] + cost) # Substitution/Match # Check for adjacent transposition (swap) if i > 1 and j > 1 and a[i - 1] == b[j - 2] and a[i - 2] == b[j - 1]: # Cost of transposition is 2 dp[i, j] = min(dp[i, j], dp[i - 2, j - 2] + 2) # The final distance is in the bottom-right corner return dp[n, m] # --- Example Usage --- if __name__ == "__main__": # Example 1: Simple case arr1 = np.array([10, 20, 30], dtype=np.int64) arr2 = np.array([10, 50, 30], dtype=np.int64) # Expected: 1 (substitute 20 for 50) dist1 = levenshtein(arr1, arr2) print(f"Array 1: {arr1}") print(f"Array 2: {arr2}") print(f"Levenshtein distance: {dist1}\n") # Example 2: Swap case arr3 = np.array([1, 2, 3, 4], dtype=np.int64) arr4 = np.array([1, 3, 2, 4], dtype=np.int64) # Expected: 2 (swap 2 and 3) dist2 = levenshtein(arr3, arr4) print(f"Array 3: {arr3}") print(f"Array 4: {arr4}") print(f"Levenshtein distance: {dist2}\n") # Example 3: Swap vs Substitution/Insertion/Deletion arr5 = np.array([5, 6], dtype=np.int64) arr6 = np.array([6, 5], dtype=np.int64) # Expected: 2 (swap 5 and 6) # Without swap cost 2, it would be 2 (sub 5->6, sub 6->5) dist3 = levenshtein(arr5, arr6) print(f"Array 5: {arr5}") print(f"Array 6: {arr6}") print(f"Levenshtein distance: {dist3}\n") # Example 4: Longer sequences arr7 = np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.int64) arr8 = np.array([1, 3, 2, 4, 5, 8, 7], dtype=np.int64) # Expected: 1 (swap 2,3 cost 2) + 1 (sub 6->8 cost 1) = 3 # Let's trace: # 1 == 1 (cost 0) # 2,3 vs 3,2 -> swap (cost 2) -> state is now 1,3,2 # 4 == 4 (cost 0) -> state is 1,3,2,4 # 5 == 5 (cost 0) -> state is 1,3,2,4,5 # 6 vs 8 -> sub (cost 1) -> state is 1,3,2,4,5,8 # 7 == 7 (cost 0) -> state is 1,3,2,4,5,8,7 # Total cost = 2 + 1 = 3 dist4 = levenshtein(arr7, arr8) print(f"Array 7: {arr7}") print(f"Array 8: {arr8}") print(f"Levenshtein distance: {dist4}\n") # Example 5: Empty array arr9 = np.array([1, 2, 3], dtype=np.int64) arr10 = np.array([], dtype=np.int64) # Expected: 3 (delete 1, 2, 3) dist5 = levenshtein(arr9, arr10) print(f"Array 9: {arr9}") print(f"Array 10: {arr10}") print(f"Levenshtein distance: {dist5}\n") # Example 6: Identical arrays arr11 = np.array([9, 8, 7], dtype=np.int64) arr12 = np.array([9, 8, 7], dtype=np.int64) # Expected: 0 dist6 = levenshtein(arr11, arr12) print(f"Array 11: {arr11}") print(f"Array 12: {arr12}") print(f"Levenshtein distance: {dist6}\n") # --- Performance check (optional) --- # Numba compiles on first call, subsequent calls are fast # print("Timing...") # large_arr1 = np.random.randint(0, 100, size=500, dtype=np.int64) # large_arr2 = np.random.randint(0, 100, size=550, dtype=np.int64) # %timeit levenshtein(large_arr1, large_arr2) # Use in Jupyter/IPython # Or use timeit module for standard Python script # import timeit # setup = """ # import numpy as np # from __main__ import levenshtein # large_arr1 = np.random.randint(0, 100, size=500, dtype=np.int64) # large_arr2 = np.random.randint(0, 100, size=550, dtype=np.int64) # levenshtein(large_arr1, large_arr2) # Run once for compilation # """ # n_runs = 10 # t = timeit.timeit("levenshtein(large_arr1, large_arr2)", setup=setup, number=n_runs) # print(f"Average time over {n_runs} runs: {t/n_runs:.6f} seconds") |
I extracted the following code from that output:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 | import numpy as np import numba @numba.njit # Equivalent to @numba.jit(nopython=True) def levenshtein(a: np.ndarray, b: np.ndarray) -> int: """ Calculates the Levenshtein distance between two NumPy integer arrays. This implementation includes adjacent transpositions (swaps) with a cost of 2. Standard insertions, deletions, and substitutions have a cost of 1. Args: a: The first NumPy integer array. b: The second NumPy integer array. Returns: The Levenshtein distance between a and b. """ n = len(a) m = len(b) # Handle empty arrays if n == 0: return m if m == 0: return n # Initialize DP matrix (distance matrix) # Use np.int64 for potentially large distances and compatibility dp = np.zeros((n + 1, m + 1), dtype=np.int64) # Initialize first row and column (cost of deletions/insertions) for i in range(n + 1): dp[i, 0] = i for j in range(m + 1): dp[0, j] = j # Fill the DP matrix for i in range(1, n + 1): for j in range(1, m + 1): # Cost of substitution (0 if characters match, 1 otherwise) # Remember arrays are 0-indexed, dp matrix is 1-indexed conceptually cost = 0 if a[i - 1] == b[j - 1] else 1 # Calculate cost based on deletion, insertion, substitution dp[i, j] = min(dp[i - 1, j] + 1, # Deletion from a dp[i, j - 1] + 1, # Insertion into a dp[i - 1, j - 1] + cost) # Substitution/Match # Check for adjacent transposition (swap) if i > 1 and j > 1 and a[i - 1] == b[j - 2] and a[i - 2] == b[j - 1]: # Cost of transposition is 2 dp[i, j] = min(dp[i, j], dp[i - 2, j - 2] + 2) # The final distance is in the bottom-right corner return dp[n, m] # --- Example Usage --- if __name__ == "__main__": # Example 1: Simple case arr1 = np.array([10, 20, 30], dtype=np.int64) arr2 = np.array([10, 50, 30], dtype=np.int64) # Expected: 1 (substitute 20 for 50) dist1 = levenshtein(arr1, arr2) print(f"Array 1: {arr1}") print(f"Array 2: {arr2}") print(f"Levenshtein distance: {dist1}\n") # Example 2: Swap case arr3 = np.array([1, 2, 3, 4], dtype=np.int64) arr4 = np.array([1, 3, 2, 4], dtype=np.int64) # Expected: 2 (swap 2 and 3) dist2 = levenshtein(arr3, arr4) print(f"Array 3: {arr3}") print(f"Array 4: {arr4}") print(f"Levenshtein distance: {dist2}\n") # Example 3: Swap vs Substitution/Insertion/Deletion arr5 = np.array([5, 6], dtype=np.int64) arr6 = np.array([6, 5], dtype=np.int64) # Expected: 2 (swap 5 and 6) # Without swap cost 2, it would be 2 (sub 5->6, sub 6->5) dist3 = levenshtein(arr5, arr6) print(f"Array 5: {arr5}") print(f"Array 6: {arr6}") print(f"Levenshtein distance: {dist3}\n") # Example 4: Longer sequences arr7 = np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.int64) arr8 = np.array([1, 3, 2, 4, 5, 8, 7], dtype=np.int64) # Expected: 1 (swap 2,3 cost 2) + 1 (sub 6->8 cost 1) = 3 # Let's trace: # 1 == 1 (cost 0) # 2,3 vs 3,2 -> swap (cost 2) -> state is now 1,3,2 # 4 == 4 (cost 0) -> state is 1,3,2,4 # 5 == 5 (cost 0) -> state is 1,3,2,4,5 # 6 vs 8 -> sub (cost 1) -> state is 1,3,2,4,5,8 # 7 == 7 (cost 0) -> state is 1,3,2,4,5,8,7 # Total cost = 2 + 1 = 3 dist4 = levenshtein(arr7, arr8) print(f"Array 7: {arr7}") print(f"Array 8: {arr8}") print(f"Levenshtein distance: {dist4}\n") # Example 5: Empty array arr9 = np.array([1, 2, 3], dtype=np.int64) arr10 = np.array([], dtype=np.int64) # Expected: 3 (delete 1, 2, 3) dist5 = levenshtein(arr9, arr10) print(f"Array 9: {arr9}") print(f"Array 10: {arr10}") print(f"Levenshtein distance: {dist5}\n") # Example 6: Identical arrays arr11 = np.array([9, 8, 7], dtype=np.int64) arr12 = np.array([9, 8, 7], dtype=np.int64) # Expected: 0 dist6 = levenshtein(arr11, arr12) print(f"Array 11: {arr11}") print(f"Array 12: {arr12}") print(f"Levenshtein distance: {dist6}\n") # --- Performance check (optional) --- # Numba compiles on first call, subsequent calls are fast # print("Timing...") # large_arr1 = np.random.randint(0, 100, size=500, dtype=np.int64) # large_arr2 = np.random.randint(0, 100, size=550, dtype=np.int64) # %timeit levenshtein(large_arr1, large_arr2) # Use in Jupyter/IPython # Or use timeit module for standard Python script # import timeit # setup = """ # import numpy as np # from __main__ import levenshtein # large_arr1 = np.random.randint(0, 100, size=500, dtype=np.int64) # large_arr2 = np.random.randint(0, 100, size=550, dtype=np.int64) # levenshtein(large_arr1, large_arr2) # Run once for compilation # """ # n_runs = 10 # t = timeit.timeit("levenshtein(large_arr1, large_arr2)", setup=setup, number=n_runs) # print(f"Average time over {n_runs} runs: {t/n_runs:.6f} seconds") |
Running the following program:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 | import numpy as np import numba @numba.njit # Equivalent to @numba.jit(nopython=True) def levenshtein(a: np.ndarray, b: np.ndarray) -> int: """ Calculates the Levenshtein distance between two NumPy integer arrays. This implementation includes adjacent transpositions (swaps) with a cost of 2. Standard insertions, deletions, and substitutions have a cost of 1. Args: a: The first NumPy integer array. b: The second NumPy integer array. Returns: The Levenshtein distance between a and b. """ n = len(a) m = len(b) # Handle empty arrays if n == 0: return m if m == 0: return n # Initialize DP matrix (distance matrix) # Use np.int64 for potentially large distances and compatibility dp = np.zeros((n + 1, m + 1), dtype=np.int64) # Initialize first row and column (cost of deletions/insertions) for i in range(n + 1): dp[i, 0] = i for j in range(m + 1): dp[0, j] = j # Fill the DP matrix for i in range(1, n + 1): for j in range(1, m + 1): # Cost of substitution (0 if characters match, 1 otherwise) # Remember arrays are 0-indexed, dp matrix is 1-indexed conceptually cost = 0 if a[i - 1] == b[j - 1] else 1 # Calculate cost based on deletion, insertion, substitution dp[i, j] = min(dp[i - 1, j] + 1, # Deletion from a dp[i, j - 1] + 1, # Insertion into a dp[i - 1, j - 1] + cost) # Substitution/Match # Check for adjacent transposition (swap) if i > 1 and j > 1 and a[i - 1] == b[j - 2] and a[i - 2] == b[j - 1]: # Cost of transposition is 2 dp[i, j] = min(dp[i, j], dp[i - 2, j - 2] + 2) # The final distance is in the bottom-right corner return dp[n, m] # --- Example Usage --- if __name__ == "__main__": # Example 1: Simple case arr1 = np.array([10, 20, 30], dtype=np.int64) arr2 = np.array([10, 50, 30], dtype=np.int64) # Expected: 1 (substitute 20 for 50) dist1 = levenshtein(arr1, arr2) print(f"Array 1: {arr1}") print(f"Array 2: {arr2}") print(f"Levenshtein distance: {dist1}\n") # Example 2: Swap case arr3 = np.array([1, 2, 3, 4], dtype=np.int64) arr4 = np.array([1, 3, 2, 4], dtype=np.int64) # Expected: 2 (swap 2 and 3) dist2 = levenshtein(arr3, arr4) print(f"Array 3: {arr3}") print(f"Array 4: {arr4}") print(f"Levenshtein distance: {dist2}\n") # Example 3: Swap vs Substitution/Insertion/Deletion arr5 = np.array([5, 6], dtype=np.int64) arr6 = np.array([6, 5], dtype=np.int64) # Expected: 2 (swap 5 and 6) # Without swap cost 2, it would be 2 (sub 5->6, sub 6->5) dist3 = levenshtein(arr5, arr6) print(f"Array 5: {arr5}") print(f"Array 6: {arr6}") print(f"Levenshtein distance: {dist3}\n") # Example 4: Longer sequences arr7 = np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.int64) arr8 = np.array([1, 3, 2, 4, 5, 8, 7], dtype=np.int64) # Expected: 1 (swap 2,3 cost 2) + 1 (sub 6->8 cost 1) = 3 # Let's trace: # 1 == 1 (cost 0) # 2,3 vs 3,2 -> swap (cost 2) -> state is now 1,3,2 # 4 == 4 (cost 0) -> state is 1,3,2,4 # 5 == 5 (cost 0) -> state is 1,3,2,4,5 # 6 vs 8 -> sub (cost 1) -> state is 1,3,2,4,5,8 # 7 == 7 (cost 0) -> state is 1,3,2,4,5,8,7 # Total cost = 2 + 1 = 3 dist4 = levenshtein(arr7, arr8) print(f"Array 7: {arr7}") print(f"Array 8: {arr8}") print(f"Levenshtein distance: {dist4}\n") # Example 5: Empty array arr9 = np.array([1, 2, 3], dtype=np.int64) arr10 = np.array([], dtype=np.int64) # Expected: 3 (delete 1, 2, 3) dist5 = levenshtein(arr9, arr10) print(f"Array 9: {arr9}") print(f"Array 10: {arr10}") print(f"Levenshtein distance: {dist5}\n") # Example 6: Identical arrays arr11 = np.array([9, 8, 7], dtype=np.int64) arr12 = np.array([9, 8, 7], dtype=np.int64) # Expected: 0 dist6 = levenshtein(arr11, arr12) print(f"Array 11: {arr11}") print(f"Array 12: {arr12}") print(f"Levenshtein distance: {dist6}\n") # --- Performance check (optional) --- # Numba compiles on first call, subsequent calls are fast # print("Timing...") # large_arr1 = np.random.randint(0, 100, size=500, dtype=np.int64) # large_arr2 = np.random.randint(0, 100, size=550, dtype=np.int64) # %timeit levenshtein(large_arr1, large_arr2) # Use in Jupyter/IPython # Or use timeit module for standard Python script # import timeit # setup = """ # import numpy as np # from __main__ import levenshtein # large_arr1 = np.random.randint(0, 100, size=500, dtype=np.int64) # large_arr2 = np.random.randint(0, 100, size=550, dtype=np.int64) # levenshtein(large_arr1, large_arr2) # Run once for compilation # """ # n_runs = 10 # t = timeit.timeit("levenshtein(large_arr1, large_arr2)", setup=setup, number=n_runs) # print(f"Average time over {n_runs} runs: {t/n_runs:.6f} seconds") answer = levenshtein(np.array([3, 5, 2, 4, 8, 7]), np.array([3, 4, 2, 4, 8, 7, 9])) expected = 3 assert answer == expected, f'Wrong answer; got {answer} instead of {expected}' print('All tests passed') |
Array 1: [10 20 30]
Array 2: [10 50 30]
Levenshtein distance: 1
Array 3: [1 2 3 4]
Array 4: [1 3 2 4]
Levenshtein distance: 2
Array 5: [5 6]
Array 6: [6 5]
Levenshtein distance: 2
Array 7: [1 2 3 4 5 6 7]
Array 8: [1 3 2 4 5 8 7]
Levenshtein distance: 3
Array 9: [1 2 3]
Array 10: []
Levenshtein distance: 3
Array 11: [9 8 7]
Array 12: [9 8 7]
Levenshtein distance: 0
Traceback (most recent call last):
File "/usr/src/app/main.py", line 150, in <module>
assert answer == expected, f'Wrong answer; got {answer} instead of {expected}'
^^^^^^^^^^^^^^^^^^
AssertionError: Wrong answer; got 2 instead of 3
Testing if the previous output contains the string All tests passed
: False
Testing if the previous output contains the string njit
: True
Testing if the previous output contains the string nopython
: True
Testing if the previous output contains the string import numba
: True
Testing if the previous output contains the string from numba
: False