Longest Common Subsequence implemented in VBA (Visual Basic for Applications)

From Wikipedia, The longest common subsequence (LCS) problem is to find the longest subsequence common to all sequences in a set of sequences (often just two).

The following is a VBA implementation of this problem. The following functions are included;

String functions;

  • longestCommonSubsequence – calculate an LCS array.
  • backTraceUp and backTraceLeft – trace back either defaulting up or left respectively, and find the LCS.
  • getDiff – returns the difference of the two strings. The succeeding character of =, – or + indicated if the character was equal, removed or added, respectively.
  • passGetDiffOutput – passes the output of getDiff so that =, – or + are now values in a 2 x n array, with indix 0 being equal, indix 1 being removed and indix 2 being added.

Array functions;

  • longestCommonSubsequenceArr – calculate an LCS array.
  • backTraceUpArr – trace back defaulting up and find the LCS.
  • getDiffArr – returns the difference of the two arrays as a 2 x n array, with indix 0 being equal, indix 1 being removed and indix 2 being added.

Common functions;

  • max – standard maximum function.
  • stringToArray – convert a string to an array for array functions.

Examples;

  • exampleString
  • exampleArr

Download the Basic (bas) File

Unfortunately, the limitations of VBA makes a dog’s dinner out of what would be some very concise code or, perhaps that’s just my implementation…

Option Explicit

Public Function longestCommonSubsequence(ByRef string1 As String, ByRef string2 As String) As Long()
    If string1 = vbNullString Or string2 = vbNullString Then
        Exit Function
    End If
 
    Dim num() As Long
    
    'define the array, note rows of zeros get added to front automatically
    ReDim num(Len(string1), Len(string2))
    
    Dim i As Long, j As Long
    
    For i = 1 To Len(string1)
        For j = 1 To Len(string2)
            If Mid$(string1, i, 1) = Mid$(string2, j, 1) Then
                num(i, j) = num(i - 1, j - 1) + 1
            Else
                num(i, j) = max(num(i - 1, j), num(i, j - 1))
            End If
        Next j
    Next i

    longestCommonSubsequence = num
End Function

Sub exampleString()

    Dim arr() As Long
    
    Dim string1 As String
    Dim string2 As String
    
    string1 = "this is a find the haystack string"
    string2 = "this is a replace the needle string"
    
    arr = longestCommonSubsequence(string1, string2)
    
    Dim s As String, t As String
    s = backTraceUp(arr, string1, string2, Len(string1), Len(string2))
    t = backTraceLeft(arr, string1, string2, Len(string1), Len(string2))
    Dim a As String, b As String
    a = getDiff(arr, string1, string2, Len(string1), Len(string2))
    
    Dim brr() As Long
    
    brr = passGetDiffOutput(a)
End Sub

Public Function max(ByRef a As Long, ByRef b As Long) As Long
    If a >= b Then
        max = a
    Else
        max = b
    End If
End Function


'back traces c, defaulting in the up direction
Public Function backTraceUp(ByRef c() As Long, ByRef string1 As String, ByRef string2 As String, ByRef i As Long, ByRef j As Long) As String
    If i < 1 Or j < 1 Then
        backTraceUp = vbNullString
    ElseIf Mid$(string1, i, 1) = Mid$(string2, j, 1) Then
        'equal characters, save it and then go up and left
        backTraceUp = backTraceUp(c, string1, string2, i - 1, j - 1) & Mid$(string1, i, 1)
    Else
        'go in the direction of the highest number, defaulting to up
        If (c(i, j - 1) > c(i - 1, j)) Then
            backTraceUp = backTraceUp(c, string1, string2, i, j - 1)
        Else
            backTraceUp = backTraceUp(c, string1, string2, i - 1, j)
        End If
    End If
End Function

'back traces c, defaulting in the left direction
Public Function backTraceLeft(ByRef c() As Long, ByRef string1 As String, ByRef string2 As String, ByRef i As Long, ByRef j As Long) As String
    If i < 1 Or j < 1 Then
        backTraceLeft = vbNullString
    ElseIf Mid$(string1, i, 1) = Mid$(string2, j, 1) Then
        'equal characters, save it and then go up and left
        backTraceLeft = backTraceLeft(c, string1, string2, i - 1, j - 1) & Mid$(string1, i, 1)
    Else
        'go in the direction of the highest number, defaulting to left
        If (c(i, j - 1) >= c(i - 1, j)) Then
            backTraceLeft = backTraceLeft(c, string1, string2, i, j - 1)
        Else
            backTraceLeft = backTraceLeft(c, string1, string2, i - 1, j)
        End If
    End If
End Function

'the following function returns a string with indication to what was deleted or added
'proceding character can be;
' = no change
' - deletion
' + addition
Public Function getDiff(ByRef c() As Long, ByRef stringOld As String, ByRef stringNew As String, ByRef i As Long, ByRef j As Long) As String
    If i > 0 Then
        If j > 0 Then 'both are greater than zero
            'can only do the following comparison when i and j are greater than zero
            If Mid$(stringOld, i, 1) = Mid$(stringNew, j, 1) Then
                getDiff = getDiff(c, stringOld, stringNew, i - 1, j - 1) & Mid$(stringOld, i, 1) & "="
            Else
                If i = 0 Then
                    getDiff = getDiff(c, stringOld, stringNew, i, j - 1) & Mid$(stringNew, j, 1) & "+"
                ElseIf c(i, j - 1) >= c(i - 1, j) Then
                    getDiff = getDiff(c, stringOld, stringNew, i, j - 1) & Mid$(stringNew, j, 1) & "+"
                ElseIf j = 0 Then
                    getDiff = getDiff(c, stringOld, stringNew, i - 1, j) & Mid$(stringOld, i, 1) & "-"
                ElseIf c(i, j - 1) < c(i - 1, j) Then
                    getDiff = getDiff(c, stringOld, stringNew, i - 1, j) & Mid$(stringOld, i, 1) & "-"
                Else
                    getDiff = vbNullString
                End If
            End If
        Else 'i is is greater than zero
                If j = 0 Then
                    getDiff = getDiff(c, stringOld, stringNew, i - 1, j) & Mid$(stringOld, i, 1) & "-"
                ElseIf c(i, j - 1) < c(i - 1, j) Then
                    getDiff = getDiff(c, stringOld, stringNew, i - 1, j) & Mid$(stringOld, i, 1) & "-"
                Else
                    getDiff = vbNullString
                End If
        End If
    Else
        If j > 0 Then 'j is  greater than zero
                If i = 0 Then
                    getDiff = getDiff(c, stringOld, stringNew, i, j - 1) & Mid$(stringNew, j, 1) & "+"
                ElseIf c(i, j - 1) >= c(i - 1, j) Then
                    getDiff = getDiff(c, stringOld, stringNew, i, j - 1) & Mid$(stringNew, j, 1) & "+"
                Else
                    getDiff = vbNullString
                End If
        Else 'none are greater than zero
                getDiff = vbNullString
        End If
    End If
End Function

'this function returns the location of the string difference
Public Function passGetDiffOutput(ByRef outputStr As String) As Long()
    Dim i As Long
    i = 1
    
    Dim typeChr As String
    
    
    Dim oldi As Long
    Dim newi As Long
    oldi = 0
    newi = 0
    
    Dim toFrom() As Long
    Dim toFromCount As Long
    toFromCount = -1
    
    
    Dim typeChrPrev As String
    typeChrPrev = vbNullString
    
    Do While i < Len(outputStr)
        typeChr = Mid$(outputStr, i + 1, 1)
        Select Case typeChr
            Case "="
                If typeChr <> typeChrPrev Then
                
                    'check if it is comming from a deletion
                    If typeChrPrev = "-" Then
                        toFrom(2, toFromCount) = oldi
                    End If
                    
                    'check if it is comming from a addition
                    If typeChrPrev = "+" Then
                        toFrom(2, toFromCount) = newi
                    End If
                End If
                
                oldi = oldi + 1 'update old index
                newi = newi + 1 'update new index
            Case "-"
                'check if it is comming from a addition
                If typeChrPrev = "+" Then
                    toFrom(2, toFromCount) = newi
                End If
            
                oldi = oldi + 1 'update old index
                If typeChr <> typeChrPrev Then
                    toFromCount = toFromCount + 1
                    ReDim Preserve toFrom(2, toFromCount)
                    'let old be -1
                    toFrom(0, toFromCount) = -1
                    toFrom(1, toFromCount) = oldi
                End If
            Case "+"
                'check if it is comming from a deletion
                If typeChrPrev = "-" Then
                    toFrom(2, toFromCount) = oldi
                End If
            
                newi = newi + 1 'update new index
                If typeChr <> typeChrPrev Then
                    toFromCount = toFromCount + 1
                    ReDim Preserve toFrom(2, toFromCount)
                    'let new be 1
                    toFrom(0, toFromCount) = 1
                    toFrom(1, toFromCount) = newi
                End If
        End Select
    
        i = i + 2
        typeChrPrev = typeChr
    Loop
    
    'check if it ended on a deletion or adition
    If typeChrPrev = "-" Then
        toFrom(2, toFromCount) = oldi
    End If
    
    If typeChrPrev = "+" Then
        toFrom(2, toFromCount) = newi
    End If
    
    passGetDiffOutput = toFrom
End Function

'note, arrays must be single dimension
Public Function longestCommonSubsequenceArr(ByRef array1() As String, ByRef array2() As String) As Long()
    On Error Resume Next
    If UBound(array1, 2) > 0 Or UBound(array2, 2) > 0 Then 'multidimensional arrays
        If Error = vbNullString Then
            Exit Function
        End If
    End If
    
    If UBound(array1) < 0 Or UBound(array2) < 0 Then 'check if arrays are bounded
        If Error <> vbNullString Then
            Exit Function
        End If
    End If

    Dim num() As Long
    
    'define the array, note rows of zeros get added to front automatically
    ReDim num(UBound(array1) + 1, UBound(array2) + 1)
    
    Dim i As Long, j As Long
    
    'note, arrays must always start at indice zero.
    For i = 0 To UBound(array1)
        For j = 0 To UBound(array2)
            If array1(i) = array2(j) Then
                num(i + 1, j + 1) = num(i, j) + 1
            Else
                num(i + 1, j + 1) = max(num(i, j + 1), num(i + 1, j))
            End If
        Next j
    Next i

    longestCommonSubsequenceArr = num
End Function

Public Function stringToArray(ByRef str As String) As String()
    Dim i As Long
    Dim arr() As String
    ReDim arr(Len(str) - 1)
    For i = 1 To Len(str)
        arr(i - 1) = Mid$(str, i, 1)
    Next i
    stringToArray = arr
End Function

Sub exampleArr()

    Dim string1 As String
    Dim string2 As String
    
    string1 = "this is a find the haystack string"
    string2 = "this is a replace the needle string"


    Dim a1() As String
    Dim a2() As String
    
    a1 = stringToArray(string1)
    a2 = stringToArray(string2)
    
    Dim c() As Long
    
    c = longestCommonSubsequenceArr(a1, a2)
    
    Dim str() As String
    
    str = backTraceUpArr(c, a1, a2, UBound(a1), UBound(a2))
    
    Dim dif() As String
    dif = getDiffArr(c, a1, a2, UBound(a1), UBound(a2))

End Sub

'back traces c, defaulting in the up direction
Public Function backTraceUpArr(ByRef c() As Long, ByRef array1() As String, ByRef array2() As String, ByRef i As Long, ByRef j As Long) As String()
    Dim arr() As String
    If i < 0 Or j < 0 Then
        backTraceUpArr = arr
    ElseIf array1(i) = array2(j) Then
        'equal characters, save it and then go up and left
        arr = backTraceUpArr(c, array1, array2, i - 1, j - 1)
        'check the bounding of arr
        Dim bound As Long
        On Error Resume Next
        bound = UBound(arr)
        If Error <> vbNullString Then
            ReDim arr(0)
            arr(0) = array1(i)
        Else 'no error
            ReDim Preserve arr(bound + 1)
            arr(bound + 1) = array1(i)
        End If
        backTraceUpArr = arr
    Else
        'go in the direction of the highest number, defaulting to up
        If (c(i + 1, j) > c(i, j + 1)) Then
            backTraceUpArr = backTraceUpArr(c, array1, array2, i, j - 1)
        Else
            backTraceUpArr = backTraceUpArr(c, array1, array2, i - 1, j)
        End If
    End If
End Function


'returns a 2xn array, where
'indice 0 are equal
'indice 1 are deletions
'indice 2 are additions
Public Function getDiffArr(ByRef c() As Long, ByRef arrayOld() As String, ByRef arrayNew() As String, ByRef i As Long, ByRef j As Long) As String()
    Dim arr() As String
    Dim bound As Long
    On Error Resume Next
    If i >= 0 Then
        If j >= 0 Then 'both are greater or equal to zero
            'can only do the following comparison when i and j are greater or equal than zero
            If arrayOld(i) = arrayNew(j) Then
                    arr = getDiffArr(c, arrayOld, arrayNew, i - 1, j - 1)
                    bound = UBound(arr, 2) 'check the bounding of arr
                    If Error <> vbNullString Then
                        Err.Clear
                        ReDim arr(2, 0)
                        arr(0, 0) = arrayOld(i)
                    Else 'no error
                        ReDim Preserve arr(2, bound + 1)
                        arr(0, bound + 1) = arrayOld(i)
                    End If
                    getDiffArr = arr
            Else
                If i = 0 Then
                    arr = getDiffArr(c, arrayOld, arrayNew, i, j - 1)
                    bound = UBound(arr, 2) 'check the bounding of arr
                    If Error <> vbNullString Then
                        Err.Clear
                        ReDim arr(2, 0)
                        arr(2, 0) = arrayNew(j)
                    Else 'no error
                        ReDim Preserve arr(2, bound + 1)
                        arr(2, bound + 1) = arrayNew(j)
                    End If
                    getDiffArr = arr
                ElseIf c(i + 1, j - 1 + 1) >= c(i - 1 + 1, j + 1) Then
                    arr = getDiffArr(c, arrayOld, arrayNew, i, j - 1)
                    bound = UBound(arr, 2) 'check the bounding of arr
                    If Error <> vbNullString Then
                        Err.Clear
                        ReDim arr(2, 0)
                        arr(2, 0) = arrayNew(j)
                    Else 'no error
                        ReDim Preserve arr(2, bound + 1)
                        arr(2, bound + 1) = arrayNew(j)
                    End If
                    getDiffArr = arr
                ElseIf j = 0 Then
                    arr = getDiffArr(c, arrayOld, arrayNew, i - 1, j)
                    bound = UBound(arr, 2) 'check the bounding of arr
                    If Error <> vbNullString Then
                        Err.Clear
                        ReDim arr(2, 0)
                        arr(1, 0) = arrayOld(i)
                    Else 'no error
                        ReDim Preserve arr(2, bound + 1)
                        arr(1, bound + 1) = arrayOld(i)
                    End If
                    getDiffArr = arr
                ElseIf c(i + 1, j - 1 + 1) < c(i - 1 + 1, j + 1) Then
                    arr = getDiffArr(c, arrayOld, arrayNew, i - 1, j)
                    bound = UBound(arr, 2) 'check the bounding of arr
                    If Error <> vbNullString Then
                        Err.Clear
                        ReDim arr(2, 0)
                        arr(1, 0) = arrayOld(i)
                    Else 'no error
                        ReDim Preserve arr(2, bound + 1)
                        arr(1, bound + 1) = arrayOld(i)
                    End If
                    getDiffArr = arr
                Else
                    getDiffArr = arr
                End If
            End If
        Else 'i is is greater or equal to zero
                If j = 0 Then
                    arr = getDiffArr(c, arrayOld, arrayNew, i - 1, j)
                    bound = UBound(arr, 2) 'check the bounding of arr
                    If Error <> vbNullString Then
                        Err.Clear
                        ReDim arr(2, 0)
                        arr(1, 0) = arrayOld(i)
                    Else 'no error
                        ReDim Preserve arr(2, bound + 1)
                        arr(1, bound + 1) = arrayOld(i)
                    End If
                    getDiffArr = arr
                ElseIf c(i + 1, j - 1 + 1) < c(i - 1 + 1, j + 1) Then
                    arr = getDiffArr(c, arrayOld, arrayNew, i - 1, j)
                    bound = UBound(arr, 2) 'check the bounding of arr
                    If Error <> vbNullString Then
                        Err.Clear
                        ReDim arr(2, 0)
                        arr(1, 0) = arrayOld(i)
                    Else 'no error
                        ReDim Preserve arr(2, bound + 1)
                        arr(1, bound + 1) = arrayOld(i)
                    End If
                    getDiffArr = arr
                Else
                    getDiffArr = arr
                End If
        End If
    Else
        If j >= 0 Then 'j is  greater than zero
                If i = 0 Then
                    arr = getDiffArr(c, arrayOld, arrayNew, i - 1, j)
                    bound = UBound(arr, 2) 'check the bounding of arr
                    If Error <> vbNullString Then
                        Err.Clear
                        ReDim arr(2, 0)
                        arr(2, 0) = arrayNew(j)
                    Else 'no error
                        ReDim Preserve arr(2, bound + 1)
                        arr(2, bound + 1) = arrayNew(j)
                    End If
                    getDiffArr = arr
                ElseIf c(i + 1, j - 1 + 1) >= c(i - 1 + 1, j + 1) Then
                    arr = getDiffArr(c, arrayOld, arrayNew, i, j - 1)
                    bound = UBound(arr, 2) 'check the bounding of arr
                    If Error <> vbNullString Then
                        Err.Clear
                        ReDim arr(2, 0)
                        arr(2, 0) = arrayNew(j)
                    Else 'no error
                        ReDim Preserve arr(2, bound + 1)
                        arr(2, bound + 1) = arrayNew(j)
                    End If
                    getDiffArr = arr
                Else
                    getDiffArr = arr
                End If
        Else 'none are greater than zero
                getDiffArr = arr
        End If
    End If
End Function

Posted

in

by

Tags: