The task is to create sub-tables from an original table (like below) that contain the missing data columns (missing data is highlighted in red). The important part of this task is to make the smallest number of output tables based on the combinations of missing data points across different columns for different rows.
In the above example the optimal output will create 4 tables such as the ones that are shown below. These tables include common columns that are missing across different examples.
This is the code that works to output these tables. An interesting element which this code does not cover and why it can be improved is that the Example20 row is correctly split across Table2 and Table4 as they had been created previously (and IsSubset returned True). However, had Example20 appeared earlier in the data set, a table would have been created for it with the two columns that it has missing ("G" and "I") and then we would have ended up with 5 output tables as we would also have had tables that covered I and G individually which is not the optimal solution. We are looking for an optimised code that finds optimal solution consistently regardless of the order of the rows.
Sub SeparateGroupedMissingDataToNewWorkbookOptimised()
Dim ws As Worksheet
Dim wbNew As Workbook
Dim wsOutput As Worksheet
Dim lastRow As Long, lastCol As Long
Dim currentRow As Long
Dim col As Integer
Dim outputRow As Long
Dim missingKey As String
Dim dict As Object
Dim missingDict As Object
Dim existingKeys As Variant
Dim headers() As String
Dim foundSubset As Boolean
Dim i As Integer, j As Integer
' Define input sheet
Set ws = ThisWorkbook.Sheets("Missing Data")
' Create a new workbook for output
Set wbNew = Workbooks.Add
' Initialise dictionaries to keep track of missing data
Set dict = CreateObject("Scripting.Dictionary")
Set missingDict = CreateObject("Scripting.Dictionary")
' Find the last row and column in the input data
lastRow = ws.Cells(ws.Rows.Count, 1).End(xlUp).Row
lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column
' Loop through each row to identify missing data patterns
For currentRow = 2 To lastRow
missingKey = ""
' Construct a key that represents missing columns
For col = 3 To lastCol ' Start from 3 to skip "Name" and "Code"
If ws.Cells(currentRow, col).Value = "" Then
missingKey = missingKey & ws.Cells(1, col).Value & "|"
End If
Next col
' If there is missing data
If missingKey <> "" Then
foundSubset = False
existingKeys = dict.Keys
' Check all existing patterns for subset matches
For i = 0 To dict.Count - 1
Dim existingKey As String
existingKey = existingKeys(i)
' Check if current missingKey is a subset of existingKey or vice versa
If IsSubset(existingKey, missingKey) Then
foundSubset = True
' Output the row to the existing subset sheet
Set wsOutput = missingDict(existingKey)
outputRow = wsOutput.Cells(wsOutput.Rows.Count, 1).End(xlUp).Row + 1
wsOutput.Cells(outputRow, 1).Value = ws.Cells(currentRow, 1).Value ' Name
wsOutput.Cells(outputRow, 2).Value = ws.Cells(currentRow, 2).Value ' Code
' Fill in the missing columns as blanks
headers = Split(existingKey, "|")
For j = 0 To UBound(headers) - 1
If headers(j) <> "" Then
For col = 3 To lastCol
If wsOutput.Cells(1, col).Value = headers(j) Then
wsOutput.Cells(outputRow, col).Value = "" ' Missing column data
End If
Next col
End If
Next j
End If
Next i
' If no matching subset found, create a new table for this missing data pattern
If Not foundSubset Then
dict.Add missingKey, dict.Count + 1
' Create a new worksheet for each unique pattern of missing data
Set wsOutput = wbNew.Sheets.Add(After:=wbNew.Sheets(wbNew.Sheets.Count))
wsOutput.Name = "Missing_" & dict.Count
' Add headers to the new sheet
wsOutput.Cells(1, 1).Value = "Name"
wsOutput.Cells(1, 2).Value = "Code"
' Add the column names that are missing
headers = Split(missingKey, "|")
For col = 0 To UBound(headers) - 1
If headers(col) <> "" Then
wsOutput.Cells(1, col + 3).Value = headers(col)
End If
Next col
' Initialize missing data dictionary
missingDict.Add missingKey, wsOutput
' Output the current row to the new sheet
outputRow = 2
wsOutput.Cells(outputRow, 1).Value = ws.Cells(currentRow, 1).Value ' Name
wsOutput.Cells(outputRow, 2).Value = ws.Cells(currentRow, 2).Value ' Code
' Fill in the missing columns as blanks
For j = 0 To UBound(headers) - 1
If headers(j) <> "" Then
wsOutput.Cells(outputRow, j + 3).Value = "" ' Missing column data
End If
Next j
End If
End If
Next currentRow
' Adjust column widths for better readability in each output sheet
For Each wsOutput In wbNew.Sheets
wsOutput.Cells.EntireColumn.AutoFit
Next wsOutput
MsgBox "Missing data grouped by missing columns in a new workbook!", vbInformation
End Sub
' Function to check if one key is a subset of another
' key1: A string representing a set of missing column names separated by "|"
' key2: Another string representing a different set of missing column names separated by "|"
' Returns: True if key1 is a subset of key2, False otherwise
Function IsSubset(key1 As String, key2 As String) As Boolean
' Declare arrays to hold the column names extracted from key1 and key2
Dim arr1() As String, arr2() As String
' Loop variables
Dim i As Integer, j As Integer
' Flag to indicate if a match is found
Dim found As Boolean
' Split the keys into arrays of column names using the "|" delimiter
arr1 = Split(key1, "|") ' Array of missing column names from key1
arr2 = Split(key2, "|") ' Array of missing column names from key2
' If key1 has more elements than key2, it cannot be a subset
If UBound(arr1) > UBound(arr2) Then
' key1 should be smaller or equal in length to key2 to be a subset
IsSubset = False
Exit Function
End If
' Loop through each element in arr1 (representing key1)
For i = 0 To UBound(arr1) - 1
' Skip empty elements caused by trailing or leading delimiters
If arr1(i) <> "" Then
found = False ' Reset the found flag for each element in arr1
' Loop through each element in arr2 (representing key2)
For j = 0 To UBound(arr2) - 1
' If the current element in arr1 matches an element in arr2
If arr1(i) = arr2(j) Then
found = True ' Set the found flag to True
Exit For ' No need to continue searching in arr2 for this element
End If
Next j
' If no match is found in arr2 for the current element in arr1
If Not found Then
IsSubset = False ' key1 is not a subset of key2
Exit Function ' Exit the function early
End If
End If
Next i
' If all elements in arr1 are found in arr2, key1 is a subset of key2
IsSubset = True
End Function