3
\$\begingroup\$

Traditionally, VBA developers rely on the FileSystemObject to retrieve files within a directory. I decided to test and compare different methods (Windows API, PowerQuery, PowerShell, and FileSystemObject) to determine which is the most efficient for large-scale file retrieval. Below are the results of my tests, along with the code implementations for each approach.

  1. Windows API: 18.86328 seconds to retrieve 512191 files from D:\
  2. PowerQuery: 27.48828 seconds to retrieve 512191 files from D:\
  3. PowerShell: 74.35938 seconds to retrieve 512191 files from D:\
  4. FileSystemObject: 212.4434 seconds to retrieve 512171 files from D:\

Windows API: 18.86328 seconds

Attribute VB_Name = "WinAPI"
Option Explicit

#If VBA7 Then
    Private Declare PtrSafe Function FindFirstFile Lib "kernel32" Alias "FindFirstFileA" _
        (ByVal lpFileName As String, lpFindFileData As WIN32_FIND_DATA) As LongPtr

    Private Declare PtrSafe Function FindNextFile Lib "kernel32" Alias "FindNextFileA" _
        (ByVal hFindFile As LongPtr, lpFindFileData As WIN32_FIND_DATA) As Long

    Private Declare PtrSafe Function FindClose Lib "kernel32" (ByVal hFindFile As LongPtr) As Long
#Else
    Private Declare Function FindFirstFile Lib "kernel32" Alias "FindFirstFileA" _
        (ByVal lpFileName As String, lpFindFileData As WIN32_FIND_DATA) As Long

    Private Declare Function FindNextFile Lib "kernel32" Alias "FindNextFileA" _
        (ByVal hFindFile As Long, lpFindFileData As WIN32_FIND_DATA) As Long

    Private Declare Function FindClose Lib "kernel32" (ByVal hFindFile As Long) As Long
#End If

Private Type FILETIME
    dwLowDateTime As Long
    dwHighDateTime As Long
End Type

Private Type WIN32_FIND_DATA
    dwFileAttributes As Long
    ftCreationTime As FILETIME
    ftLastAccessTime As FILETIME
    ftLastWriteTime As FILETIME
    nFileSizeHigh As Long
    nFileSizeLow As Long
    dwReserved0 As Long
    dwReserved1 As Long
    cFileName As String * 260
    cAlternate As String * 14
End Type

Function GetFilesWindowsAPIDictionary(RootFolder As String, Optional FileDictionary As Scripting.Dictionary) As Scripting.Dictionary
    Dim hFind As LongPtr
    Dim WFD As WIN32_FIND_DATA
    Dim FileName As String

    ' Initialize the dictionary if not already initialized
    If FileDictionary Is Nothing Then
        Set FileDictionary = New Scripting.Dictionary
    End If

    ' Ensure the folder path ends with a backslash
    If Right(RootFolder, 1) <> "\" Then RootFolder = RootFolder & "\"

    ' Add wildcard to search all files and directories
    hFind = FindFirstFile(RootFolder & "*.*", WFD)

    If hFind <> -1 Then
        Do
            FileName = Left(WFD.cFileName, InStr(WFD.cFileName, vbNullChar) - 1)
            ' Skip the "." and ".." entries
            If FileName <> "." And FileName <> ".." Then
                ' Check if it is a directory
                If (WFD.dwFileAttributes And vbDirectory) = vbDirectory Then
                    ' Recursive call for subdirectories
                    GetFilesWindowsAPIDictionary RootFolder & FileName, FileDictionary
                Else
                    ' Add the file to the dictionary
                    FileDictionary.Add RootFolder & FileName, ""
                End If
            End If
        Loop While FindNextFile(hFind, WFD)
        FindClose hFind
    End If

    ' Return the dictionary of file paths
    Set GetFilesWindowsAPIDictionary = FileDictionary
End Function

PowerQuery: 27.48828 seconds

Attribute VB_Name = "PowerQuery"
Option Explicit

Function GetFilesPowerQuery(RootFolder As String, QueryName As String, TableName As String)
    On Error Resume Next
    ActiveWorkbook.Queries(QueryName).Delete
    On Error GoTo 0
            
    ActiveWorkbook.Queries.Add Name:=QueryName, Formula:= _
        "let" & vbNewLine & _
        "    Source = Folder.Files(" & Chr(34) & RootFolder & Chr(34) & ")," & vbNewLine & _
        "    FullFileName = Table.AddColumn(Source, ""FullFileName"", each [Folder Path] & [Name])," & vbNewLine & _
        "    Result = Table.SelectColumns(FullFileName, {""FullFileName""})" & vbNewLine & _
        "in" & vbNewLine & _
        "    Result"
        
    ActiveWorkbook.Worksheets.Add
    With ActiveSheet.ListObjects.Add(SourceType:=0, Source:= _
        "OLEDB;Provider=Microsoft.Mashup.OleDb.1;Data Source=$Workbook$;Location=" & Chr(34) & QueryName & Chr(34) & ";Extended Properties=""""" _
        , Destination:=Range("$A$1")).QueryTable
        .CommandType = xlCmdSql
        .CommandText = Array("SELECT * FROM [" & QueryName & "]")
        .RowNumbers = False
        .FillAdjacentFormulas = False
        .PreserveFormatting = True
        .RefreshOnFileOpen = False
        .BackgroundQuery = True
        .RefreshStyle = xlInsertDeleteCells
        .SavePassword = False
        .SaveData = True
        .AdjustColumnWidth = True
        .RefreshPeriod = 0
        .PreserveColumnInfo = True
        .ListObject.Name = TableName
        .Refresh BackgroundQuery:=False
    End With
    GetFilesPowerQuery = ActiveSheet.ListObjects(TableName).DataBodyRange.Value
End Function

PowerShell: 74.35938 seconds

Attribute VB_Name = "PowerShell"
Option Explicit

Function GetFilesPowerShell(RootFolder As String) As Variant
    Dim PowerShellScript As String
    Dim PowerShellExec As Object
    Dim Output As String
    Dim Files() As String

    ' PowerShell script to list all files in the specified folder and its subfolders
    PowerShellScript = "Get-ChildItem -Path '" & RootFolder & "' -Recurse -Force -File | Select-Object -ExpandProperty FullName"

    ' Create PowerShell object and execute the script with no window
    Set PowerShellExec = CreateObject("WScript.Shell").Exec("powershell -command "" & { " & PowerShellScript & " }" & " -WindowStyle Hidden")

    ' Capture the output
    Output = PowerShellExec.StdOut.ReadAll

    ' Split the output into an array (each line is a file path)
    Files = Split(Output, vbCrLf)

    ' Return the array of file paths, removing any trailing empty element
    If Len(Files(UBound(Files))) = 0 Then
        ReDim Preserve Files(0 To UBound(Files) - 1)
    End If
    
    GetFilesPowerShell = Files
End Function

FileSystemObject: 212.4434 seconds

Attribute VB_Name = "FSO"
Option Explicit

Function GetFileDictionary(RootFolderPath As String, Optional FileDictionary As Scripting.Dictionary, Optional FSO As Scripting.FileSystemObject) As Scripting.Dictionary
    If FSO Is Nothing Then
        Set FileDictionary = New Scripting.Dictionary
        Set FSO = New Scripting.FileSystemObject
    End If
    
    Dim SubFolder As Folder
    For Each SubFolder In FSO.GetFolder(RootFolderPath).SubFolders
        GetFileDictionary SubFolder.Path, FileDictionary, FSO
    Next
    
    Dim File As File
    For Each File In FSO.GetFolder(RootFolderPath).Files
        FileDictionary.Add File.Path, ""
    Next
    Set GetFileDictionary = FileDictionary
End Function

Test

Attribute VB_Name = "Test"
Option Explicit
Private Const RootFolder As String = "D:\"

Sub TestAll()
    TestGetFilesWindowsAPIDictionary
    TestGetFilesPowerQuery
    TestGetFilesPowerShell
    TestFileSystemObject
End Sub

Sub TestGetFilesWindowsAPIDictionary()
    Dim FileDictionary As Scripting.Dictionary
    Dim StartTime As Single
    Dim EndTime As Single
    Dim ElapsedTime As Single

    ' Start the timer
    StartTime = Timer

    ' Execute the function and retrieve the file paths into the dictionary
    Set FileDictionary = GetFilesWindowsAPIDictionary(RootFolder)

    ' End the timer
    EndTime = Timer

    ' Calculate elapsed time
    ElapsedTime = EndTime - StartTime

    ' Output the elapsed time and number of files retrieved
    If Not FileDictionary Is Nothing Then
        Debug.Print "Windows API: "; ElapsedTime & " seconds to retrieve "; FileDictionary.Count; " files from " & RootFolder
        DictionaryDump FileDictionary
    Else
        Debug.Print "No files found or an error occurred."
    End If
End Sub

Sub TestGetFilesPowerQuery()
    Const QueryName As String = "TestPowerQueryGetFiles"
    Const TableName As String = "TestPowerQueryGetFilesTable"
    Dim StartTime As Single
    Dim EndTime As Single
    Dim ElapsedTime As Single

    ' Start the timer
    StartTime = Timer

    Dim Data
    Data = GetFilesPowerQuery(RootFolder, QueryName, TableName)
    
    ' End the timer
    EndTime = Timer

    ' Calculate elapsed tim
    ElapsedTime = EndTime - StartTime

    ' Output the elapsed time
    Debug.Print "PowerQuery: "; ElapsedTime & " seconds to retrieve "; UBound(Data) - LBound(Data) + 1; " files from " & RootFolder
   
End Sub

Sub TestGetFilesPowerShell()
    Dim Results As Variant
    Dim StartTime As Single
    Dim EndTime As Single
    Dim ElapsedTime As Single

    ' Start the timer
    StartTime = Timer

    ' Execute the function and retrieve the file paths
    Results = GetFilesPowerShell(RootFolder)

    ' End the timer
    EndTime = Timer

    ' Calculate elapsed time
    ElapsedTime = EndTime - StartTime

    ' Optionally, output the number of files retrieved
    If IsArray(Results) Then
        ' Output the elapsed time
        Debug.Print "PowerShell: "; ElapsedTime & " seconds to retrieve "; UBound(Results) - LBound(Results) + 1; " files from " & RootFolder
        ArrayDump Results
    Else
        Debug.Print "No files found or an error occurred."
    End If
End Sub

Sub TestFileSystemObject()
    Dim StartTime As Single
    Dim EndTime As Single
    Dim ElapsedTime As Single
    Dim FileDictionary As Scripting.Dictionary
    ' Start the timer
    StartTime = Timer
    
    ' Execute the function and retrieve the file paths
    Set FileDictionary = GetFileDictionary(RootFolder)

    ' End the timer
    EndTime = Timer

    ' Calculate elapsed time
    ElapsedTime = EndTime - StartTime

    ' Output the elapsed time
    Debug.Print "FileSystemObject: "; ElapsedTime & " seconds to retrieve "; FileDictionary.Count; " files from " & RootFolder
    DictionaryDump FileDictionary
End Sub

Sub ArrayDump(Results)
    Dim Data
    ReDim Data(1 To UBound(Results) - LBound(Results) + 1, 1 To 1)
    Dim n As Long, Count As Long
    For n = LBound(Results) To UBound(Results)
        Count = Count + 1
        Data(Count, 1) = Results(n)
    Next
        
    With Worksheets.Add
        .Range("$A$1").Resize(UBound(Data)).Value = Data
    End With
End Sub

Sub DictionaryDump(FileDictionary As Scripting.Dictionary)
    Dim Data, Keys
    Keys = FileDictionary.Keys
    ReDim Data(1 To FileDictionary.Count, 1 To 1)
    Dim n As Long
    For n = 1 To FileDictionary.Count
        Data(n, 1) = Keys(n - 1)
    Next
        
    With Worksheets.Add
        .Range("$A$1").Resize(FileDictionary.Count).Value = Data
    End With
End Sub

Notes

I'm adamant in my use of Pascal Case with the exception of lower case for counters. Feel free to refute it but I will not change. This is an on-going rant of mine spurred by the fact that Dim value changes Range().Value to Range().value. I also tried unsuccessfully tested shelling Python without a py file.

Questions

  • Why does the FileSystemObject method returns 20 fewer files than the other approaches?
  • Is there a way to get around the DIR() 260-character limit?
  • Can we do a file search using PowerQuery on MACOS?
  • Are there any other candidates or a file search that I should consider?
  • Should I add any tags other than VBA to this post?

Thanks in advance for your feedback.

\$\endgroup\$
9
  • \$\begingroup\$ Hey! I think you'd get better traction asking this on Stack Overflow. It seems that you're more interested in alternative solutions rather than having your code reviewed for what it is. It is definitely odd that FSO returns 20 fewer files. Good luck! \$\endgroup\$ Commented Aug 25, 2024 at 0:14
  • 2
    \$\begingroup\$ @Confettimaker I'll compare the list eventually and figure it out. I post here when there is something that I find interesting that I want to share. Thanks for the +1. \$\endgroup\$ Commented Aug 25, 2024 at 8:15
  • 2
    \$\begingroup\$ @TinMan Could you please try GetFiles from VBA-FileTools? You need the includeSubFolders set to True and maybe set the hidden and system arguments as well. I am curious how long it takes compared to the others in your question. \$\endgroup\$ Commented Aug 27, 2024 at 8:30
  • \$\begingroup\$ I wonder if power query is faster on second refresh with cache. I would like to see something that leverages the windows search index \$\endgroup\$ Commented Aug 27, 2024 at 8:49
  • 2
    \$\begingroup\$ @Greedo It's hard to measure but seems like it was ~4 seconds faster on refresh. \$\endgroup\$ Commented Aug 27, 2024 at 8:53

0

You must log in to answer this question.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.