I have been working on this thread pool for awhile to make it as simple to use as possible. I need some tips on improving performance, and some good ways to test its performance. I was wondering if anyone had any opinions/suggestions!
Here is the class:
#pragma once
#include<thread>
#include<vector>
#include<queue>
#include<mutex>
#include<condition_variable>
#include<functional>
#include<future>
#define MAX_THREADS std::thread::hardware_concurrency() - 1;
//portable way to null the copy and assignment operators
#define NULL_COPY_AND_ASSIGN(T) \
    T(const T& other) {(void)other;} \
    void operator=(const T& other) { (void)other; }
/* ThreadPool class
It is a singleton. To prevent spawning
tons of threads, I made it a singleton */
class ThreadPool{
public:
    //getInstance to allow the second constructor to be called
    static ThreadPool& getInstance(int numThreads){
        static ThreadPool instance(numThreads);
        return instance;
    }
    //add any arg # function to queue
    template <typename Func, typename... Args >
    inline auto push(Func&& f, Args&&... args){
        //get return type of the function
        typedef decltype(f(args...)) retType;
        //package the task
        std::packaged_task<retType()> task(std::move(std::bind(f, args...)));
        // lock jobqueue mutex, add job to the job queue 
        std::unique_lock<std::mutex> lock(JobMutex);
        
        //get the future from the task before the task is moved into the jobqueue
        std::future<retType> future = task.get_future();
        //place the job into the queue
        JobQueue.emplace( std::make_shared<AnyJob<retType> > (std::move(task)) );
        //notify a thread that there is a new job
        thread.notify_one();
        //return the future for the function so the user can get the return value
        return future;
    }
    /* utility functions will go here*/
    inline void resize(int newTCount){
        
        int tmp = MAX_THREADS;
        if(newTCount > tmp || newTCount < 1){
            tmp = numThreads;
            numThreads = MAX_THREADS;
            Pool.resize(newTCount);
            for (int i = tmp; i != numThreads; ++i) {
                Pool.emplace_back(std::thread(&ThreadPool::threadManager, this));
                Pool.back().detach();
            }
        }
        else if (newTCount > numThreads) {
            uint8_t tmp = numThreads;
            numThreads = newTCount;
            Pool.resize(numThreads);
            for (int i = tmp; i != numThreads; ++i) {
                Pool.emplace_back(std::thread(&ThreadPool::threadManager, this));
                Pool.back().detach();
            }
        }
        else {
            numThreads = (uint8_t)newTCount;
            Pool.resize(newTCount);
        }
        
    }
    inline uint8_t getThreadCount(){
        return numThreads;
    }
private:
    //used polymorphism to store any type of function in the job queue
    class Job {
    private:
        std::packaged_task<void()> func;
    public:
        virtual ~Job() {}
        virtual void execute() = 0;
    };
    template <typename RetType>
    class AnyJob : public Job {
    private:
        std::packaged_task<RetType()> func;
    public:
        AnyJob(std::packaged_task<RetType()> func) : func(std::move(func)) {}
        void execute() {
            func();
        }
    }; 
    // end member classes
    //member variables
    uint8_t numThreads; // number of threads in the pool
    std::vector<std::thread> Pool; //the actual thread pool
    std::queue<std::shared_ptr<Job>> JobQueue;
    std::condition_variable thread;// used to notify threads about available jobs
    std::mutex JobMutex; // used to push/pop jobs to/from the queue
    //end member variables
    /* infinite loop function */
    inline void threadManager() {
        while (true) {
            std::unique_lock<std::mutex> lock(JobMutex);
            thread.wait(lock, [this] {return !JobQueue.empty(); });
            //strange bug where it will continue even if the job queue is empty
            if (JobQueue.size() < 1)
                continue;
            (*JobQueue.front()).execute();
            JobQueue.pop();
        }
    }
    /*  Constructors */
    ThreadPool(); //prevent default constructor from being called
    //real constructor that is used
    inline ThreadPool(uint8_t numThreads) : numThreads(numThreads) {
        int tmp = MAX_THREADS;
        if(numThreads > tmp){
            numThreads = tmp;
        }
        Pool.reserve(numThreads);
        for(int i = 0; i != numThreads; ++i){
            Pool.emplace_back(std::thread(&ThreadPool::threadManager, this));
            Pool.back().detach();
        }
    }
    /* end constructors */
NULL_COPY_AND_ASSIGN(ThreadPool);
}; /* end ThreadPool Class */
Here is example usage:
#include "ThreadPool.h"
#include <iostream>
int main(){
    ThreadPool& pool = ThreadPool::getInstance(4); //create pool with 4 threads
    auto testFunc = [](int x){ return x*x; };
    auto returnValue = pool.push(testFunc, 5);
    std::cout << returnValue.get() << std::endl;
    return 0;
}
EDIT: Here is the current revised code! I still need to clean up the naming convention, and I am considering making it a normal class instead of a singleton. I definitely need help creating a destructor to gracefully shut down these threads, as detach() is seen as bad practice.
There no longer a limit of MAX_THREADS and is not only limited by the size of a uint8_t. I removed threadManager and it is now a lambda inside of the constructor. I also removed all my inline specifiers.
Here is the code:
#pragma once
#include<thread>
#include<vector>
#include<queue>
#include<mutex>
#include<condition_variable>
#include<functional>
#include<future>
//updated C++11 and on way to null copy and assign
//I like to keep it as a macro for big projects so I can just call it on any class
#define NULL_COPY_AND_ASSIGN(T) \
    T(const T& other) = delete; \
    void operator=(const T& other) = delete;
/* ThreadPool class
Still a singleton while I consider removing it */
class ThreadPool{
public:
    //getInstance to allow the second constructor to be called
    static ThreadPool& getInstance(int numThreads){
        static ThreadPool instance(numThreads);
        return instance;
    }
    //add any arg # function to queue
    template <typename Func, typename... Args >
    auto push(Func&& f, Args&&... args){
        //get return type of the function
        typedef decltype(f(args...)) retType;
        //package the task
        std::packaged_task<retType()> task(std::move(std::bind(f, args...)));
        
        //get the future from the task before the task is moved into the jobqueue
        std::future<retType> future = task.get_future();
        //create a new scope so the lock will unlock before notify
        {
            // lock jobqueue mutex, add job to the job queue 
            std::unique_lock<std::mutex> lock(JobMutex);
            //place the job into the queue
            JobQueue.emplace(std::make_shared<AnyJob<retType> >(std::move(task)));
        }
        //notify a thread that there is a new job
        thread.notify_one();
        //return the future for the function so the user can get the return value
        return future;
    }
    /* utility functions will go here*/
   
    uint8_t getThreadCount(){
        return Pool.size();
    }
private:
    class Job {
    public:
        virtual void execute() = 0;
    };
    template <typename RetType>
    class AnyJob : public Job {
    private:
        std::packaged_task<RetType()> func;
    public:
        AnyJob(std::packaged_task<RetType()> func) : func(std::move(func)) {}
        void execute() {
            func();
        }
    };
    std::vector<std::thread> Pool; //the actual thread pool
    std::queue<std::shared_ptr<Job>> JobQueue;
    std::condition_variable thread;// used to notify threads about available jobs
    std::mutex JobMutex; // used to push/pop jobs to/from the queue
    /*  Constructors */
    //real constructor that is used
    ThreadPool(uint8_t numThreads) {
        
        auto threadFunc = [this]() {
            while (true) {
                std::shared_ptr<Job> job;
                //create a new scope so the unique lock will unlock when its no longer needed
                //profiling revealed that this function was holding the lock while executing the function, prevent other jobs from running!
                {
                    std::unique_lock<std::mutex> lock(JobMutex);
                    thread.wait(lock, [this] {return !JobQueue.empty(); });
                    //strange bug where it will continue even if the job queue is empty
                    if (JobQueue.size() < 1)
                        continue;
                    job = JobQueue.front();
                    JobQueue.pop();
                } 
                (*job).execute();
            }
        
        };
        Pool.reserve(numThreads);
        for(int i = 0; i != numThreads; ++i){
            Pool.emplace_back(std::thread(threadFunc));
            Pool.back().detach();
            //need help here, I could not get this to compile without using detach()
        }
    }
    /* end constructors */
NULL_COPY_AND_ASSIGN(ThreadPool);
}; /* end ThreadPool Class */
Thank you all for the great feedback so far! Ill still be making edits based on future comments/ideas/additions!