I'm trying to implement a homogeneous multithreading example in which multiple threads each process a portion of a huge task. To achieve this, I thought of clustering the data/resources and having a separate thread handle/process each cluster. At the same time, I wanted to keep a singleton-like structure that can manage all of the clustered data from the outside as if it were not clustered.
A code example of the design follows. (Apologies in advance if my use of the Interlocked family of functions bothers you.)
#include <map>
#include <Windows.h> // for the Interlocked* functions

struct SResource
{
    unsigned char ucA;
    unsigned long long ullB;
    unsigned char pBuffer[1024];
};
class CResourceManager
{
private:
    static CResourceManager s_pManagers[16];
    long long m_llLock = 0LL; // 0 = unlocked, 1 = locked (simple spin lock)
    std::map<unsigned long long, SResource*> m_mapResources;
    CResourceManager(){}
    ~CResourceManager(){}
public:
    enum class EResult : unsigned long long
    {
        None = 0ULL,
        Success = 1ULL,
        Fail_ReachedMaximumTrial = 2ULL,
        Fail_ArgumentNull = 3ULL,
        Fail_ArgumentInvalid = 4ULL
    };
    static const EResult Do(const unsigned long long _ullIndex, const unsigned long long _ullTrialCount = 65535ULL)
    {
        // _ullIndex is unsigned, so only the upper bound needs checking
        if (_ullIndex >= 16ULL)
        {
            return EResult::Fail_ArgumentInvalid;
        }
        for (unsigned long long i = 0ULL; i < _ullTrialCount; ++i)
        {
            // Try to acquire this cluster's spin lock (0 -> 1)
            if (InterlockedCompareExchange64(&s_pManagers[_ullIndex].m_llLock, 1LL, 0LL) == 0LL)
            {
                std::map<unsigned long long, SResource*>::iterator iterEnd = s_pManagers[_ullIndex].m_mapResources.end();
                for (std::map<unsigned long long, SResource*>::iterator iter = s_pManagers[_ullIndex].m_mapResources.begin(); iter != iterEnd; ++iter)
                {
                    if (iter->second != nullptr)
                    {
                        // Do something with each resource
                    }
                }
                InterlockedExchange64(&s_pManagers[_ullIndex].m_llLock, 0LL); // release the lock
                return EResult::Success;
            }
        }
        return EResult::Fail_ReachedMaximumTrial;
    }
    static const EResult GetClusterIndex(unsigned long long _ullResourceID, unsigned long long* _pIndex, const unsigned long long _ullTrialCount = 65535ULL)
    {
        if (_pIndex == nullptr)
        {
            return EResult::Fail_ArgumentNull;
        }
        for (unsigned long long i = 0ULL; i < 16; ++i)
        {
            for (unsigned long long j = 0ULL; j < _ullTrialCount; ++j)
            {
                if (InterlockedCompareExchange64(&s_pManagers[i].m_llLock, 1LL, 0LL) == 0LL)
                {
                    std::map<unsigned long long, SResource*>::iterator iter = s_pManagers[i].m_mapResources.find(_ullResourceID);
                    if (iter != s_pManagers[i].m_mapResources.end())
                    {
                        (*_pIndex) = i;
                        InterlockedExchange64(&s_pManagers[i].m_llLock, 0LL);
                        return EResult::Success;
                    }
                    InterlockedExchange64(&s_pManagers[i].m_llLock, 0LL);
                    break;
                }
            }
        }
        return EResult::Fail_ReachedMaximumTrial;
    }
};
// Out-of-class definition for the static array of cluster managers
CResourceManager CResourceManager::s_pManagers[16];
A pointer to the Do function will be inserted into my thread pool, either directly or wrapped in a lambda.
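For illustration, here is roughly how I plan to wrap it; std::thread stands in for my actual thread pool here, and RunAllClusters with its fixed loop over the 16 clusters is just a sketch of the intent, not the real submission code:

#include <thread>
#include <vector>

void RunAllClusters()
{
    std::vector<std::thread> vecThreads;
    for (unsigned long long ullIndex = 0ULL; ullIndex < 16ULL; ++ullIndex)
    {
        // Each worker processes exactly one cluster via the wrapping lambda
        vecThreads.emplace_back([ullIndex]()
        {
            const CResourceManager::EResult eResult = CResourceManager::Do(ullIndex);
            (void)eResult; // placeholder: e.g. retry on Fail_ReachedMaximumTrial
        });
    }
    for (std::thread& thread : vecThreads)
    {
        thread.join();
    }
}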
I wonder: is my concept of clustering data per thread a bad idea?
Also, is an expression like s_pManagers[_ullIndex].m_llLock safe to use this way?
My first design was a readers-writer lock over s_pManagers, but having even that one tiny lock in front of everything bothered me so much.
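For comparison, this is a minimal sketch of what that first design looked like, using std::shared_mutex as the readers-writer lock; the class and function names (CResourceManagerRW, Find, Do) are only illustrative:

#include <map>
#include <shared_mutex>

class CResourceManagerRW
{
private:
    static std::shared_mutex s_mtxResources;                        // single readers-writer lock
    static std::map<unsigned long long, SResource*> s_mapResources; // one flat map, no clusters
public:
    static SResource* Find(const unsigned long long _ullResourceID)
    {
        // Readers (lookups) may run concurrently under a shared lock
        std::shared_lock<std::shared_mutex> lock(s_mtxResources);
        std::map<unsigned long long, SResource*>::iterator iter = s_mapResources.find(_ullResourceID);
        return (iter != s_mapResources.end()) ? iter->second : nullptr;
    }
    static void Do()
    {
        // The processing pass takes the lock exclusively
        std::unique_lock<std::shared_mutex> lock(s_mtxResources);
        for (std::map<unsigned long long, SResource*>::iterator iter = s_mapResources.begin(); iter != s_mapResources.end(); ++iter)
        {
            if (iter->second != nullptr)
            {
                // Do something with each resource
            }
        }
    }
};
std::shared_mutex CResourceManagerRW::s_mtxResources;
std::map<unsigned long long, SResource*> CResourceManagerRW::s_mapResources;

With a single lock like this, every Do pass over the whole map serializes against all other workers, which is part of what pushed me toward the per-cluster approach above.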
