Skip to main content
added 1483 characters in body
Source Link
Reinderien
  • 71.1k
  • 5
  • 76
  • 256

In the latter case, at the very least, demote that to a clip(); but with this approach this program will be a lie. Your target means will not be the means of tendency in its current form. There are options to get around this, each requiring a lie of some kind:

  • use the clipped normal and add a maximum twice the mean, which achieves the same mean but probably won't have the same standard deviation and won't be exactly normal;
  • use a uniform distribution instead with bounds chosen to get your same mean and standard deviation (which is only possible if the difference between your mean and minimum is less than sqrt 3 * your stdev); or
  • use a non-normal distribution such as lognorm, which achieves the same mean and standard deviation but will lose the normality you attempted.

Clipped normals

import numpy as np
import pandas as pd

from numpy.random import default_rng


# Fixed seed so the generated test data is reproducible run-to-run.
rand = default_rng(seed=0)
start_date = '2021-01-01'
end_date = '2023-06-28'

# Per-warehouse target statistics: mean/st.dev. of the daily detail-row count
# and of the counted (CntQty) and system (SysQty) quantities.
warehouse_data = {
    "A": {
        "avg_rowcount": 10,
        "stDev_rowcount": 2,
        "avg_CntQty": 17,
        "stDev_CntQty": 143,
        "avg_SysQty": 17,
        "stDev_SysQty": 143
    },
    "B": {
        "avg_rowcount": 10,
        "stDev_rowcount": 2,
        "avg_CntQty": 27,
        "stDev_CntQty": 193,
        "avg_SysQty": 27,
        "stDev_SysQty": 309
    },
    "C": {
        "avg_rowcount": 3,
        "stDev_rowcount": 2,
        "avg_CntQty": 50,
        "stDev_CntQty": 310,
        "avg_SysQty": 51,
        "stDev_SysQty": 3090
    },
    "D": {
        "avg_rowcount": 36,
        "stDev_rowcount": 99,
        "avg_CntQty": 22,
        "stDev_CntQty": 31,
        "avg_SysQty": 21,
        "stDev_SysQty": 31
    },
    "E": {
        "avg_rowcount": 35,
        "stDev_rowcount": 120,
        "avg_CntQty": 40,
        "stDev_CntQty": 116,
        "avg_SysQty": 40,
        "stDev_SysQty": 116
    },
    "F": {
        "avg_rowcount": 4,
        "stDev_rowcount": 2,
        "avg_CntQty": 89,
        "stDev_CntQty": 3352,
        "avg_SysQty": 88,
        "stDev_SysQty": 3359
    },
    "G": {
        "avg_rowcount": 2,
        "stDev_rowcount": 2,
        "avg_CntQty": 599,
        "stDev_CntQty": 28430,
        "avg_SysQty": 599,
        "stDev_SysQty": 28430
    }
}
# Transpose so that each warehouse is one row, keyed by its ID.
warehouses = pd.DataFrame.from_dict(warehouse_data).T
warehouses.index.name = 'Warehouse_ID'

# Cross join: one row per (warehouse, date) pair over the full date range.
with_dates = pd.merge(
    left=warehouses.reset_index(),
    right=pd.date_range(start=start_date, end=end_date, name='Date').to_series(),
    how='cross',
)

# Rows per (warehouse, date): a rounded normal draw clipped symmetrically to
# [1, 2*mean - 1] so the clip is unbiased about the target mean.
# (The a_max expressions below repair revision-diff corruption of the form
# "a_max=Nonea_max=..."; the intended final revision keeps the 2*mean bound.)
with_dates['n_rows'] = np.clip(
    a=np.round(
        rand.normal(
            loc=with_dates.avg_rowcount,
            scale=with_dates.stDev_rowcount,
        ),
    ).astype(int),
    a_min=1, a_max=2*with_dates.avg_rowcount - 1,
)

# Repeat each (warehouse, date) row n_rows times to materialise detail rows.
filled = with_dates.loc[with_dates.index.repeat(with_dates.n_rows)]

output = filled[['Warehouse_ID', 'Date']].copy()
# Quantities: normal draws clipped symmetrically to [0, 2*mean], preserving
# the target mean though not the exact st.dev. or normality.
output['CntQty'] = np.clip(
    rand.normal(
        loc=filled.avg_CntQty,
        scale=filled.stDev_CntQty,
        size=len(filled),
    ),
    a_min=0, a_max=2*filled.avg_CntQty,
)
output['SysQty'] = np.clip(
    rand.normal(
        loc=filled.avg_SysQty,
        scale=filled.stDev_SysQty,
        size=len(filled),
    ),
    a_min=0, a_max=2*filled.avg_SysQty,
)

# One CSV per warehouse, e.g. 'ATaskDetails.csv'.
for warehouse_id, group in output.groupby('Warehouse_ID'):
    group.to_csv(
        path_or_buf=warehouse_id + 'TaskDetails.csv',
        index=False,
    )

Scaled uniform

Generate your randoms like this:

def uniform_with_std(mean: np.ndarray, std: np.ndarray, min: float) -> np.ndarray:
    """
    Draw uniform randoms whose mean matches *mean* and, where feasible, whose
    standard deviation matches *std*.

    A uniform on [lo, hi] has standard deviation (hi - lo) / (2*sqrt(3)), so
    the bounds are placed sqrt(3)*std either side of the mean. If that would
    push the lower bound below *min*, the lower bound is raised to *min* and
    the upper bound mirrored about the mean — the floor is honoured at the
    cost of the target standard deviation.
    """
    half_width = std * np.sqrt(3)
    lower = np.clip(a=mean - half_width, a_min=min, a_max=None)
    upper = 2*mean - lower
    return rand.uniform(low=lower, high=upper, size=mean.size)

This ends up "working" only for your n_rows and ignoring your standard deviations for everything else because they're too high.

Lognorm

Generate your randoms via

# Moment-matched log-normal: for a target mean u and standard deviation s,
# these underlying-normal parameters yield E[dist] == u and Std[dist] == s.
# (u, s, and size=... are placeholders for your avg_*/stDev_* arrays and the
# desired sample count.)
dist = rand.lognormal(
    mean=np.log(u*u / np.sqrt(u*u + s*s)),
    sigma=np.sqrt(np.log(1 + s*s/u/u)),
    size=...,
)

This is guaranteed to generate non-negative numbers and respect both your mean (u) and standard deviation (s), but is not normally distributed.

In the latter case, at the very least, demote that to a clip(). But please, please keep in mind that this program will be a lie. Your target means will not be the means of tendency in its current form. There are options to get around this, each requiring a lie of some kind: use a non-normal distribution, which achieves the same mean and standard deviation, or use the clipped normal and add a maximum twice the mean.

Suggested

import numpy as np
import pandas as pd

from numpy.random import default_rng


# Fixed seed so the generated test data is reproducible run-to-run.
rand = default_rng(seed=0)
start_date = '2021-01-01'
end_date = '2023-06-28'

# Per-warehouse target statistics: mean/st.dev. of the daily detail-row count
# and of the counted (CntQty) and system (SysQty) quantities.
warehouse_data = {
    "A": {
        "avg_rowcount": 10,
        "stDev_rowcount": 2,
        "avg_CntQty": 17,
        "stDev_CntQty": 143,
        "avg_SysQty": 17,
        "stDev_SysQty": 143
    },
    "B": {
        "avg_rowcount": 10,
        "stDev_rowcount": 2,
        "avg_CntQty": 27,
        "stDev_CntQty": 193,
        "avg_SysQty": 27,
        "stDev_SysQty": 309
    },
    "C": {
        "avg_rowcount": 3,
        "stDev_rowcount": 2,
        "avg_CntQty": 50,
        "stDev_CntQty": 310,
        "avg_SysQty": 51,
        "stDev_SysQty": 3090
    },
    "D": {
        "avg_rowcount": 36,
        "stDev_rowcount": 99,
        "avg_CntQty": 22,
        "stDev_CntQty": 31,
        "avg_SysQty": 21,
        "stDev_SysQty": 31
    },
    "E": {
        "avg_rowcount": 35,
        "stDev_rowcount": 120,
        "avg_CntQty": 40,
        "stDev_CntQty": 116,
        "avg_SysQty": 40,
        "stDev_SysQty": 116
    },
    "F": {
        "avg_rowcount": 4,
        "stDev_rowcount": 2,
        "avg_CntQty": 89,
        "stDev_CntQty": 3352,
        "avg_SysQty": 88,
        "stDev_SysQty": 3359
    },
    "G": {
        "avg_rowcount": 2,
        "stDev_rowcount": 2,
        "avg_CntQty": 599,
        "stDev_CntQty": 28430,
        "avg_SysQty": 599,
        "stDev_SysQty": 28430
    }
}
# Transpose so that each warehouse is one row, keyed by its ID.
warehouses = pd.DataFrame.from_dict(warehouse_data).T
warehouses.index.name = 'Warehouse_ID'

# Cross join: one row per (warehouse, date) pair over the full date range.
with_dates = pd.merge(
    left=warehouses.reset_index(),
    right=pd.date_range(start=start_date, end=end_date, name='Date').to_series(),
    how='cross',
)

# Rows per (warehouse, date): rounded normal draw, clipped to at least 1.
# NOTE: the one-sided clip biases the realised mean above avg_rowcount when
# stDev_rowcount is large relative to it.
with_dates['n_rows'] = np.clip(
    a=np.round(
        rand.normal(
            loc=with_dates.avg_rowcount,
            scale=with_dates.stDev_rowcount,
        ),
    ).astype(int),
    a_min=1, a_max=None,
)

# Repeat each (warehouse, date) row n_rows times to materialise detail rows.
filled = with_dates.loc[with_dates.index.repeat(with_dates.n_rows)]

output = filled[['Warehouse_ID', 'Date']].copy()
# Quantities: normal draws clipped at 0; the one-sided clip skews the realised
# mean and st.dev. away from the targets (the point made in the prose above).
output['CntQty'] = np.clip(
    rand.normal(
        loc=filled.avg_CntQty,
        scale=filled.stDev_CntQty,
        size=len(filled),
    ),
    a_min=0, a_max=None,
)
output['SysQty'] = np.clip(
    rand.normal(
        loc=filled.avg_SysQty,
        scale=filled.stDev_SysQty,
        size=len(filled),
    ),
    a_min=0, a_max=None,
)

# One CSV per warehouse, e.g. 'ATaskDetails.csv'.
for warehouse_id, group in output.groupby('Warehouse_ID'):
    group.to_csv(
        path_or_buf=warehouse_id + 'TaskDetails.csv',
        index=False,
    )

In the latter case, at the very least, demote that to a clip(); but with this approach this program will be a lie. Your target means will not be the means of tendency in its current form. There are options to get around this, each requiring a lie of some kind:

  • use the clipped normal and add a maximum twice the mean, which achieves the same mean but probably won't have the same standard deviation and won't be exactly normal;
  • use a uniform distribution instead with bounds chosen to get your same mean and standard deviation (which is only possible if the difference between your mean and minimum is less than sqrt 3 * your stdev); or
  • use a non-normal distribution such as lognorm, which achieves the same mean and standard deviation but will lose the normality you attempted.

Clipped normals

import numpy as np
import pandas as pd

from numpy.random import default_rng


# Fixed seed so the generated test data is reproducible run-to-run.
rand = default_rng(seed=0)
start_date = '2021-01-01'
end_date = '2023-06-28'

# Per-warehouse target statistics: mean/st.dev. of the daily detail-row count
# and of the counted (CntQty) and system (SysQty) quantities.
warehouse_data = {
    "A": {
        "avg_rowcount": 10,
        "stDev_rowcount": 2,
        "avg_CntQty": 17,
        "stDev_CntQty": 143,
        "avg_SysQty": 17,
        "stDev_SysQty": 143
    },
    "B": {
        "avg_rowcount": 10,
        "stDev_rowcount": 2,
        "avg_CntQty": 27,
        "stDev_CntQty": 193,
        "avg_SysQty": 27,
        "stDev_SysQty": 309
    },
    "C": {
        "avg_rowcount": 3,
        "stDev_rowcount": 2,
        "avg_CntQty": 50,
        "stDev_CntQty": 310,
        "avg_SysQty": 51,
        "stDev_SysQty": 3090
    },
    "D": {
        "avg_rowcount": 36,
        "stDev_rowcount": 99,
        "avg_CntQty": 22,
        "stDev_CntQty": 31,
        "avg_SysQty": 21,
        "stDev_SysQty": 31
    },
    "E": {
        "avg_rowcount": 35,
        "stDev_rowcount": 120,
        "avg_CntQty": 40,
        "stDev_CntQty": 116,
        "avg_SysQty": 40,
        "stDev_SysQty": 116
    },
    "F": {
        "avg_rowcount": 4,
        "stDev_rowcount": 2,
        "avg_CntQty": 89,
        "stDev_CntQty": 3352,
        "avg_SysQty": 88,
        "stDev_SysQty": 3359
    },
    "G": {
        "avg_rowcount": 2,
        "stDev_rowcount": 2,
        "avg_CntQty": 599,
        "stDev_CntQty": 28430,
        "avg_SysQty": 599,
        "stDev_SysQty": 28430
    }
}
# Transpose so that each warehouse is one row, keyed by its ID.
warehouses = pd.DataFrame.from_dict(warehouse_data).T
warehouses.index.name = 'Warehouse_ID'

# Cross join: one row per (warehouse, date) pair over the full date range.
with_dates = pd.merge(
    left=warehouses.reset_index(),
    right=pd.date_range(start=start_date, end=end_date, name='Date').to_series(),
    how='cross',
)

# Rows per (warehouse, date): rounded normal draw clipped symmetrically to
# [1, 2*mean - 1], so clipping stays unbiased about the target mean.
with_dates['n_rows'] = np.clip(
    a=np.round(
        rand.normal(
            loc=with_dates.avg_rowcount,
            scale=with_dates.stDev_rowcount,
        ),
    ).astype(int),
    a_min=1, a_max=2*with_dates.avg_rowcount - 1,
)

# Repeat each (warehouse, date) row n_rows times to materialise detail rows.
filled = with_dates.loc[with_dates.index.repeat(with_dates.n_rows)]

output = filled[['Warehouse_ID', 'Date']].copy()
# Quantities: normal draws clipped symmetrically to [0, 2*mean], preserving
# the target mean though not the exact st.dev. or normality.
output['CntQty'] = np.clip(
    rand.normal(
        loc=filled.avg_CntQty,
        scale=filled.stDev_CntQty,
        size=len(filled),
    ),
    a_min=0, a_max=2*filled.avg_CntQty,
)
output['SysQty'] = np.clip(
    rand.normal(
        loc=filled.avg_SysQty,
        scale=filled.stDev_SysQty,
        size=len(filled),
    ),
    a_min=0, a_max=2*filled.avg_SysQty,
)

# One CSV per warehouse, e.g. 'ATaskDetails.csv'.
for warehouse_id, group in output.groupby('Warehouse_ID'):
    group.to_csv(
        path_or_buf=warehouse_id + 'TaskDetails.csv',
        index=False,
    )

Scaled uniform

Generate your randoms like this:

def uniform_with_std(mean: np.ndarray, std: np.ndarray, min: float) -> np.ndarray:
    """
    If the standard deviation is small enough, use a random distribution with bounds that will
    produce that standard deviation. Otherwise, ignore it so that we can guarantee nothing
    will be returned below the minimum.
    """
    # A uniform on [low, high] has std == (high - low) / (2*sqrt(3)), hence
    # bounds sqrt(3)*std either side of the mean reproduce the target std.
    # NOTE(review): the `min` parameter shadows the builtin; renaming it would
    # break keyword callers, so it is left as-is.
    offset = np.sqrt(3) * std
    # Raise the lower bound to the floor where needed...
    low = np.clip(a=mean - offset, a_min=min, a_max=None)
    # ...and mirror the upper bound about the mean to keep the mean exact.
    high = 2*mean - low
    return rand.uniform(low=low, high=high, size=mean.size)

This ends up "working" only for your n_rows and ignoring your standard deviations for everything else because they're too high.

Lognorm

Generate your randoms via

# Moment-matched log-normal: for a target mean u and standard deviation s,
# these underlying-normal parameters yield E[dist] == u and Std[dist] == s.
# (u, s, and size=... are placeholders for your avg_*/stDev_* arrays and the
# desired sample count.)
dist = rand.lognormal(
    mean=np.log(u*u / np.sqrt(u*u + s*s)),
    sigma=np.sqrt(np.log(1 + s*s/u/u)),
    size=...,
)

This is guaranteed to generate non-negative numbers and respect both your mean (u) and standard deviation (s), but is not normally distributed.

added 238 characters in body
Source Link
Reinderien
  • 71.1k
  • 5
  • 76
  • 256

In the latter case, at the very least, demote that to a clip(). But please, please keep in mind that this program will be a lie. Your target means will not be the means of tendency in its current form. There are options to get around this, each requiring a lie of some kind: use a non-normal distribution, which achieves the same mean and standard deviation, or use the clipped normal and add a maximum twice the mean.

In the latter case, at the very least, demote that to a clip(). But please, please keep in mind that this program will be a lie. Your target means will not be the means of tendency.

In the latter case, at the very least, demote that to a clip(). But please, please keep in mind that this program will be a lie. Your target means will not be the means of tendency in its current form. There are options to get around this, each requiring a lie of some kind: use a non-normal distribution, which achieves the same mean and standard deviation, or use the clipped normal and add a maximum twice the mean.

Source Link
Reinderien
  • 71.1k
  • 5
  • 76
  • 256

Forgive my saying so, but

  • Clipping a normal distribution to have a minimum of 0 introduces significant statistical bias and doesn't make much sense;
  • ceil() also introduces bias; and
  • Applying an abs() makes far less sense and will produce a bizarre distribution indeed.

In the latter case, at the very least, demote that to a clip(). But please, please keep in mind that this program will be a lie. Your target means will not be the means of tendency.

Delete all of your code and replace it with a vectorised version that uses Numpy and Pandas. Don't iterate over each row.

When generating test data for most applications, it's very important that you set a constant (or at least explicitly-determined) random seed.

Suggested

import numpy as np
import pandas as pd

from numpy.random import default_rng


# Fixed seed so the generated test data is reproducible run-to-run.
rand = default_rng(seed=0)
start_date = '2021-01-01'
end_date = '2023-06-28'

# Per-warehouse target statistics: mean/st.dev. of the daily detail-row count
# and of the counted (CntQty) and system (SysQty) quantities.
warehouse_data = {
    "A": {
        "avg_rowcount": 10,
        "stDev_rowcount": 2,
        "avg_CntQty": 17,
        "stDev_CntQty": 143,
        "avg_SysQty": 17,
        "stDev_SysQty": 143
    },
    "B": {
        "avg_rowcount": 10,
        "stDev_rowcount": 2,
        "avg_CntQty": 27,
        "stDev_CntQty": 193,
        "avg_SysQty": 27,
        "stDev_SysQty": 309
    },
    "C": {
        "avg_rowcount": 3,
        "stDev_rowcount": 2,
        "avg_CntQty": 50,
        "stDev_CntQty": 310,
        "avg_SysQty": 51,
        "stDev_SysQty": 3090
    },
    "D": {
        "avg_rowcount": 36,
        "stDev_rowcount": 99,
        "avg_CntQty": 22,
        "stDev_CntQty": 31,
        "avg_SysQty": 21,
        "stDev_SysQty": 31
    },
    "E": {
        "avg_rowcount": 35,
        "stDev_rowcount": 120,
        "avg_CntQty": 40,
        "stDev_CntQty": 116,
        "avg_SysQty": 40,
        "stDev_SysQty": 116
    },
    "F": {
        "avg_rowcount": 4,
        "stDev_rowcount": 2,
        "avg_CntQty": 89,
        "stDev_CntQty": 3352,
        "avg_SysQty": 88,
        "stDev_SysQty": 3359
    },
    "G": {
        "avg_rowcount": 2,
        "stDev_rowcount": 2,
        "avg_CntQty": 599,
        "stDev_CntQty": 28430,
        "avg_SysQty": 599,
        "stDev_SysQty": 28430
    }
}
# Transpose so that each warehouse is one row, keyed by its ID.
warehouses = pd.DataFrame.from_dict(warehouse_data).T
warehouses.index.name = 'Warehouse_ID'

# Cross join: one row per (warehouse, date) pair over the full date range.
with_dates = pd.merge(
    left=warehouses.reset_index(),
    right=pd.date_range(start=start_date, end=end_date, name='Date').to_series(),
    how='cross',
)

# Rows per (warehouse, date): rounded normal draw, clipped to at least 1.
# NOTE: the one-sided clip biases the realised mean above avg_rowcount when
# stDev_rowcount is large relative to it.
with_dates['n_rows'] = np.clip(
    a=np.round(
        rand.normal(
            loc=with_dates.avg_rowcount,
            scale=with_dates.stDev_rowcount,
        ),
    ).astype(int),
    a_min=1, a_max=None,
)

# Repeat each (warehouse, date) row n_rows times to materialise detail rows.
filled = with_dates.loc[with_dates.index.repeat(with_dates.n_rows)]

output = filled[['Warehouse_ID', 'Date']].copy()
# Quantities: normal draws clipped at 0; the one-sided clip skews the realised
# mean and st.dev. away from the targets (the point made in the prose above).
output['CntQty'] = np.clip(
    rand.normal(
        loc=filled.avg_CntQty,
        scale=filled.stDev_CntQty,
        size=len(filled),
    ),
    a_min=0, a_max=None,
)
output['SysQty'] = np.clip(
    rand.normal(
        loc=filled.avg_SysQty,
        scale=filled.stDev_SysQty,
        size=len(filled),
    ),
    a_min=0, a_max=None,
)

# One CSV per warehouse, e.g. 'ATaskDetails.csv'.
for warehouse_id, group in output.groupby('Warehouse_ID'):
    group.to_csv(
        path_or_buf=warehouse_id + 'TaskDetails.csv',
        index=False,
    )