In the latter case, at the very least, demote that to a clip(). But please keep in mind that with this approach this program will be a lie: your target means will not be the actual means of central tendency of the generated data. There are options to get around this, each requiring a compromise of some kind:
- use the clipped normal and add a maximum twice the mean, which achieves the same mean but probably won't have the same standard deviation and won't be exactly normal;
- use a uniform distribution instead with bounds chosen to get your same mean and standard deviation (which is only possible if the difference between your mean and minimum is less than sqrt 3 * your stdev); or
- use a non-normal distribution such as lognorm, which achieves the same mean and standard deviation but will lose the normality you attempted.
Suggested: clipped normals
import numpy as np
import pandas as pd
from numpy.random import default_rng
# Seeded generator so the synthetic data is reproducible across runs.
rand = default_rng(0)
# Inclusive bounds of the simulated date range.
start_date, end_date = '2021-01-01', '2023-06-28'
# Per-warehouse distribution parameters: mean/stdev of the daily row count
# and of the counted (CntQty) and system (SysQty) quantities, in the order
# listed in _stat_fields.
_stat_fields = (
    "avg_rowcount", "stDev_rowcount",
    "avg_CntQty", "stDev_CntQty",
    "avg_SysQty", "stDev_SysQty",
)
_stat_rows = {
    "A": (10, 2, 17, 143, 17, 143),
    "B": (10, 2, 27, 193, 27, 309),
    "C": (3, 2, 50, 310, 51, 3090),
    "D": (36, 99, 22, 31, 21, 31),
    "E": (35, 120, 40, 116, 40, 116),
    "F": (4, 2, 89, 3352, 88, 3359),
    "G": (2, 2, 599, 28430, 599, 28430),
}
warehouse_data = {
    wh: dict(zip(_stat_fields, values))
    for wh, values in _stat_rows.items()
}
# One row per warehouse, with the warehouse letter as the named index.
warehouses = pd.DataFrame(warehouse_data).T.rename_axis('Warehouse_ID')
# Cross-join every warehouse against every date in the simulated range.
with_dates = warehouses.reset_index().merge(
    pd.date_range(start=start_date, end=end_date, name='Date').to_series(),
    how='cross',
)
# Row count for each (warehouse, date) pair: a rounded normal draw, clipped
# to [1, 2*mean - 1]. The original line fused two keyword arguments
# ("a_max=Nonea_max=..."), which is a syntax error; the symmetric upper cap
# keeps the clipping centred on the mean (approximately preserving it),
# while the floor of 1 guarantees every pair emits at least one row.
with_dates['n_rows'] = np.clip(
    a=np.round(
        rand.normal(
            loc=with_dates.avg_rowcount,
            scale=with_dates.stDev_rowcount,
        ),
    ).astype(int),
    a_min=1, a_max=2*with_dates.avg_rowcount - 1,
)
# Expand each (warehouse, date) row into its n_rows detail rows.
filled = with_dates.loc[with_dates.index.repeat(with_dates.n_rows)]
output = filled[['Warehouse_ID', 'Date']].copy()
# Counted quantity: normal draw clipped to [0, 2*mean]. The original line
# fused two keyword arguments ("a_max=Nonea_max=..."), a syntax error; the
# cap at twice the mean keeps the clipping symmetric about the mean so the
# target mean is approximately preserved.
output['CntQty'] = np.clip(
    rand.normal(
        loc=filled.avg_CntQty,
        scale=filled.stDev_CntQty,
        size=len(filled),
    ),
    a_min=0, a_max=2*filled.avg_CntQty,
)
# System quantity: same scheme as CntQty — normal draw clipped to
# [0, 2*mean]. The original line fused two keyword arguments
# ("a_max=Nonea_max=..."), a syntax error; the symmetric cap at twice the
# mean approximately preserves the target mean.
output['SysQty'] = np.clip(
    rand.normal(
        loc=filled.avg_SysQty,
        scale=filled.stDev_SysQty,
        size=len(filled),
    ),
    a_min=0, a_max=2*filled.avg_SysQty,
)
# Emit one CSV per warehouse, named e.g. "ATaskDetails.csv".
for warehouse_id, group in output.groupby('Warehouse_ID'):
    group.to_csv(path_or_buf=f'{warehouse_id}TaskDetails.csv', index=False)
Scaled uniform
Generate your randoms like this:
def uniform_with_std(mean: np.ndarray, std: np.ndarray, min: float) -> np.ndarray:
    """
    Draw uniform randoms whose mean matches *mean* and, where feasible,
    whose standard deviation matches *std*.

    A uniform on [a, b] has stdev (b - a) / (2*sqrt(3)), so the interval is
    centred on the mean with half-width sqrt(3)*std. Where that lower bound
    would fall below *min*, it is raised to *min* instead — the stdev target
    is sacrificed so that no draw can come out below the minimum.
    """
    half_width = np.sqrt(3) * std
    low = np.clip(a=mean - half_width, a_min=min, a_max=None)
    # The upper bound mirrors the (possibly clipped) lower bound about the
    # mean, so the interval midpoint — and hence the mean — is preserved.
    return rand.uniform(low=low, high=2*mean - low, size=mean.size)
This ends up "working" only for your n_rows and ignoring your standard deviations for everything else because they're too high.
Lognorm
Generate your randoms via
# Moment-matched lognormal: with u = target mean and s = target standard
# deviation of the output, these parameters set the mean/sigma of the
# underlying normal so the lognormal itself has mean u and stdev s.
# NOTE: illustrative fragment — `u`, `s`, and `size` are placeholders to be
# supplied by the caller.
dist = rand.lognormal(
    mean=np.log(u*u / np.sqrt(u*u + s*s)),
    sigma=np.sqrt(np.log(1 + s*s/u/u)),
    size=...,
)
This is guaranteed to generate non-negative numbers and respect both your mean (u) and standard deviation (s), but is not normally distributed.