Revisions to dataframe replace (numeric) categorical values by their frequency of label = 1

deleted 35 characters in body

Source Link

edited Jun 4, 2022 at 17:05

71.1k
5
76
256

import pandas as pd

# Toy data: two categorical columns and a binary (0/1) label.
df = pd.DataFrame({
    'A':     ('a1', 'a2', 'a1', 'a1', 'a2'),
    'B':     ('b1', 'b3', 'b2', 'b1', 'b3'),
    'label': (   0,    0,    1,    1,    0),
})


def make_counts(col: str) -> pd.Series:
    """Return the per-row mean of ``label`` within each category of ``col``.

    Rows of the module-level ``df`` are grouped by ``col``; every row
    receives the mean ``label`` of its own group. With labels restricted
    to 0 and 1 that mean is the fraction of rows in the group whose
    label is 1.

    Args:
        col: Name of the categorical column of ``df`` to group by.

    Returns:
        A Series aligned with ``df``'s index holding the group means.
    """
    # transform() broadcasts each group's mean back onto the original rows,
    # so no intermediate lookup table or replace step is needed.
    return df.groupby(col).label.transform('mean')


# Overwrite each categorical column with its per-category label mean.
for col in ('A', 'B'):
    df[col] = make_counts(col)

import pandas as pd

# Toy data: two categorical columns and a binary (0/1) label.
df = pd.DataFrame({
    'A':     ('a1', 'a2', 'a1', 'a1', 'a2'),
    'B':     ('b1', 'b3', 'b2', 'b1', 'b3'),
    'label': (   0,    0,    1,    1,    0),
})


def make_counts(col: str) -> pd.Series:
    """Return the per-row mean of ``label`` within each category of ``col``.

    The mean ``label`` of every category of ``col`` in the module-level
    ``df`` is computed once, then each row's category is looked up in
    that table. With labels restricted to 0 and 1 the mean is the
    fraction of rows in the category whose label is 1.

    Args:
        col: Name of the categorical column of ``df`` to group by.

    Returns:
        A float Series aligned with ``df``'s index holding the group means.
    """
    means = df.groupby(col).label.mean()
    # map() performs a direct index lookup and yields a float result;
    # replace() with a dict would leave an object column that pandas then
    # has to downcast implicitly (deprecated behaviour in recent releases).
    return df[col].map(means)


# Overwrite each categorical column with its per-category label mean.
for col in ('A', 'B'):
    df[col] = make_counts(col)

import pandas as pd

# Example frame: categorical columns 'A' and 'B' plus a 0/1 'label' column.
df = pd.DataFrame({
    'A': ('a1', 'a2', 'a1', 'a1', 'a2'),
    'B': ('b1', 'b3', 'b2', 'b1', 'b3'),
    'label': (0, 0, 1, 1, 0),
})


def make_counts(col: str) -> pd.Series:
    """Broadcast the mean ``label`` of each ``col`` category onto its rows.

    Args:
        col: Column of the module-level ``df`` whose values define the groups.

    Returns:
        A Series indexed like ``df`` containing, for every row, the mean
        of ``label`` over all rows sharing that row's ``col`` value.
    """
    labels_by_category = df.groupby(col)['label']
    return labels_by_category.transform('mean')


# Replace both categorical columns, one after the other, with their means.
for col in ['A', 'B']:
    df[col] = make_counts(col)

added 567 characters in body

Source Link

edited Jun 4, 2022 at 16:47

Reinderien

71.1k
5
76
256

Assuming that "whatever you're actually doing" still uses only 0 or 1 for your labels, you should actually re-interpret this as a grouped mean rather than a grouped count:

import pandas as pd

# Toy data: two categorical columns and a binary (0/1) label.
df = pd.DataFrame({
    'A':     ('a1', 'a2', 'a1', 'a1', 'a2'),
    'B':     ('b1', 'b3', 'b2', 'b1', 'b3'),
    'label': (   0,    0,    1,    1,    0),
})


def make_counts(col: str) -> pd.Series:
    """Return the per-row mean of ``label`` within each category of ``col``.

    The mean ``label`` of every category of ``col`` in the module-level
    ``df`` is computed once, then each row's category is looked up in
    that table. With labels restricted to 0 and 1 the mean is the
    fraction of rows in the category whose label is 1.

    Args:
        col: Name of the categorical column of ``df`` to group by.

    Returns:
        A float Series aligned with ``df``'s index holding the group means.
    """
    means = df.groupby(col).label.mean()
    # map() performs a direct index lookup and yields a float result;
    # replace() with a dict would leave an object column that pandas then
    # has to downcast implicitly (deprecated behaviour in recent releases).
    return df[col].map(means)


# Overwrite each categorical column with its per-category label mean.
for col in ('A', 'B'):
    df[col] = make_counts(col)

Assuming that "whatever you're actually doing" still uses only 0 or 1 for your labels, you should actually re-interpret this as a grouped mean rather than a grouped count:

import pandas as pd

# Toy data: two categorical columns and a binary (0/1) label.
df = pd.DataFrame({
    'A':     ('a1', 'a2', 'a1', 'a1', 'a2'),
    'B':     ('b1', 'b3', 'b2', 'b1', 'b3'),
    'label': (   0,    0,    1,    1,    0),
})


def make_counts(col: str) -> pd.Series:
    """Return the per-row mean of ``label`` within each category of ``col``.

    The mean ``label`` of every category of ``col`` in the module-level
    ``df`` is computed once, then each row's category is looked up in
    that table. With labels restricted to 0 and 1 the mean is the
    fraction of rows in the category whose label is 1.

    Args:
        col: Name of the categorical column of ``df`` to group by.

    Returns:
        A float Series aligned with ``df``'s index holding the group means.
    """
    means = df.groupby(col).label.mean()
    # map() performs a direct index lookup and yields a float result;
    # replace() with a dict would leave an object column that pandas then
    # has to downcast implicitly (deprecated behaviour in recent releases).
    return df[col].map(means)


# Overwrite each categorical column with its per-category label mean.
for col in ('A', 'B'):
    df[col] = make_counts(col)

Source Link

answered Jun 4, 2022 at 14:42

Reinderien

71.1k
5
76
256

Avoid for-loops and avoid unique() in this case. Fundamentally you're doing a grouped count, so use Pandas built-in grouping support which is vectorised. Since your numerator is filtering on label, after grouping you need to join (merge) and fillna on missing values that had no label=1.

Don't construct a dict manually. Once you have a replacement series with a proper index based on the original values, you can just call to_dict() on it.

import pandas as pd

# Toy data: two categorical columns and a binary (0/1) label.
df = pd.DataFrame({
    'A':     ('a1', 'a2', 'a1', 'a1', 'a2'),
    'B':     ('b1', 'b3', 'b2', 'b1', 'b3'),
    'label': (   0,    0,    1,    1,    0),
})


def make_counts(col: str) -> pd.Series:
    """Return, per row, the fraction of label-1 rows in that row's category.

    For every category of ``col`` in the module-level ``df`` this counts
    the rows whose ``label`` equals 1 and divides by the category's total
    row count; each row then receives the fraction of its own category.

    Args:
        col: Name of the categorical column of ``df`` to group by.

    Returns:
        A float Series aligned with ``df``'s index holding the fractions.
        Categories without any label-1 row get 0.0.
    """
    # Denominator: number of rows in each category.
    count = df.groupby(col).size().rename('count_')
    # Numerator: rows with label == 1 per category. Filtering before the
    # grouping keeps this valid (simply empty) when no row has label 1,
    # where selecting label 1 from a (col, label) MultiIndex would raise.
    positive = df.loc[df['label'] == 1].groupby(col).size().rename('positive')
    # Right join keeps every category; those absent from the numerator
    # come out as NaN and are zero-filled below.
    fractions = pd.merge(
        positive, count, how='right', left_index=True, right_index=True,
    )
    replacement = (fractions['positive'].fillna(0) / fractions['count_']).to_dict()
    # map() looks each value up directly and yields floats, avoiding the
    # implicit object downcast that replace() with a dict depends on.
    return df[col].map(replacement)


# Overwrite each categorical column with its per-category fraction.
for col in ('A', 'B'):
    df[col] = make_counts(col)

print(df)
'''
          A    B  label
0  0.666667  0.5      0
1  0.000000  0.0      0
2  0.666667  1.0      1
3  0.666667  0.5      1
4  0.000000  0.0      0
'''

Stack Exchange Network

Return to Answer