Skip to main content
deleted 35 characters in body
Source Link
Reinderien
  • 71.1k
  • 5
  • 76
  • 256
import pandas as pd

# Sample data: two string-valued columns ('A', 'B') and a 0/1 'label' column.
df = pd.DataFrame({
    'A':     ('a1', 'a2', 'a1', 'a1', 'a2'),
    'B':     ('b1', 'b3', 'b2', 'b1', 'b3'),
    'label': (   0,    0,    1,    1,    0),
})


def make_counts(col: str) -> pd.Series:
    """Return the per-row mean of ``label`` within each group of ``col``.

    Reads the module-level ``df``. Rows are grouped by their value in
    ``col``; every row receives the mean ``label`` of the group it belongs to.

    Args:
        col: Name of the column of ``df`` to group by.

    Returns:
        A Series aligned with ``df``'s index holding the group means.
    """
    # transform() broadcasts each group's mean back onto the original rows,
    # so no intermediate lookup table or value replacement is needed.
    return df.groupby(col).label.transform('mean')


# Overwrite columns 'A' and 'B' in place with the Series make_counts returns.
for col in ('A', 'B'):
    df[col] = make_counts(col)
import pandas as pd

# Sample data: two string-valued columns ('A', 'B') and a 0/1 'label' column.
df = pd.DataFrame({
    'A':     ('a1', 'a2', 'a1', 'a1', 'a2'),
    'B':     ('b1', 'b3', 'b2', 'b1', 'b3'),
    'label': (   0,    0,    1,    1,    0),
})


def make_counts(col: str) -> pd.Series:
    """Map every value of ``col`` to the mean ``label`` of its group.

    Reads the module-level ``df``. The mean of ``label`` is computed once per
    distinct value of ``col`` and then looked up for each row.

    Args:
        col: Name of the column of ``df`` to group by and to map.

    Returns:
        A float Series with the same index and name as ``df[col]``.
    """
    means = df.groupby(col)['label'].mean()
    # Series.map looks each value up in the index of `means` and yields a
    # float result directly. The earlier str -> float ``replace`` depended on
    # dtype downcasting that pandas has deprecated, and needed a throwaway dict.
    return df[col].map(means)


# Overwrite columns 'A' and 'B' in place with the Series make_counts returns.
for col in ('A', 'B'):
    df[col] = make_counts(col)
import pandas as pd

# Sample data: two string-valued columns ('A', 'B') and a 0/1 'label' column.
df = pd.DataFrame({
    'A':     ('a1', 'a2', 'a1', 'a1', 'a2'),
    'B':     ('b1', 'b3', 'b2', 'b1', 'b3'),
    'label': (   0,    0,    1,    1,    0),
})


def make_counts(col: str) -> pd.Series:
    """Broadcast each group's mean ``label`` back onto the rows of ``df``.

    Reads the module-level ``df`` and groups it by ``col``.

    Args:
        col: Name of the column of ``df`` to group by.

    Returns:
        A Series aligned with ``df``'s index holding, for every row, the mean
        ``label`` of that row's group.
    """
    labels_by_group = df.groupby(col)['label']
    return labels_by_group.transform('mean')


# Overwrite columns 'A' and 'B' in place with the Series make_counts returns.
for col in ('A', 'B'):
    df[col] = make_counts(col)
added 567 characters in body
Source Link
Reinderien
  • 71.1k
  • 5
  • 76
  • 256

Assuming that "whatever you're actually doing" still uses only 0 or 1 for your labels, you should re-interpret this as a grouped mean rather than a grouped count:

import pandas as pd

# Sample data: two string-valued columns ('A', 'B') and a 0/1 'label' column.
df = pd.DataFrame({
    'A':     ('a1', 'a2', 'a1', 'a1', 'a2'),
    'B':     ('b1', 'b3', 'b2', 'b1', 'b3'),
    'label': (   0,    0,    1,    1,    0),
})


def make_counts(col: str) -> pd.Series:
    """Map every value of ``col`` to the mean ``label`` of its group.

    Reads the module-level ``df``. The mean of ``label`` is computed once per
    distinct value of ``col`` and then looked up for each row.

    Args:
        col: Name of the column of ``df`` to group by and to map.

    Returns:
        A float Series with the same index and name as ``df[col]``.
    """
    means = df.groupby(col)['label'].mean()
    # Series.map looks each value up in the index of `means` and yields a
    # float result directly. The earlier str -> float ``replace`` depended on
    # dtype downcasting that pandas has deprecated, and needed a throwaway dict.
    return df[col].map(means)


# Overwrite columns 'A' and 'B' in place with the Series make_counts returns.
for col in ('A', 'B'):
    df[col] = make_counts(col)

Assuming that "whatever you're actually doing" still uses only 0 or 1 for your labels, you should re-interpret this as a grouped mean rather than a grouped count:

import pandas as pd

# Sample data: two string-valued columns ('A', 'B') and a 0/1 'label' column.
df = pd.DataFrame({
    'A':     ('a1', 'a2', 'a1', 'a1', 'a2'),
    'B':     ('b1', 'b3', 'b2', 'b1', 'b3'),
    'label': (   0,    0,    1,    1,    0),
})


def make_counts(col: str) -> pd.Series:
    """Map every value of ``col`` to the mean ``label`` of its group.

    Reads the module-level ``df``. The mean of ``label`` is computed once per
    distinct value of ``col`` and then looked up for each row.

    Args:
        col: Name of the column of ``df`` to group by and to map.

    Returns:
        A float Series with the same index and name as ``df[col]``.
    """
    means = df.groupby(col)['label'].mean()
    # Series.map looks each value up in the index of `means` and yields a
    # float result directly. The earlier str -> float ``replace`` depended on
    # dtype downcasting that pandas has deprecated, and needed a throwaway dict.
    return df[col].map(means)


# Overwrite columns 'A' and 'B' in place with the Series make_counts returns.
for col in ('A', 'B'):
    df[col] = make_counts(col)
Source Link
Reinderien
  • 71.1k
  • 5
  • 76
  • 256

Avoid for-loops and avoid unique() in this case. Fundamentally you're doing a grouped count, so use Pandas' built-in grouping support, which is vectorised. Since your numerator filters on label, after grouping you need to join (merge) and then fillna the values that had no label=1.

Don't construct a dictionary manually. Once you have a replacement frame with a proper index based on the original values, you can just call to_dict() on it.

import pandas as pd

# Sample data: two string-valued columns ('A', 'B') and a 0/1 'label' column.
df = pd.DataFrame({
    'A':     ('a1', 'a2', 'a1', 'a1', 'a2'),
    'B':     ('b1', 'b3', 'b2', 'b1', 'b3'),
    'label': (   0,    0,    1,    1,    0),
})


def make_counts(col: str) -> pd.Series:
    """Replace each value of ``col`` with its fraction of ``label == 1`` rows.

    Reads the module-level ``df``. Rows are counted per (``col``, ``label``)
    pair; the count for label 1 is divided by the total count for the same
    ``col`` value, and the resulting value -> fraction table is applied to
    ``df[col]``.

    Args:
        col: Name of the column of ``df`` to group by and to rewrite.

    Returns:
        ``df[col]`` with every value replaced by its fraction.
    """
    pair_counts = df.groupby([col, 'label'])[col].count()

    # Denominator: all rows per value of `col`, regardless of label.
    totals = pair_counts.groupby(level=col).sum().rename('count_')
    # Numerator: only the label == 1 slice of the pair counts.
    hits = pair_counts.loc[:, 1].groupby(level=col).sum().rename('positive')

    # Right join keeps every value of `col`; values that never had label 1
    # come out as NaN in 'positive' and are zero-filled below.
    joined = pd.merge(
        hits, totals,
        how='right', left_index=True, right_index=True,
    )
    ratios = joined['positive'].fillna(0) / joined['count_']
    return df[col].replace(ratios.to_dict())


# Overwrite columns 'A' and 'B' in place with the Series make_counts returns.
for col in ('A', 'B'):
    df[col] = make_counts(col)

print(df)
# Output recorded for the sample frame, kept as an unused string literal.
'''
          A    B  label
0  0.666667  0.5      0
1  0.000000  0.0      0
2  0.666667  1.0      1
3  0.666667  0.5      1
4  0.000000  0.0      0
'''