Return to Question

Add full code for repro as well as expected result

Source Link

edited Jul 3, 2023 at 17:39

import pandas as pd

df = pd.DataFrame({
    "alcohol_by_volume": [],
    "barcode": [],
    "calcium_per_hundred": [],
    "calcium_unit": [],
    "carbohydrates_per_hundred": [],
    "carbohydrates_per_portion": [],
    "carbohydrates_unit": [],
    "cholesterol_per_hundred": [],
    "cholesterol_unit": [],
    "copper_cu_per_hundred": [],
    "copper_cu_unit": [],
    "country": [],
    "created_at": [],
    "energy_kcal_per_hundred": [],
    "energy_kcal_per_portion": [],
    "energy_kcal_unit": [],
    "energy_per_hundred": [],
    "energy_per_portion": [],
    "energy_unit": [],
    "fat_per_hundred": [],
    "fat_per_portion": [],
    "fat_unit": [],
    "fatty_acids_total_saturated_per_hundred": [],
    "fatty_acids_total_saturated_unit": [],
    "fatty_acids_total_trans_per_hundred": [],
    "fatty_acids_total_trans_unit": [],
    "fiber_insoluble_per_hundred": [],
    "fiber_insoluble_unit": [],
    "fiber_per_hundred": [],
    "fiber_per_portion": [],
    "fiber_soluble_per_hundred": [],
    "fiber_soluble_unit": [],
    "fiber_unit": [],
    "folate_total_per_hundred": [],
    "folate_total_unit": [],
    "folic_acid_per_hundred": [],
    "folic_acid_unit": [],
    "hundred_unit": [],
    "id": [],
    "ingredients_en": [],
    "iron_per_hundred": [],
    "iron_unit": [],
    "magnesium_per_hundred": [],
    "magnesium_unit": [],
    "manganese_mn_per_hundred": []
})

colnames_all = df.columns.to_list()
colnames_unit = [n for n in colnames_all if n.endswith("_unit")]
colnames_per_hundred = [n for n in colnames_all if n.endswith("_per_hundred")]
colnames_per_portion = [n for n in colnames_all if n.endswith("_per_portion")]
colnames_other = list(
    set(colnames_all) - set(colnames_unit + colnames_per_hundred + colnames_per_portion)
)

Expected result (2 examples, other 2 lists are similar to 1st one):

colnames_unit:

['calcium_unit',
 'carbohydrates_unit',
 'cholesterol_unit',
 'copper_cu_unit',
 'energy_kcal_unit',
 'energy_unit',
 'fat_unit',
 'fatty_acids_total_saturated_unit',
 'fatty_acids_total_trans_unit',
 'fiber_insoluble_unit',
 'fiber_soluble_unit',
 'fiber_unit',
 'folate_total_unit',
 'folic_acid_unit',
 'hundred_unit',
 'iron_unit',
 'magnesium_unit']

 colnames_other:

 ['ingredients_en',
 'country',
 'id',
 'created_at',
 'barcode',
 'alcohol_by_volume']

colnames_all = df.columns.to_list()
colnames_unit = [n for n in colnames_all if n.endswith("_unit")]
colnames_per_hundred = [n for n in colnames_all if n.endswith("_per_hundred")]
colnames_per_portion = [n for n in colnames_all if n.endswith("_per_portion")]
colnames_other = list(
    set(colnames_all) - set(colnames_unit + colnames_per_hundred + colnames_per_portion)
)

import pandas as pd

df = pd.DataFrame({
    "alcohol_by_volume": [],
    "barcode": [],
    "calcium_per_hundred": [],
    "calcium_unit": [],
    "carbohydrates_per_hundred": [],
    "carbohydrates_per_portion": [],
    "carbohydrates_unit": [],
    "cholesterol_per_hundred": [],
    "cholesterol_unit": [],
    "copper_cu_per_hundred": [],
    "copper_cu_unit": [],
    "country": [],
    "created_at": [],
    "energy_kcal_per_hundred": [],
    "energy_kcal_per_portion": [],
    "energy_kcal_unit": [],
    "energy_per_hundred": [],
    "energy_per_portion": [],
    "energy_unit": [],
    "fat_per_hundred": [],
    "fat_per_portion": [],
    "fat_unit": [],
    "fatty_acids_total_saturated_per_hundred": [],
    "fatty_acids_total_saturated_unit": [],
    "fatty_acids_total_trans_per_hundred": [],
    "fatty_acids_total_trans_unit": [],
    "fiber_insoluble_per_hundred": [],
    "fiber_insoluble_unit": [],
    "fiber_per_hundred": [],
    "fiber_per_portion": [],
    "fiber_soluble_per_hundred": [],
    "fiber_soluble_unit": [],
    "fiber_unit": [],
    "folate_total_per_hundred": [],
    "folate_total_unit": [],
    "folic_acid_per_hundred": [],
    "folic_acid_unit": [],
    "hundred_unit": [],
    "id": [],
    "ingredients_en": [],
    "iron_per_hundred": [],
    "iron_unit": [],
    "magnesium_per_hundred": [],
    "magnesium_unit": [],
    "manganese_mn_per_hundred": []
})

colnames_all = df.columns.to_list()
colnames_unit = [n for n in colnames_all if n.endswith("_unit")]
colnames_per_hundred = [n for n in colnames_all if n.endswith("_per_hundred")]
colnames_per_portion = [n for n in colnames_all if n.endswith("_per_portion")]
colnames_other = list(
    set(colnames_all) - set(colnames_unit + colnames_per_hundred + colnames_per_portion)
)

Expected result (2 examples, other 2 lists are similar to 1st one):

colnames_unit:

['calcium_unit',
 'carbohydrates_unit',
 'cholesterol_unit',
 'copper_cu_unit',
 'energy_kcal_unit',
 'energy_unit',
 'fat_unit',
 'fatty_acids_total_saturated_unit',
 'fatty_acids_total_trans_unit',
 'fiber_insoluble_unit',
 'fiber_soluble_unit',
 'fiber_unit',
 'folate_total_unit',
 'folic_acid_unit',
 'hundred_unit',
 'iron_unit',
 'magnesium_unit']

 colnames_other:

 ['ingredients_en',
 'country',
 'id',
 'created_at',
 'barcode',
 'alcohol_by_volume']

Source Link

asked Jul 3, 2023 at 8:06

evilmandarine

Split Pandas dataset column based on values (suffixes: string operation)

In Python using Pandas, I am splitting a dataset column into 4 lists based on the suffix of the values. For the 3 suffixes I am using a list comprehension then for the 4th one, a set operation that substracts the 3 lists from the original list with all values:

colnames_all = df.columns.to_list()
colnames_unit = [n for n in colnames_all if n.endswith("_unit")]
colnames_per_hundred = [n for n in colnames_all if n.endswith("_per_hundred")]
colnames_per_portion = [n for n in colnames_all if n.endswith("_per_portion")]
colnames_other = list(
    set(colnames_all) - set(colnames_unit + colnames_per_hundred + colnames_per_portion)
)

However this does not look like the best way to do this. Is there a "better" way, i.e. shorter and/or more elegant/idiomatic?