0

I am using a large dataset which contains patients with 5 different cancer types. Each patient can have one or more cancer diagnosis, identified in the n.cancers column. There are also columns for each cancer type with the date of diagnosis. For those with n.cancer > 1, I am trying to assign the value for the primary cancer column based on which cancer was diagnosed first. I have tried multiple ways but everytime I end up with those with multiple cancers being assigned as other or multiple instead of the primary cancer diagnosis.

Data

eligible_df <- structure(list(
      patid = 1:3,
      cancer.blood = c(1, 0, 1),
      cancer.lung = c(0, 1, 1),
      cancer.breast = c(0, 0, 0),
      date.cancer.blood = structure(c(14610, NA, 15579), class = "Date"),
      date.cancer.lung = structure(c(NA, 15059, 16618), class = "Date"),
      date.cancer.breast = structure(c(NA, NA, NA), class = "Date")
    ), row.names = c(NA, -3L), class = "data.frame")

eligible_df$n.cancer <- rowSums(eligible_df[, c("cancer.blood","cancer.lung","cancer.breast")])
# eligible_df$primary_can <- NA

Desired Output

Patient 1 → "blood", Patient 2 → "lung", Patient 3 → "blood"

Attempt 1

# -------------------------------
# Step 1: Setup
# -------------------------------
# Cancer types in the dataset
cancer_types <- c("blood","breast","colorectal","lung","prostate","melanoma","others")

# Initialize column
eligible_df$primary_can <- "other"

# -------------------------------
# Step 2: Patients with 1 cancer
# -------------------------------
for (i in 1:nrow(eligible_df)) {
  if (eligible_df$n.cancer[i] == 1) {
    for (cancer in cancer_types) {
      if (eligible_df[[paste0("cancer.", cancer)]][i] == 1) {
        eligible_df$primary_can[i] <- cancer
        break
      }
    }
  }
}

# -------------------------------
# Step 3: Patients with >1 cancer
# -------------------------------
for (i in 1:nrow(eligible_df)) {
  if (eligible_df$n.cancer[i] > 1) {
    # Collect all available diagnosis dates for this patient
    cancer_dates <- c(
      breast     = eligible_df$date.cancer.breast[i],
      blood      = eligible_df$date.cancer.blood[i],
      colorectal = eligible_df$date.cancer.colorectal[i],
      lung       = eligible_df$date.cancer.lung[i],
      prostate   = eligible_df$date.cancer.prostate[i],
      melanoma   = eligible_df$date.cancer.melanoma[i],
      others     = eligible_df$date.cancer.others[i]
    )
    
    # Drop NAs
    cancer_dates <- cancer_dates[!is.na(cancer_dates)]
    
    # If there’s at least one valid date, assign the earliest cancer
    if (length(cancer_dates) > 0) {
      earliest_cancer <- names(which.min(cancer_dates))
      eligible_df$primary_can[i] <- earliest_cancer
    }
  }
}

# -------------------------------
# Step 4: Add multiple cancer flag
# -------------------------------
eligible_df$multiple_cancer <- as.integer(eligible_df$n.cancer > 1)

# -------------------------------
# Step 5: Sanity check
# -------------------------------
cat("\nDistribution of primary cancers:\n")
print(table(eligible_df$primary_can))

cat("\nPatients with multiple cancers:", sum(eligible_df$multiple_cancer), "\n")

Attempt 2

# For multiple cancers, find earliest date
multi_cancer_indices <- which(cancer_mapping$n.cancer > 1)

# Step 1 & 2: For each patient with multiple cancers, find the earliest diagnosis date
for(i in multi_cancer_indices) {
  patid <- cancer_mapping$patid[i]
  
  # Extract candidate cancer dates
  cancer_dates <- c(
    breast    = eligible_df$date.cancer.breast[eligible_df$patid == patid],
    blood     = eligible_df$date.cancer.blood[eligible_df$patid == patid],
    colorectal= eligible_df$date.cancer.colorectal[eligible_df$patid == patid],
    lung      = eligible_df$date.cancer.lung[eligible_df$patid == patid],
    prostate  = eligible_df$date.cancer.prostate[eligible_df$patid == patid],
    melanoma  = eligible_df$date.cancer.melanoma[eligible_df$patid == patid],
    others    = eligible_df$date.cancer.others[eligible_df$patid == patid]
  )
  
  # Remove NAs
  cancer_dates <- cancer_dates[!is.na(cancer_dates)]
  
  if(length(cancer_dates) > 0) {
    # Step 3: Find earliest cancer
    earliest_cancer <- names(which.min(cancer_dates))
    
    # Assign the primary cancer type
    cancer_mapping$primary_can[i] <- earliest_cancer
  }
}

# Step 4: Verify the distributions
table(cancer_mapping$primary_can)

Attempt 3

# For multiple cancers, find earliest date
# Step 1: Check which records have n.cancer > 1
multi_cancer_indices <- which(cancer_mapping$n.cancer > 1)

if(length(multi_cancer_indices) > 0) {
  for(i in multi_cancer_indices) {
    patid <- cancer_mapping$patid[i]
    patient_data <- eligible_df[eligible_df$patid == patid, ][1, ]
    
    # Step 2: For each n.cancer > 1 check the date values of date.cancer.breast, date.cancer.blood, date.cancer.prostate,
    # date.cancer.colorectal, date.cancer.lung, date.cancer.others, date.cancer.melanoma and keep the
    # column with the earliest date
    date_columns <- c("date.cancer.breast", "date.cancer.blood", "date.cancer.prostate", 
                      "date.cancer.colorectal", "date.cancer.lung", "date.cancer.others", 
                      "date.cancer.melanoma")
    
    earliest_date <- as.Date("9999-12-31")
    earliest_column <- NA
    
    for(date_col in date_columns) {
      if(date_col %in% names(patient_data) && !is.na(patient_data[[date_col]])) {
        cancer_date <- as.Date(patient_data[[date_col]])
        if(cancer_date < earliest_date) {
          earliest_date <- cancer_date
          earliest_column <- date_col
        }
      }
    }
    
    # Step 3: use the column with the earliest date to assign a value to primary_can column
    # e.g. date.cancer.breast then primary_can = breast
    if(!is.na(earliest_column)) {
      primary_cancer <- strsplit(earliest_column, "\\.")[[1]][3]
      cancer_mapping$primary_can[i] <- primary_cancer
    } else {
      cancer_mapping$primary_can[i] <- "multiple"
    }
  }
}

# Create lookup tables
cancer_lookup <- setNames(cancer_mapping$primary_can, cancer_mapping$patid)

# Assign to eligible_df
eligible_df$primary_can <- cancer_lookup[as.character(eligible_df$patid)]
eligible_df$primary_can[is.na(eligible_df$primary_can)] <- "other"

# Create multiple cancer indicator
eligible_df$multiple_cancer <- as.integer(eligible_df$n.cancer > 1)

# Step 4: Verify the distributions
cancer_table <- table(eligible_df$primary_can)
cat("\nDistribution of cancer types:\n")
print(cancer_table)

Regardless of the method nothing works. I have a feeling the issue may be with the cancer mapping but even when I tried without it I still couldnt get it to work correctly

3
  • 3
    Hello and welcome. To make this a eligible question, please provide the output of dput(head((eligible_df)) as text. You can edit your question to add the relevant information. Recommended read: stackoverflow.com/questions/5963269/… R is vectorised language. You do not need any of (this) for-loops, I recommend the R Inferno as first reference. Commented Sep 6 at 12:58
  • Is your code AI generated? Commented Sep 6 at 13:31
  • @Brentspine AI was used for formatting purposes and comments Commented Sep 6 at 13:49

2 Answers 2

2

Your life is being made difficult by your data format: you have information (diagnosis dates) that you want to summarise by both row and column. R will easily let you do one or the other, but not - as you have discovered - both at the same time. Put another way, your data frame is not tidy - because your column names contain information (the cancer type) that you need for your analysis.

So, start by removing the major difficulty: I'll put all the dates in a single column, indexed by a new column that defines the cancer type. I can ignore the cancer flags as they can be derived from the fact that the diagnosis date is not NA.

library(tidyverse)

eligible_df %>% 
  mutate(across(starts_with("date"), \(x) ifelse(is.na(x), NA_Date_, x))) %>% 
  pivot_longer(
    starts_with("date"),
    names_to = "Cancer.Type",
    values_to = "Diagnosis.Date"
  ) %>% 
  filter(!is.na(Diagnosis.Date))%>% 
  mutate(
    Diagnosis.Date = as.Date(Diagnosis.Date),
    Cancer.Type = str_extract(Cancer.Type, "[:alpha:]+$")
  )
# A tibble: 4 × 6
  patid cancer.blood cancer.lung cancer.breast Cancer.Type Diagnosis.Date
  <int>        <dbl>       <dbl>         <dbl> <chr>       <date>        
1     1            1           0             0 blood       2010-01-01    
2     2            0           1             0 lung        2011-03-26    
3     3            1           1             0 blood       2012-08-27    
4     3            1           1             0 lung        2015-07-02  

The first mutate fixes a problem with casting in the pivot_longer, caused by the presence of "generic" NAs. The second also fixes a casting issue and extracts the cancer type from the column name.

Now, getting the earliest diagnosis date is trivial. Just add

%>% 
  group_by(patid) %>% 
  slice_min(Diagnosis.Date)

to the pipe above, giving

# A tibble: 3 × 6
# Groups:   patid [3]
  patid cancer.blood cancer.lung cancer.breast Cancer.Type Diagnosis.Date
  <int>        <dbl>       <dbl>         <dbl> <chr>       <date>        
1     1            1           0             0 blood       2010-01-01    
2     2            0           1             0 lung        2011-03-26    
3     3            1           1             0 blood       2012-08-27 
Sign up to request clarification or add additional context in comments.

3 Comments

dplyr comes with if_else().
further, base::ifelse is specifically known to be class-unsafe, c.f., ifelse(TRUE,Sys.Date(),NA_Date_).
While the ifelse code does what is needed (and if_else fails), I think it is perhaps "by accident" and not the best way to fix the problem, where pivot_longer is failing. Instead using across(starts_with("date"), \(x) if (all(is.na(x))) NA_Date_ else x) since that is addressing that which you are trying to fix here. (If you're curious, the underlying problem that pivot_longer() finds is unearthed by comparing class(unclass(eligible_df$date.cancer.blood)) ("numeric") with class(unclass(eligible_df$date.cancer.breast)) ("logical"), perhaps a bug in R?)
0

I am trying to assign the value for the primary cancer column based on which cancer was diagnosed first.

1) Staying in wide format

eligible_df$nd = eligible_df |> 
  subset(select= names(eligible_df) |> startsWith('cancer')) |> rowSums()

eligible_df |>
  within({
    n = names(eligible_df) |> startsWith('date')
    m = eligible_df[n] |> sapply(as.numeric) |> Rfast::rowMins()
    date = eligible_df[n][cbind(seq(nrow(eligible_df)), m)] 
    first_type =  sub('date.cancer.', '', names(eligible_df)[n])[m]
    rm(n, m)
  })
  patid cancer.blood cancer.lung cancer.breast date.cancer.blood date.cancer.lung date.cancer.breast nd first_type       date
1     1            1           0             0        2010-01-01             <NA>               <NA>  1      blood 2010-01-01
2     2            0           1             0              <NA>       2011-03-26               <NA>  1       lung 2011-03-26
3     3            1           1             0        2012-08-27       2015-07-02               <NA>  2      blood 2012-08-27

2) Going long

with reshape() which is hard to learn but powerful.

eligible_df |>
  reshape(varying = list(
    type = names(eligible_df) |> startsWith('cancer'), 
    date = names(eligible_df) |> startsWith('date')
    ),
    v.names = c('diagnosis', 'date'),
    timevar = 'type',
    times = c('blood', 'lung', 'breast'),
    direction = 'l') |> subset(select=-id) |> sort_by(~patid)
         patid nd   type diagnosis       date
1.blood      1  1  blood         1 2010-01-01
1.lung       1  1   lung         0       <NA>
1.breast     1  1 breast         0       <NA>
2.blood      2  1  blood         0       <NA>
2.lung       2  1   lung         1 2011-03-26
2.breast     2  1 breast         0       <NA>
3.blood      3  2  blood         1 2012-08-27
3.lung       3  2   lung         1 2015-07-02
3.breast     3  2 breast         0       <NA>

After reshape call, it might be enough to do

eligible_df |>
 reshape(..) |>
 subset(diagnosis==1, select=-c(diagnosis, id)) |>
 sort_by(~patid+date) |> # cosmetics
 `rownames<-`(NULL)      # cosmetics 
  patid nd  type        date
1     1  1 blood  2010-01-01
2     2  1  lung  2011-03-26
3     3  2 blood  2012-08-27
4     3  2  lung  2015-07-02

Comments

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.