0

I am aiming to selectively combine several dataframes into a single one in separate steps based on matches in two columns.

Primary Dataset

structure(list(OB_END_TIME = c("2006-01-01 01:00", "2006-01-01 00:00", 
"2006-01-01 00:00", "2006-01-01 01:00", "2006-01-01 01:00", "2006-01-01 18:00", 
"2006-01-01 01:00", "2006-01-01 00:00", "2006-01-01 00:00", "2006-01-01 00:00"
), Weather.Category = c("Wind", "Flood", "Flood", "Wind", "Wind", 
"Wind", "Wind", "Wind", "Adhesion", "Subsidence"), SRC_ID = c(52L, 
67L, 103L, 2719L, 105L, 32L, 113L, 6704L, 117L, 
9569L), distance = c(4337.916253, 2825.720696, 2825.720696, 2397.887245, 
4513.448694, 3787.205117, 3774.967612, 4925.782405, 1178.442845, 
1748.086262), PRCP_AMT = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA), MEAN_WIND_DIR = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), MEAN_WIND_SPEED = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), MAX_GUST_DIR = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
    MAX_GUST_SPEED = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
    )), row.names = c(1L, 2L, 3L, 4L, 5L, 10L, 11L, 12L, 13L, 
14L), class = "data.frame")

Supporting Dataset 1

structure(list(OB_END_TIME = c("2006-01-01 00:00", "2006-01-01 00:00", 
"2006-01-01 00:00", "2006-01-01 00:00", "2006-01-01 00:00", "2006-01-01 00:00", 
"2006-01-01 00:00", "2006-01-01 00:00", "2006-01-01 00:00", "2006-01-01 00:00"
), SRC_ID = c(32L, 44L, 52L, 67L, 103L, 79L, 105L, 
117L, 113L, 30L), PRCP_AMT = c(0, 0, 0, 0.6, 0, 0, 0.4, 0.2, 
0.2, 0.2), HIGH_PRCN_LAT = c(1, 2, 44, 65, 
1, 1, 23, 43, 54.1346, 54.04502)), row.names = c(NA, 10L), class = "data.frame")

Supporting Dataset 2

structure(list(OB_END_TIME = c("2006-01-01 00:00", "2006-01-01 00:00", 
"2006-01-01 00:00", "2006-01-01 00:00", "2006-01-01 00:00", "2006-01-01 00:00", 
"2006-01-01 00:00", "2006-01-01 00:00", "2006-01-01 00:00", "2006-01-01 00:00"
),  SRC_ID = c(105L, 44L, 52L, 30L, 67L, 79L, 103L, 32L, 114L, 
    117L), MEAN_WIND_DIR = c(250L, 0L, 0L, 10L, 290L, 290L, 30L, 
    0L, 310L, 50L), MEAN_WIND_SPEED = c(3L, 0L, 0L, 10L, 5L, 
    8L, 17L, 0L, 1L, 24L), MAX_GUST_DIR = c(270L, 0L, 0L, 30L, 
    290L, 290L, 30L, 0L, 320L, 40L), MAX_GUST_SPEED = c(5L, 0L, 
    0L, 18L, 11L, 11L, 23L, 0L, 2L, 29L), HIGH_PRCN_LAT = c(1, 2, 44, 65, 
1, 1, 23, 43, 54.1346, 54.04502)), row.names = c(6L, 7L, 8L, 
10L, 11L, 12L, 16L, 17L, 18L, 19L), class = "data.frame")

r Metadata

platform        x86_64-w64-mingw32
version.string  R version 3.5.3 (2019-03-11)

Prepatory Code

dfX$OB_END_TIME <- format(as.POSIXct(strptime(rain$OB_END_TIME, "%Y-%m-%d %H:%M")), "%Y-%m-%d %H:%M")

All of the dataframes have already had the above code run on their datatime column in order to standardise it for the comparison/matching.

Matching Code

primaryDataFrame <- merge(x = primaryDataFrame, y = supportingDataFrame1, by = c("SRC_ID", "OB_END_TIME"))

Expected Result (Manually generated)

structure(list(OB_END_TIME = c("2006-01-01 01:00", "2006-01-01 00:00", 
"2006-01-01 00:00", "2006-01-01 01:00", "2006-01-01 00:00", "2006-01-01 18:00", 
"2006-01-01 00:00", "2006-01-01 00:00", "2006-01-01 00:00", "2006-01-01 00:00"
), Weather.Category = c("Wind", "Flood", "Flood", "Wind", "Wind", 
"Wind", "Wind", "Wind", "Adhesion", "Subsidence"), SRC_ID = c(67L, 
52L, 103L, 2719L, 105L, 32L, 113L, 6704L, 117L, 9569L), distance = c(4337.916253, 2825.720696, 2825.720696, 2397.887245, 4513.448694, 3787.205117, 3774.967612, 4925.782405, 1178.442845, 1748.086262), PRCP_AMT = c(NA, 0, 0, NA, 0.4, NA, 0.2, NA, 0.2, NA), MEAN_WIND_DIR = c(NA, 0L, 30L, NA, 250L, NA, NA, NA, 50L, NA), MEAN_WIND_SPEED = c(NA, 0L, 17L, NA, 3L, NA, NA, NA, 24L, NA), MAX_GUST_DIR = c(NA, 0L, 30L, NA, 270L, NA, NA, NA, 40L, NA), 
    MAX_GUST_SPEED = c(NA, 0L, 23L, NA, 5L, NA, NA, NA, 29L, NA
    )), row.names = c(1L, 2L, 3L, 4L, 5L, 10L, 11L, 12L, 13L, 
14L), class = "data.frame")

Actual Result (Manually generated)

structure(list(OB_END_TIME = c("2006-01-01 00:00", "2006-01-01 00:00", 
"2006-01-01 00:00", "2006-01-01 00:00", "2006-01-01 00:00"), 
    Weather.Category = c("Flood", "Flood", "Wind", "Wind", "Adhesion"
    ), SRC_ID = c(52L, 103L, 105L, 113L, 117L), distance = c(2825.720696, 
    2825.720696, 4513.448694, 3774.967612, 1178.442845), PRCP_AMT.x = c(NA, NA, NA, NA, NA), PRCP_AMT.y = c(0, 
    0, 0.4, 0.2, 0.2), MEAN_WIND_DIR.x = c(NA, NA, NA, NA, NA), MEAN_WIND_DIR.y = c(0L, 30L, 250L, NA, 50L
    ), MEAN_WIND_SPEED.x = c(NA, NA, NA, NA, NA), MEAN_WIND_SPEED.y = c(0L, 17L, 3L, NA, 24L), MAX_GUST_DIR.x = c(NA, NA, NA, NA, NA, MAX_GUST_DIR.y = c(0L, 
    30L, 270L, NA, 40L), MAX_GUST_SPEED.x = c(NA, NA, NA, NA, NA), MAX_GUST_SPEED.y = c(0L, 23L, 5L, NA, 
    29L)), row.names = c(2L, 3L, 5L, 11L, 13L), class = "data.frame")

Problems with the results:

1) Columns that fail the match in the primary dataset are deleted.

2) The column that should be populated with the data is instead duplicated to PRCP_AMT.x and PRCP_AMT.y.

3) Unwanted columns are added to the primary dataset during the match (HIGH_PRCN_LAT).

1 Answer 1

1

for issue 1) you can just add all.x = T :

primaryDataFrame <- merge(x = primaryDataFrame, y = supportingDataFrame1, by = c("SRC_ID", "OB_END_TIME"), all.x = T)

For issue 2) then the merge is happening as to be expected. You have duplicate columns because they are called the same - you should give them different names and select / rename the one you wish to keep.

You can drop a column for 3) thus:

primaryDataFrame <- merge(x = primaryDataFrame, y = supportingDataFrame1, by = c("SRC_ID", "OB_END_TIME"), all.x = T) %>% 
    select(-HIGH_PRCN_LAT)
Sign up to request clarification or add additional context in comments.

1 Comment

Thank you for providing a solution to each of the three issues!

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.