Issue
I am trying to create a new column, say test, with several conditions based on 3 columns. I am tryiing to achieve this with tidyverse only. Here are my conditions:
- if I have
Yesin one column, regardless of other categories (No/Unknown/NA) in the same id across columns:previous_cabg, previous_pci, previous_amithen assignYesin test variable - if I have
Noin all columns for the same id then assign NO for the test variable - if I have
NOfor one column andNA/Unknownin the other columns for the same id then assign withNoin the test variable - if I have
Yesin all column for the same id then assignYesin the test variable - if I have
Yes in one column andNA/Unknownfor the same id in each column then assignYes`in test variable
This is the type of dataset I have:
structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L,
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L,
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L,
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L,
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L,
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L,
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L,
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L,
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L,
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L,
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L,
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L,
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L,
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes",
"No", "Unknown"), class = "factor")), row.names = c(NA, -60L), problems = structure(list(
row = c(34136L, 121773L, 121779L), col = c("1.01 Hospital identifier",
"1.01 Hospital identifier", "1.01 Hospital identifier"),
expected = c("value in level set", "value in level set",
"value in level set"), actual = c("CMH", "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'",
"'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))
And this is how it looks, but only first 10 rows, if you look in detail, I have different groups of matches across the 3 columns
# A tibble: 60 x 4
id previous_cabg previous_pci previous_ami
<int> <fct> <fct> <fct>
1 112139 No No No
2 43919 No No No
3 92430 No Yes Yes
4 87137 NA NA No
5 95417 No No No
6 66955 NA NA NA
7 16293 NA NA No
8 61396 No Yes Yes
9 25379 No Yes No
10 79229 No No No
I am hoping to solve this only with tidyverse or a mix of tidyverse and r base.
This is what I have tried, yet I feel it is not so wise. I believe it is not wise, since this code will be part of automation process and if I will get other categories, than Yes and No, like Unknown as thisn appeared later in the next dataset extracts, then I wish the code will avoid all the other cases from the conditions I have given above.
dplyr::mutate(first_attack =
dplyr::case_when(previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes'
# deal with the unknown category
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes'| previous_ami == 'No' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'No' ~ 'No',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Yes' ~'Yes'
))
Solution
These operations are rowwise(), so they're not very efficient, but this solution in the tidyverse should cleanly achieve what you want.
Let us call your sample dataset by the name dataset. Then the following workflow
library(tidyverse)
# ...
# Code to generate your 'dataset'.
# ...
# Define custom logic across a single row.
get_first_attack <- function(values_across_row) {
# "Yes" overrides all other values.
if(isTRUE(any(values_across_row == "Yes"))){
return("Yes")
}
# "No" overrides all missing values: 'NA' and "Unknown".
else if(isTRUE(any(values_across_row == "No"))) {
return("No")
}
# "Unknown" overrides all other missing values: 'NA'.
else if(isTRUE(any(values_across_row == "Unknown"))) {
return("Unknown")
}
# All values are missing: 'NA'.
else {
return(as.character(NA))
}
}
dataset %>%
# Examine row by row.
dplyr::rowwise() %>%
# Compare values across each row according to the logic in 'get_first_attack()'.
dplyr::mutate(first_attack = get_first_attack(across(previous_cabg:previous_ami))) %>%
# Exit row-wise approach, to restore efficiency.
dplyr::ungroup() %>%
# Factor 'first_attack' exactly like its neighboring column.
dplyr::mutate(first_attack = factor(first_attack, levels = levels(previous_ami)))
should give you these results
# A tibble: 60 x 5
id previous_cabg previous_pci previous_ami first_attack
<int> <fct> <fct> <fct> <fct>
1 112139 No No No No
2 43919 No No No No
3 92430 No Yes Yes Yes
4 87137 NA NA No No
5 95417 No No No No
6 66955 NA NA NA NA
7 16293 NA NA No No
8 61396 No Yes Yes Yes
9 25379 No Yes No Yes
10 79229 No No No No
# ... with 50 more rows
where the first_attack column is fittingly defined as a factor with three levels: "Yes", "No", and "Unknown".
Answered By - Greg Answer Checked By - Mildred Charles (PHPFixing Admin)
0 Comments:
Post a Comment
Note: Only a member of this blog may post a comment.