Issue
I am trying to create a new column, first_attack, with several conditions based on 3 columns. I am trying to achieve this with tidyverse only. Here are my conditions:
- if I have
Yes
in one column, regardless of other categories (No/Unknown/NA
) in the same id across columns:previous_cabg, previous_pci, previous_ami
then assignYes
in test variable - if I have
No
in all columns for the same id then assign NO for the test variable - if I have
NO
for one column andNA/Unknown
in the other columns for the same id then assign withNo
in the test variable - if I have
Yes
in all column for the same id then assignYes
in the test variable - if I have
Yes in one column and
NA/Unknownfor the same id in each column then assign
Yes`in test variable
This is the type of dataset I have:
structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L,
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L,
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L,
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L,
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L,
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L,
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L,
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L,
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L,
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L,
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L,
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L,
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L,
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes",
"No", "Unknown"), class = "factor")), row.names = c(NA, -60L), problems = structure(list(
row = c(34136L, 121773L, 121779L), col = c("1.01 Hospital identifier",
"1.01 Hospital identifier", "1.01 Hospital identifier"),
expected = c("value in level set", "value in level set",
"value in level set"), actual = c("CMH", "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'",
"'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))
I thought the best is to use case_when. Thus, I oddly I proceeded with it. This is the code.
test_first_attack <- sample_n %>%
dplyr::mutate(first_attack =
dplyr::case_when(
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
# deal with the unknown category
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes'| previous_ami == 'No' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'No' ~ 'No',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Yes' ~'Yes'
))
This is the output.
test_first_attack <- structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L,
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L,
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L,
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L,
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L,
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L,
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L,
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L,
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L,
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L,
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L,
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L,
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L,
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes",
"No", "Unknown"), class = "factor"), first_attack = c("Yes",
"Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes",
"Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes",
NA, "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes", NA, "Yes",
"Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes",
"Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes",
"Yes", "Yes", "Yes", "Yes", NA, "Yes", "Yes", "Yes", "Yes", "Yes"
)), row.names = c(NA, -60L), problems = structure(list(row = c(34136L,
121773L, 121779L), col = c("1.01 Hospital identifier", "1.01 Hospital identifier",
"1.01 Hospital identifier"), expected = c("value in level set",
"value in level set", "value in level set"), actual = c("CMH",
"CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'",
"'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))
As you can see, I get 'Yes' as output for the columns with id rows with NO across all the columns. Let alone taking into consideration more complex conditions like 'Unknown\NA\Yes' or other conditions.
Why I cannot get my desired output? Is there another way to achieve it with tidyverse only?
Solution
I think all you need is:
case_when(
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'No' ~ 'No',
TRUE ~ "Unknown/NA"
)
The first line will result in a "Yes" for every row where at least one of the three test columns is "Yes". The rows that satisfy the first line of the case_when
will not be tested for any subsequent conditions.
Of the remaining rows, the second line will result in a "No" where any of the three test columns is "No". The rows that meet the second line of the case_when
will not be tested for any subsequent conditions.
The last line assigns "Unknown/NA" to all remaining rows, which should be only those rows that don't have any "Yes" or "No" values in the three test columns.
For completeness, here's a full reproducible example using your sample data:
library(tidyverse)
test_first_attack = structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L,
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L,
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L,
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L,
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L,
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L,
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L,
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L,
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L,
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L,
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L,
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L,
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L,
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes",
"No", "Unknown"), class = "factor")), row.names = c(NA, -60L), problems = structure(list(
row = c(34136L, 121773L, 121779L), col = c("1.01 Hospital identifier",
"1.01 Hospital identifier", "1.01 Hospital identifier"),
expected = c("value in level set", "value in level set",
"value in level set"), actual = c("CMH", "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'",
"'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))
test_first_attack = test_first_attack %>%
mutate(first_attack = case_when(
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'No' ~ 'No',
TRUE ~ "Unknown/NA"
))
test_first_attack
#> # A tibble: 60 x 5
#> id previous_cabg previous_pci previous_ami first_attack
#> <int> <fct> <fct> <fct> <chr>
#> 1 112139 No No No No
#> 2 43919 No No No No
#> 3 92430 No Yes Yes Yes
#> 4 87137 <NA> <NA> No No
#> 5 95417 No No No No
#> 6 66955 <NA> <NA> <NA> Unknown/NA
#> 7 16293 <NA> <NA> No No
#> 8 61396 No Yes Yes Yes
#> 9 25379 No Yes No Yes
#> 10 79229 No No No No
#> # … with 50 more rows
Answered By - eipi10 Answer Checked By - David Goodson (PHPFixing Volunteer)
0 Comments:
Post a Comment
Note: Only a member of this blog may post a comment.