Issue
I am trying to create a new column, say test, with several conditions based on 3 columns. I am tryiing to achieve this with tidyverse only. Here are my conditions:
- if I have
Yes
in one column, regardless of other categories (No/Unknown/NA
) in the same id across columns:previous_cabg, previous_pci, previous_ami
then assignYes
in test variable - if I have
No
in all columns for the same id then assign NO for the test variable - if I have
NO
for one column andNA/Unknown
in the other columns for the same id then assign withNo
in the test variable - if I have
Yes
in all column for the same id then assignYes
in the test variable - if I have
Yes in one column and
NA/Unknownfor the same id in each column then assign
Yes`in test variable
This is the type of dataset I have:
structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L,
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L,
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L,
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L,
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L,
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L,
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L,
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L,
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L,
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L,
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L,
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L,
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L,
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes",
"No", "Unknown"), class = "factor")), row.names = c(NA, -60L), problems = structure(list(
row = c(34136L, 121773L, 121779L), col = c("1.01 Hospital identifier",
"1.01 Hospital identifier", "1.01 Hospital identifier"),
expected = c("value in level set", "value in level set",
"value in level set"), actual = c("CMH", "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'",
"'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))
And this is how it looks, but only first 10 rows, if you look in detail, I have different groups of matches across the 3 columns
# A tibble: 60 x 4
id previous_cabg previous_pci previous_ami
<int> <fct> <fct> <fct>
1 112139 No No No
2 43919 No No No
3 92430 No Yes Yes
4 87137 NA NA No
5 95417 No No No
6 66955 NA NA NA
7 16293 NA NA No
8 61396 No Yes Yes
9 25379 No Yes No
10 79229 No No No
I am hoping to solve this only with tidyverse or a mix of tidyverse and r base.
This is what I have tried, yet I feel it is not so wise. I believe it is not wise, since this code will be part of automation process and if I will get other categories, than Yes and No
, like Unknown
as thisn appeared later in the next dataset extracts, then I wish the code will avoid all the other cases from the conditions I have given above.
dplyr::mutate(first_attack =
dplyr::case_when(previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes'
# deal with the unknown category
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes'| previous_ami == 'No' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'No' ~ 'No',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Yes' ~'Yes'
))
Solution
These operations are rowwise()
, so they're not very efficient, but this solution in the tidyverse
should cleanly achieve what you want.
Let us call your sample dataset by the name dataset
. Then the following workflow
library(tidyverse)
# ...
# Code to generate your 'dataset'.
# ...
# Define custom logic across a single row.
get_first_attack <- function(values_across_row) {
# "Yes" overrides all other values.
if(isTRUE(any(values_across_row == "Yes"))){
return("Yes")
}
# "No" overrides all missing values: 'NA' and "Unknown".
else if(isTRUE(any(values_across_row == "No"))) {
return("No")
}
# "Unknown" overrides all other missing values: 'NA'.
else if(isTRUE(any(values_across_row == "Unknown"))) {
return("Unknown")
}
# All values are missing: 'NA'.
else {
return(as.character(NA))
}
}
dataset %>%
# Examine row by row.
dplyr::rowwise() %>%
# Compare values across each row according to the logic in 'get_first_attack()'.
dplyr::mutate(first_attack = get_first_attack(across(previous_cabg:previous_ami))) %>%
# Exit row-wise approach, to restore efficiency.
dplyr::ungroup() %>%
# Factor 'first_attack' exactly like its neighboring column.
dplyr::mutate(first_attack = factor(first_attack, levels = levels(previous_ami)))
should give you these results
# A tibble: 60 x 5
id previous_cabg previous_pci previous_ami first_attack
<int> <fct> <fct> <fct> <fct>
1 112139 No No No No
2 43919 No No No No
3 92430 No Yes Yes Yes
4 87137 NA NA No No
5 95417 No No No No
6 66955 NA NA NA NA
7 16293 NA NA No No
8 61396 No Yes Yes Yes
9 25379 No Yes No Yes
10 79229 No No No No
# ... with 50 more rows
where the first_attack
column is fittingly defined as a factor
with three levels: "Yes"
, "No"
, and "Unknown"
.
Answered By - Greg Answer Checked By - Mildred Charles (PHPFixing Admin)
0 Comments:
Post a Comment
Note: Only a member of this blog may post a comment.