Subset inside lapply for Pearson correlation - subset

I have tried iterate to explore pearson correlation between 2 variables with 7 more variables.
My df
df<-structure(list(d_peso1_v01 = structure(c(-8, -0.5, -13, -0.7,
-10.2, -9, -4.3, -1.6, 1.8, -11.3, -10.3, -4.6, 1.2, -2.8, -9.2
), format.spss = "F8.2", display_width = 13L), d_cintura1_v01 = structure(c(-6.5,
-3.5, -10, -2, -7, -3, -3, -4, -4.5, -9.5, -15.5, -3, 1, -4,
-12), format.spss = "F8.2", display_width = 16L), d_huglucagon_v01 = structure(c(-106.06,
NA, -75.38, 27.5, -325.38, -26.12, -104.26, 28.66, NA, -11.12,
-60.05, -76.38, -36.21, NA, -270.02), format.spss = "F8.2", display_width = 18L),
d_huinsulin_v01 = structure(c(-26.29, NA, -143.44, -410.55,
84.51, -121.56, -52.36, -151.83, NA, -42, -43.69, -82.96,
-51.27, NA, -163.12), format.spss = "F8.2", display_width = 17L),
d_huvisfatin_v01 = structure(c(-541.93, NA, -750.38, -611.9,
0, 139.61, -343.58, -149.2, NA, -91.54, -212.47, -844.05,
-353.86, NA, -1749.96), format.spss = "F8.2", display_width = 18L),
d_hupai1_v01 = structure(c(-785.4, NA, 115.96, -867.31, -10.84,
-1634, -331.21, 396.05, NA, -424.5, -143.09, 429.39, 799.11,
NA, -633.44), format.spss = "F8.2", display_width = 13L),
d_hucpeptide_v01 = structure(c(-189.33, NA, -612.6, -1250.86,
110.03, -614.69, -119.31, -305.55, NA, -104.55, -38.74, -411.38,
-65.48, NA, -143.75), format.spss = "F8.2", display_width = 18L),
d_huleptin_v01 = structure(c(-3145.34, NA, -5038.03, -2069.79,
-357.79, -1004.4, -1253.38, 365.69, NA, -2102.93, -1454.6,
-3380.95, -760.69, NA, -6078.46), format.spss = "F8.2", display_width = 16L),
d_hughrelin_v01 = structure(c(-290.46, NA, -898.76, -726.4,
-217.49, 41.13, 93.89, 436.93, NA, 12.85, -221.54, -134,
-200.15, NA, 261.3), format.spss = "F8.2", display_width = 18L),
d_hba1c_v01 = structure(c(0.02, NA, -0.26, -0.17, -1.05,
-0.41, -0.47, -0.21, NA, -0.14, -0.14, -0.43, 0.61, NA, -1.33
), format.spss = "F8.2", display_width = 13L), grupo_int_v00 = structure(c(2L,
1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L), .Label = c("A",
"B"), label = "Grupo de intervención", class = "factor")), class = "data.frame", row.names = c(NA,
-15L))
I have performed all the steps to do it for the whole database, but I want to subset according to grupo_int_v00, which is a factor differentiating 2 treatments (A or B)
my_data <- dat[, c("d_peso1_v01", "d_cintura1_v01", "d_huglucagon_v01", "d_huinsulin_v01", "d_huvisfatin_v01", "d_hupai1_v01", "d_hucpeptide_v01", "d_huleptin_v01", "d_hughrelin_v01", "d_hba1c_v01", "grupo_int_v00")]
peso_pearson_01 <- lapply(my_data, function(x) {cor.test(x, my_data$d_peso1_v01, method = "pearson")})
cintura_pearson_01 <- lapply(my_data, function(x) {cor.test(x, my_data$d_cintura1_v01, method = "pearson")})
cintura_peso_01 <- do.call(c, list(peso_pearson_01, cintura_pearson_01))
max <- max(sapply(cintura_peso_01, length))
cintura_peso_01 <- do.call(rbind, lapply(cintura_peso_01, function(z) c(z, rep(NA, max-length(z)))))
How can I insert grupo_int_v00 in the syntax?
peso_pearson_01 <- lapply(my_data, function(x) {subset(dat$grupo_int_v00 == "A"), cor.test(x, my_data$d_peso1_v01, method = "pearson")})
Error: inesperado ',' in "lapply(my_data, function(x) {subset(dat$grupo_int_v00 == "A"),"
Thank you!

Related

How can I prevent Pivot_Wide from adding new rows in my data?

This is a dput()sample of my data:
structure(list(ID = c("101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101"), IDA = c("1000",
"1279", "1392", "534", "835", "910", "748", "589", "675", "500",
"1243", "635", "1181", "791", "755", "1069", "640", "1229", "1856",
"116", "767", "1126", "863", "1141", "1858", "899", "5", "225",
"175", "1764", "1017", "497", "771", "41", "816", "1046", "439",
"930", "1350", "641", "1057", "1021", "503", "553", "1738", "1379",
"774", "442", "1113", "1503"), DATE = structure(c(1497315600,
1552352400, 1552957200, 1390438800, 1439427600, 1479776400, 1455757200,
1402534800, 1409187600, 1383008400, 1536022800, 1414630800, 1545094800,
1551142800, 1461805200, 1483405200, 1420506000, 1534813200, 1493600400,
1348448400, 1458176400, 1521075600, 1464656400, 1527555600, 1504573200,
1478134800, 1278378000, 1320886800, 1309395600, 1598576400, 1500512400,
1385600400, 1436403600, 1284426000, 1430960400, 1485824400, 1381971600,
1477962000, 1510023600, 1420509600, 1508806800, 1499302800, 1386205200,
1379466000, 1555290000, 1565226000, 1435798800, 1494896400, 1516064400,
1593478800), tzone = "UTC", class = c("POSIXct", "POSIXt")),
NR = c("CH-1000", " CH-1279", "CH-1392",
"CH-0534", "CH-0835", " CH-0910", "CH-0748",
"CH-0589", "CH-0675", "CH-0500", "CH-1243",
"CH-0635", "CH-1181", "CH-0791", "CH-0755",
"CH-1069", "CH-0640", "CH-1229", "CH-1856",
"CH-0116", "CH-0767", "CH-1126", "CH-0863",
"CH-1141", "CH-1858", "CH-0899", "CH-0005",
"CH-0225", "CH-0175", "CH-1764", "CH-1017",
"CH-0497", "CH-0771", "CH-0041", "CH-0816",
"CH-1046", "CH-0439", "CH-0930", "CH-1350",
"CH-0641", "CH-1057", "CH-1021", "CH-0503",
"CH-0553", "CH-1738", "CH-1379", "CH-0774",
"CH-0442", "CH-1113", "CH-1503"), PAT = c("101-1000",
"101-1279", "101-1392", "101-534", "101-835", "101-910",
"101-748", "101-589", "101-675", "101-500", "101-1243", "101-635",
"101-1181", "101-791", "101-755", "101-1069", "101-640",
"101-1229", "101-1856", "101-116", "101-767", "101-1126",
"101-863", "101-1141", "101-1858", "101-899", "101-5", "101-225",
"101-175", "101-1764", "101-1017", "101-497", "101-771",
"101-41", "101-816", "101-1046", "101-439", "101-930", "101-1350",
"101-641", "101-1057", "101-1021", "101-503", "101-553",
"101-1738", "101-1379", "101-774", "101-442", "101-1113",
"101-1503"), INT1 = c(NA, NA, NA, 280035, 280040, NA,
280040, 280040, 285030, 245040, NA, 280035, NA, NA, 280040,
NA, 220035, NA, NA, 280040, 280040, NA, 245005, NA, NA, 275005,
240070, 220035, 280040, NA, NA, 280040, 280040, 240005, 280040,
NA, 280040, 240005, 245040, 240030, NA, NA, 260010, NA, NA,
NA, 280040, NA, NA, NA), INT2 = c(NA, NA, NA, NA, NA,
NA, NA, 280040, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 240030, NA, 260005, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), INT3 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), INT4 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), INTX1 = c(NA, 280005, 220035, NA, NA, NA,
NA, NA, NA, NA, 280050, NA, 240080, 280050, NA, 240085, NA,
280050, 270010, NA, NA, 280050, NA, 280005, NA, NA, NA, NA,
NA, 275045, 280050, NA, NA, NA, NA, 245005, NA, NA, 245040,
NA, NA, 280050, NA, NA, 220035, 280050, NA, 255005, 280050,
220005), INTX2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 240085, NA, NA, NA, NA, NA, NA, NA, NA, 280050,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), INTX3 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), INTX4 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), KAT = c(1,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame"))
What I needed to do first was
first, to recode some values from INT1:INT4 and INTX1:INTX4 and put them in new columns. To this aim, I used long_pivot as below:
longDATA <- DATA %>%
pivot_longer(cols = c('INT1':'INTX4'),
names_to = "INT", values_to = "Code")
Then I used the long list to mutate new variables as below:
longDATA1 <- longDATA %>% mutate(palm = case_when(Code == 210025 ~ 1))
longDATA2 <- longDATA1 %>% mutate(bio = case_when(Code == 210015 ~ '12.06.25',Code == 210020 ~ '12.07.25',Code == 275015 ~ '12.06.25',Code == 275020 ~ '12.07.25'))
longDATA3 <- longDATA2 %>% mutate(EPX = case_when(Code == 280005 ~ 1, Code == 280010 ~ 1))
Then I need to return it to wide format. I used the code below:
WideDATA <- longDATA3 %>% pivot_wider(names_from = INT, values_from = Code)
Below you can see the output but it does not show the problem as I have a huge dataset. I realized the number of rows has changed from 2480 (my initial data before doing longer-pivot)to 2633 (the Wide data).I realized that the added rows are created when in one of the EPX, bio or palm columns a value has been recoded. for example if there is a row where INT1 and INT2 are filled with values other than NA (e.g., 280010 and 280040), EPX is assigned value 1, based on the mutation that we did earlier. However, EPX=1 and INT1=280010 appear in one row and 280040 appears in another row. like the photo below.
[1]: https://i.stack.imgur.com/jjEea.png
I have spent a week to figure this out but no luck. I would highly appreciate your advice.
structure(list(ID = c("101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101"), IDA = c("1000",
"1279", "1392", "534", "835", "910", "748", "589", "675", "500",
"1243", "635", "1181", "791", "755", "1069", "640", "1229", "1856",
"116", "767", "1126", "863", "1141", "1858", "899", "5", "225",
"175", "1764", "1017", "497", "771", "41", "816", "1046", "439",
"930", "1350", "641", "1057", "1021", "503", "553", "1738", "1379",
"774", "442", "1113", "1503"), DATE = structure(c(1497315600,
1552352400, 1552957200, 1390438800, 1439427600, 1479776400, 1455757200,
1402534800, 1409187600, 1383008400, 1536022800, 1414630800, 1545094800,
1551142800, 1461805200, 1483405200, 1420506000, 1534813200, 1493600400,
1348448400, 1458176400, 1521075600, 1464656400, 1527555600, 1504573200,
1478134800, 1278378000, 1320886800, 1309395600, 1598576400, 1500512400,
1385600400, 1436403600, 1284426000, 1430960400, 1485824400, 1381971600,
1477962000, 1510023600, 1420509600, 1508806800, 1499302800, 1386205200,
1379466000, 1555290000, 1565226000, 1435798800, 1494896400, 1516064400,
1593478800), tzone = "UTC", class = c("POSIXct", "POSIXt")),
NR = c("CH-1000", " CH-1279", "CH-1392",
"CH-0534", "CH-0835", " CH-0910", "CH-0748",
"CH-0589", "CH-0675", "CH-0500", "CH-1243",
"CH-0635", "CH-1181", "CH-0791", "CH-0755",
"CH-1069", "CH-0640", "CH-1229", "CH-1856",
"CH-0116", "CH-0767", "CH-1126", "CH-0863",
"CH-1141", "CH-1858", "CH-0899", "CH-0005",
"CH-0225", "CH-0175", "CH-1764", "CH-1017",
"CH-0497", "CH-0771", "CH-0041", "CH-0816",
"CH-1046", "CH-0439", "CH-0930", "CH-1350",
"CH-0641", "CH-1057", "CH-1021", "CH-0503",
"CH-0553", "CH-1738", "CH-1379", "CH-0774",
"CH-0442", "CH-1113", "CH-1503"), PAT = c("101-1000",
"101-1279", "101-1392", "101-534", "101-835", "101-910",
"101-748", "101-589", "101-675", "101-500", "101-1243", "101-635",
"101-1181", "101-791", "101-755", "101-1069", "101-640",
"101-1229", "101-1856", "101-116", "101-767", "101-1126",
"101-863", "101-1141", "101-1858", "101-899", "101-5", "101-225",
"101-175", "101-1764", "101-1017", "101-497", "101-771",
"101-41", "101-816", "101-1046", "101-439", "101-930", "101-1350",
"101-641", "101-1057", "101-1021", "101-503", "101-553",
"101-1738", "101-1379", "101-774", "101-442", "101-1113",
"101-1503"), palm = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), bio= c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), EPx = c(NA,
NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), INT1 = c(NA, NA, NA, 280035, 280040, NA,
280040, 280040, 285030, 245040, NA, 280035, NA, NA, 280040,
NA, 220035, NA, NA, 280040, 280040, NA, 245005, NA, NA, 275005,
240070, 220035, 280040, NA, NA, 280040, 280040, 240005, 280040,
NA, 280040, 240005, 245040, 240030, NA, NA, 260010, NA, NA,
NA, 280040, NA, NA, NA), INT2 = c(NA, NA, NA, NA, NA,
NA, NA, 280040, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 240030, NA, 260005, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), INT3 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), INT4 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), INTX1 = c(NA, 280005, 220035, NA, NA, NA,
NA, NA, NA, NA, 280050, NA, 240080, 280050, NA, 240085, NA,
280050, 270010, NA, NA, 280050, NA, 280005, NA, NA, NA, NA,
NA, 275045, 280050, NA, NA, NA, NA, 245005, NA, NA, 245040,
NA, NA, 280050, NA, NA, 220035, 280050, NA, 255005, 280050,
220005), INTX2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 240085, NA, NA, NA, NA, NA, NA, NA, NA, 280050,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), INTX3 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), INTX4 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), KAT = c(1,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame"))
I copied pasted your code and both the DATA and WideDATA objects have the same number of rows (10), just with the desired variables added in WideDATA. Its possible there is something in your full data set that is creating the additional rows. I see some NAs in the sample you shared, and its worth noting that case_when doesn't treat those specially. If you need them handled as NA explicitly, you need to include is.na in a conditional statement. If sharing the full data set isn't possible, maybe randomly sampling would help.
e.g. dput(sample_n(DATA, 50))
I'm adding this as an answer though I realize it may need further editing as I couldn't usefully share the following in a comment.
In the code below I mostly kept your examples intact, but put everything into a single piped chunk of code. It might be simpler to stick with if_else in cases where there's only a single condition, but there's no harm using case_when throughout if you prefer the syntax.
You'll note I include TRUE ~ ... in each. Without a way to evaluate general cases, you get NA whenever the explicit statements aren't true.
Hopefully this lets you spot what is happening in your full data set, and if not please continue to update the question.
library(tidyverse)
DATA <- structure(list(ID = c("101", "101", "101", "101", "101", "101","101", "101", "101", "101"), IDA = c("1", "1", "2", "3", "4","5", "5", "1859", "1860", "1861"), DATE = structure(c(1300928400,1277946000, 1277946000, 1278550800, 1278550800, 1453770000, 1329958800,1506474000, 1485133200, 1485133200), tzone = "UTC", class = c("POSIXct","POSIXt")), NR = c("CH-0001", "CH-0001","CH-0002", "CH-0003", "CH-0004", "CH-0005","CH-0005", "CH-1859", "CH-1860", "CH-1861"), PAT = c("101-1", "101-1", "101-2", "101-3", "101-4", "101-5","101-5", "101-1859", "101-1860", "101-1861"), INT1 = c(245005,280040, 280040, 280040, 280040, 240040, 240040, NA, NA, NA),INT2 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), INT3 = c(NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_), INT4 = c(NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_), INTX1 = c(NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_), INTX2 = c(NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), INTX173 = c(NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), INTX4 = c(NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), KAT = c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1)), row.names = c(NA,-10L), class = c("tbl_df", "tbl", "data.frame"))
longDATA <- DATA %>%
pivot_longer(cols = c('INT1':'INTX4'),
names_to = "INT", values_to = "Code") %>%
mutate(
palm = case_when(
Code == 210025 ~ 1,
TRUE ~ 0),
bio = case_when(
Code == 210015 ~ '12.06.25',
Code == 210020 ~ '12.07.25',
Code == 275015 ~ '12.06.25',
Code == 275020 ~ '12.07.25',
TRUE ~ ''
),
EPX = case_when(
Code == 280005 ~ 1,
Code == 280010 ~ 1,
TRUE ~ 0
)
) %>%
pivot_wider(names_from = INT,
values_from = Code)
longDATA
#> # A tibble: 10 × 17
#> ID IDA DATE NR PAT KAT palm bio EPX INT1
#> <chr> <chr> <dttm> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 101 1 2011-03-24 01:00:00 CH-0001 101-1 0 0 "" 0 245005
#> 2 101 1 2010-07-01 01:00:00 CH-0001 101-1 0 0 "" 0 280040
#> 3 101 2 2010-07-01 01:00:00 CH-0002 101-2 0 0 "" 0 280040
#> 4 101 3 2010-07-08 01:00:00 CH-0003 101-3 0 0 "" 0 280040
#> 5 101 4 2010-07-08 01:00:00 CH-0004 101-4 0 0 "" 0 280040
#> 6 101 5 2016-01-26 01:00:00 CH-0005 101-5 0 0 "" 0 240040
#> 7 101 5 2012-02-23 01:00:00 CH-0005 101-5 0 0 "" 0 240040
#> 8 101 1859 2017-09-27 01:00:00 CH-1859 101-1… 1 0 "" 0 NA
#> 9 101 1860 2017-01-23 01:00:00 CH-1860 101-1… 1 0 "" 0 NA
#> 10 101 1861 2017-01-23 01:00:00 CH-1861 101-1… 1 0 "" 0 NA
#> # … with 7 more variables: INT2 <dbl>, INT3 <dbl>, INT4 <dbl>, INTX1 <dbl>,
#> # INTX2 <dbl>, INTX173 <dbl>, INTX4 <dbl>
Created on 2022-12-12 with reprex v2.0.2
Try this, check where I noted #fix
library(dplyr)
library(tidyr)
longDATA <- DATA %>%
pivot_longer(cols = c('INT1':'INTX4'),
names_to = "INT", values_to = "Code")
longDATA1 <- longDATA %>% mutate(palm = case_when(Code == 210025 ~ 1,
TRUE ~ NA_real_)) #fix
longDATA2 <- longDATA1 %>% mutate(bio = case_when(Code == 210015 ~ '12.06.25',
Code == 210020 ~ '12.07.25',
Code == 275015 ~ '12.06.25',
Code == 275020 ~ '12.07.25',
TRUE ~ NA_character_))#fix
longDATA3 <- longDATA2 %>% mutate(EPX = case_when(Code == 280005 ~ 1,
Code == 280010 ~ 1,
TRUE ~ NA_real_))#fix
WideDATA <- longDATA3 %>% pivot_wider(id_cols = ID:KAT, #fix
names_from = INT, values_from = Code)
#########
# Check #
#########
nrow(DATA)
#> [1] 50
nrow(WideDATA)
#> [1] 50

Difficulties with subsetting clustered family data

I am trying to subset the following dataset so that both sons and fathers with resting heart rate values are included in a new data frame. When I use the is.na function, since fathers and sons are on their own individual rows, fathers without a value for sons' resting heart rate are deleted and vice versa.
I have tried the following code but I am not having any luck:
for (i in 1:nrow(datadiss)){
if (datadiss$Fatheridorsonid == "sonid" & isFALSE(is.na(datadiss$sonrhr))){
data2<-datadiss[which(datadiss$sonrhr>0),]
}else if (datadiss$Fatheridorsonid == "Fatherid" & isFALSE(is.na(datadiss$dadrhr))){
data2<-datadiss[which(datadiss$dadrhr>0),]
}
}
An example of my dataset can be found here:
structure(list(fid = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6,
6, 7, 7, 8, 8, 9, 9), sonid = c(15576123, NA, 3100866, NA, NA,
13206733, NA, 3098080, 3938697, 7089160, NA, 10580951, 13465844,
NA, 10801693, NA, 10797768, NA, NA, 9670190), sonrhr = c(75,
NA, 89, NA, NA, 100, NA, 65, 62, 56, NA, 78, 85, NA, 67, NA,
66, NA, NA, 87), sonbmi = c(19.66, NA, 19.59, NA, NA, 21.54,
NA, 22.84, 22.88, 29.09, NA, 21.53, 22.03, NA, 23.41, NA, 22.02,
NA, NA, 22.44), sonphysenerg = c(329, NA, 234, NA, NA, NA, NA,
295, 344, 299, NA, 313, 351, NA, NA, NA, NA, NA, NA, NA), violent = c(0,
NA, 0, NA, NA, 0, NA, 0, 0, 1, NA, 1, 0, NA, 0, NA, 0, NA, NA,
0), nonviolent = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), Fatherid = c(NA, 15576243, NA, 3100889, 123456, NA,
3938697, NA, NA, NA, 10580442, NA, NA, 10804692, NA, 10797798,
NA, 12797798, 9670176, NA), dadrhr = c(NA, 76, NA, 87, 49, NA,
67, NA, NA, NA, 57, NA, NA, 78, NA, 58, NA, 100, 89, NA), sonorfather = structure(c(1L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L,
2L, 2L, 1L), .Label = c("Son", "Father"), class = "factor")), class = "data.frame", row.names = c(NA,
-20L), codepage = 65001L)

Subset Data based on User Login

I have a shiny application with a login interface. I need to subset data based on the user login. Within the data, I have a column which marks the data whether the data belongs to them. The column is called "memberstate" and essentially contains the login username. The code I am tyring to use is borrowed from R Studio and is as follows:
user <- reactive({
session$user
})
### Code to manage row level security
isManager <- reactive({
if (user() == "manager"){
return(TRUE)
} else{
return(FALSE)
}
})
# Based on the logged in user, pull out only the data this user
# should be able to see.
data <- read.csv("data/DHA&PPTabletsClean.csv")
myData <- reactive({
if (isManager()){
# If a manager, show everything.
return(data)
} else{
# If a Member State, only show their own data.
return(data[data$memberstate == user(),])
}
})
I then try and use MyData for plotting graphs. I am getting the following error message
"Error in Mydata: could not find function "Mydata" " I am a newbie to R Shiny. Kindly assist.
Part of the data is as follows:
dput(data)
structure(list(Brand = c("Malaril", "Malaril", "Malaril", "Malaril",
"Malaril", "Malaril", "Malaril", "Malaril", "Malaril", "Malaril",
"Malaril", "Malaril", "Malaril", "Malaril", "Malaril", "Malaril",
"Malaril", "Malaril", "Malaril", "Malaril", "Malaril", "Malaril",
"Malaril", "Malaril", "Malaril", "Malaril", "Malaril", "Malaril",
"Malaril", "Malaril", "Malaril", "Malaril", "Malaril", "Malaril"
), ActiveIngredient = c("Dihydroartemisinin", "Dihydroartemisinin",
"Dihydroartemisinin", "Dihydroartemisinin", "Dihydroartemisinin",
"Dihydroartemisinin", "Piperaquine Phosphate", "Piperaquine Phosphate",
"Piperaquine Phosphate", "Piperaquine Phosphate", "Piperaquine Phosphate",
"Piperaquine Phosphate", "Dihydroartemisinin", "Dihydroartemisinin",
"Dihydroartemisinin", "Dihydroartemisinin", "Dihydroartemisinin",
"Dihydroartemisinin", "Piperaquine Phosphate", "Piperaquine Phosphate",
"Piperaquine Phosphate", "Piperaquine Phosphate", "Piperaquine Phosphate",
"Piperaquine Phosphate", "Dihydroartemisinin", "Dihydroartemisinin",
"Dihydroartemisinin", "Dihydroartemisinin", "Dihydroartemisinin",
"Dihydroartemisinin", "Piperaquine Phosphate", "Piperaquine Phosphate",
"Piperaquine Phosphate", "Piperaquine Phosphate"), Assay = c(94.9,
94.9, 94.9, 94.9, 94.9, 94.9, 101.6, 101.6, 101.6, 101.6, 101.6,
101.6, 95, 95, 95, 95, 95, 95, 100.2, 100.2, 100.2, 100.2, 100.2,
100.2, 96.4, 96.4, 96.4, 96.4, 96.4, 96.4, 100.6, 100.6, 100.6,
100.6), Assayperc = c(0.949, 0.949, 0.949, 0.949, 0.949, 0.949,
1.016, 1.016, 1.016, 1.016, 1.016, 1.016, 0.95, 0.95, 0.95, 0.95,
0.95, 0.95, 1.002, 1.002, 1.002, 1.002, 1.002, 1.002, 0.965,
0.965, 0.965, 0.965, 0.965, 0.965, 1.006, 1.006, 1.006, 1.006
), AssayLL = c(90L, 90L, 90L, 90L, 90L, 90L, 93L, 93L, 93L, 93L,
93L, 93L, 90L, 90L, 90L, 90L, 90L, 90L, 93L, 93L, 93L, 93L, 93L,
93L, 90L, 90L, 90L, 90L, 90L, 90L, 93L, 93L, 93L, 93L), AssaypercLL = c(0.9,
0.9, 0.9, 0.9, 0.9, 0.9, 0.93, 0.93, 0.93, 0.93, 0.93, 0.93,
0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.93, 0.93, 0.93, 0.93, 0.93, 0.93,
0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.93, 0.93, 0.93, 0.93), AssayUL = c(110L,
110L, 110L, 110L, 110L, 110L, 107L, 107L, 107L, 107L, 107L, 107L,
110L, 110L, 110L, 110L, 110L, 110L, 107L, 107L, 107L, 107L, 107L,
107L, 110L, 110L, 110L, 110L, 110L, 110L, 107L, 107L, 107L, 107L
), AssaypercUL = c(1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.07, 1.07,
1.07, 1.07, 1.07, 1.07, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.07, 1.07,
1.07, 1.07, 1.07, 1.07, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.07, 1.07,
1.07, 1.07), DateManufacture = c("1/10/2017", "1/10/2017", "1/10/2017",
"1/10/2017", "1/10/2017", "1/10/2017", "1/10/2017", "1/10/2017",
"1/10/2017", "1/10/2017", "1/10/2017", "1/10/2017", "1/6/2018",
"1/6/2018", "1/6/2018", "1/6/2018", "1/6/2018", "1/6/2018", "1/6/2018",
"1/6/2018", "1/6/2018", "1/6/2018", "1/6/2018", "1/6/2018", "1/8/2018",
"1/8/2018", "1/8/2018", "1/8/2018", "1/8/2018", "1/8/2018", "1/8/2018",
"1/8/2018", "1/8/2018", "1/8/2018"), ExpiryDate = c("1/9/2019",
"1/9/2019", "1/9/2019", "1/9/2019", "1/9/2019", "1/9/2019", "1/9/2019",
"1/9/2019", "1/9/2019", "1/9/2019", "1/9/2019", "1/9/2019", "1/5/2020",
"1/5/2020", "1/5/2020", "1/5/2020", "1/5/2020", "1/5/2020", "1/5/2020",
"1/5/2020", "1/5/2020", "1/5/2020", "1/5/2020", "1/5/2020", "1/7/2020",
"1/7/2020", "1/7/2020", "1/7/2020", "1/7/2020", "1/7/2020", "1/7/2020",
"1/7/2020", "1/7/2020", "1/7/2020"), ShelfLifeYrs = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), ShelfLifeDysRecpt = c(-547L, -547L, -547L, -547L, -547L, -547L,
-547L, -547L, -547L, -547L, -547L, -547L, -304L, -304L, -304L,
-304L, -304L, -304L, -304L, -304L, -304L, -304L, -304L, -304L,
-243L, -243L, -243L, -243L, -243L, -243L, -243L, -243L, -243L,
-243L), DateReceiptSample = c("1/3/2021", "1/3/2021", "1/3/2021",
"1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021",
"1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021",
"1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021",
"1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021",
"1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021",
"1/3/2021"), COADateIssue = c("27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021"), TestingOutcome = c("Pass", "Pass", "Pass", "Pass",
"Pass", "Pass", "Pass", "Pass", "Pass", "Pass", "Pass", "Pass",
"Fail", "Fail", "Fail", "Fail", "Fail", "Fail", "Fail", "Fail",
"Fail", "Fail", "Fail", "Fail", "Pass", "Pass", "Pass", "Pass",
"Pass", "Pass", "Pass", "Pass", "Pass", "Pass"), FailureReason = c("",
"", "", "", "", "", "", "", "", "", "", "", "Dihydroartemisinin Dissolution",
"Dihydroartemisinin Dissolution", "Dihydroartemisinin Dissolution",
"Dihydroartemisinin Dissolution", "Dihydroartemisinin Dissolution",
"Dihydroartemisinin Dissolution", "Dihydroartemisinin Dissolution",
"Dihydroartemisinin Dissolution", "Dihydroartemisinin Dissolution",
"Dihydroartemisinin Dissolution", "Dihydroartemisinin Dissolution",
"Dihydroartemisinin Dissolution", "", "", "", "", "", "", "",
"", "", ""), Dissolution = c(77L, 81L, 84L, 86L, 82L, 81L, 100L,
96L, 98L, 101L, 97L, 102L, 62L, 59L, 62L, 66L, 65L, 61L, 99L,
95L, 97L, 103L, 99L, 102L, 97L, 80L, 81L, 86L, 80L, 80L, 103L,
101L, 101L, 101L), Dissolutionperc = c(0.77, 0.81, 0.84, 0.86,
0.82, 0.81, 1, 0.96, 0.98, 1.01, 0.97, 1.02, 0.62, 0.59, 0.62,
0.66, 0.65, 0.61, 0.99, 0.95, 0.97, 1.03, 0.99, 1.02, 0.97, 0.8,
0.81, 0.86, 0.8, 0.8, 1.03, 1.01, 1.01, 1.01), DissolLL = c(70L,
70L, 70L, 70L, 70L, 70L, 80L, 80L, 80L, 80L, 80L, 80L, 70L, 70L,
70L, 70L, 70L, 70L, 80L, 80L, 80L, 80L, 80L, 80L, 70L, 70L, 70L,
70L, 70L, 70L, 80L, 80L, 80L, 80L), DissolutionpercLL = c(0.7,
0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.7, 0.7,
0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.7, 0.7, 0.7,
0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8), Mass = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), pH = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), Dosageform = c("Tablet", "Tablet", "Tablet", "Tablet", "Tablet",
"Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet",
"Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet",
"Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet",
"Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet",
"Tablet"), Therapeuticclass = c("Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial"
), memberstate = c("", "", "", "", "", "ruvimbo", "ruvimbo",
"ruvimbo", "ruvimbo", "ruvimbo", "ruvimbo", "ruvimbo", "ruvimbo",
"ruvimbo", "ruvimbo", "ruvimbo", "ruvimbo", "ruvimbo", "ruvimbo",
"ruvimbo", "ruvimbo", "ruvimbo", "", "", "", "", "", "", "",
"", "", "", "", "")), class = "data.frame", row.names = c(NA,
-34L))
Regards
Chris

Divide certain elements in each sub list

b = [[2021, 55, -0.65, 7.61, 10.65, 41.37, 3.39, 12.25, -10.14, 7.61, 8.84],
[2022, 56, 3.0, -0.13, 8.84, 27.25, -0.48, 2.54, 12.43, 7.56, 3.37]]
I want to divide elements [2:10] of each sub list in b by 100. Result expected:
a = [2021, 55, -0.0065, 0.0761, 0.1065, 0.4137, 0.0339, 0.1225, -0.1014, 0.0761, 0.0884], etc
I've tried:
a = [item[2:10] /100 for item in b] Also tried: a = [item[2:10] / 100 for item in x] for x in b]
The first one gives "unsupported operand type for /: list and int". Second one gives "int object not subscriptable"
A minor error in your list comprehension, you were slicing in the wrong place. What you need to do is this:
a = [x[:2] + [item / 100 for item in x[2:]] for x in b]
print(a)
Output:
[[2021, 55, -0.006500000000000001, 0.0761, 0.1065, 0.41369999999999996, 0.0339, 0.1225, -0.1014, 0.0761, 0.08839999999999999], [2022, 56, 0.03, -0.0013, 0.08839999999999999, 0.2725, -0.0048, 0.0254, 0.1243, 0.0756, 0.0337]]
Without list comprehension
In [12]: b = [[2021, 55, -0.65, 7.61, 10.65, 41.37, 3.39, 12.25, -10.14, 7.61, 8
...: .84],
...: [2022, 56, 3.0, -0.13, 8.84, 27.25, -0.48, 2.54, 12.43, 7.56, 3.37
...: ]]
...:
In [13]: for i in range(len(b)):
...: if len(b[i]) >= 10:
...: for j in range(2,10):
...: b[i][j] = b[i][j]/100
output:
[[2021,
55,
-0.006500000000000001,
0.0761,
0.1065,
0.41369999999999996,
0.0339,
0.1225,
-0.1014,
0.0761,
8.84],
[2022,
56,
0.03,
-0.0013,
0.08839999999999999,
0.2725,
-0.0048,
0.0254,
0.1243,
0.0756,
3.37]]
No need to take slices out first... change your list-comprehension to:
b = [[2021, 55, -0.65, 7.61, 10.65, 41.37, 3.39, 12.25, -10.14, 7.61, 8.84],
[2022, 56, 3.0, -0.13, 8.84, 27.25, -0.48, 2.54, 12.43, 7.56, 3.37]]
res = [[item / 100 if 2 <= i < 10 else item for i, item in enumerate(lst)] for lst in b]
print(res)
Output:
[[2021, 55, -0.006500000000000001, 0.0761, 0.1065, 0.41369999999999996, 0.0339, 0.1225, -0.1014, 0.0761, 8.84], [2022, 56, 0.03, -0.0013, 0.08839999999999999, 0.2725, -0.0048, 0.0254, 0.1243, 0.0756, 3.37]]
res = [x[:2] + [x[i]/100 for i in range(len(x)) if i > 1] for x in b]\
print(res)

Why is numpy's kron so fast?

I was trying to implement a kronecker product function. Below are three ideas that I have:
def kron(arr1, arr2):
"""columnwise outer product, avoiding relocate elements.
"""
r1, c1 = arr1.shape
r2, c2 = arr2.shape
nrows, ncols = r1 * r2, c1 * c2
res = np.empty((nrows, ncols))
for idx1 in range(c1):
for idx2 in range(c2):
new_c = idx1 * c2 + idx2
temp = np.zeros((r2, r1))
temp_kron = scipy.linalg.blas.dger(
alpha=1.0, x=arr2[:, idx2], y=arr1[:, idx1], incx=1, incy=1,
a=temp)
res[:, new_c] = np.ravel(temp_kron, order='F')
return res
def kron2(arr1, arr2):
"""First outer product, then rearrange items.
"""
r1, c1 = arr1.shape
r2, c2 = arr2.shape
nrows, ncols = r1 * r2, c1 * c2
tmp = np.outer(arr2, arr1)
res = np.empty((nrows, ncols))
for idx in range(arr1.size):
for offset in range(c2):
orig = tmp[offset::c2, idx]
dest_coffset = idx % c1 * c2 + offset
dest_roffset = (idx // c1) * r2
res[dest_roffset:dest_roffset+r2, dest_coffset] = orig
return res
def kron3(arr1, arr2):
"""First outer product, then rearrange items.
"""
r1, c1 = arr1.shape
r2, c2 = arr2.shape
nrows, ncols = r1 * r2, c1 * c2
tmp = np.outer(np.ravel(arr2, 'F'), np.ravel(arr1, 'F'))
res = np.empty((nrows, ncols))
for idx in range(arr1.size):
for offset in range(c2):
orig_offset = offset * r2
orig = tmp[orig_offset:orig_offset+r2, idx]
dest_c = idx // r1 * c2 + offset
dest_r = idx % r1 * r2
res[dest_r:dest_r+r2, dest_c] = orig
return res
Based on this stackoverflow post I created a MeasureTime decorator. A natural benchmark would be to compare against numpy.kron. Below are my test functions:
#MeasureTime
def test_np_kron(arr1, arr2, number=1000):
for _ in range(number):
np.kron(arr1, arr2)
return
#MeasureTime
def test_kron(arr1, arr2, number=1000):
for _ in range(number):
kron(arr1, arr2)
#MeasureTime
def test_kron2(arr1, arr2, number=1000):
for _ in range(number):
kron2(arr2, arr1)
#MeasureTime
def test_kron3(arr1, arr2, number=1000):
for _ in range(number):
kron3(arr2, arr1)
Turned out that Numpy's kron function performances much better:
arr1 = np.array([[1,-4,7], [-2, 3, 3]], dtype=np.float64, order='F')
arr2 = np.array([[8, -9, -6, 5], [1, -3, -4, 7], [2, 8, -8, -3], [1, 2, -5, -1]], dtype=np.float64, order='F')
In [243]: test_np_kron(arr1, arr2, number=10000)
Out [243]: "test_np_kron": 0.19688990000577178s
In [244]: test_kron(arr1, arr2, number=10000)
Out [244]: "test_kron": 0.6094115000014426s
In [245]: test_kron2(arr1, arr2, number=10000)
Out [245]: "test_kron2": 0.5699560000066413s
In [246]: test_kron3(arr1, arr2, number=10000)
Out [246]: "test_kron3": 0.7134822000080021s
I would like to know why that is the case? Is that because Numpy's reshape method is much more performant than manually copying over stuff (although still using numpy)? I was puzzled, since otherwise, I was using np.outer / blas.dger as well. The only difference I recognized here was how we arranged the ending results.
How come NumPy's reshape perform this good?
Here is the link to NumPy 1.17 kron source.
Updates:
Forgot to mention in the first place that I was trying to prototype in python, and then implement kron using C++ with cblas/lapack. Had some existing 'kron' needing to be refactored. I then came across Numpy's reshape and got really impressed.
Thanks in advance for your time!
Let's experiment with 2 small arrays:
In [124]: A, B = np.array([[1,2],[3,4]]), np.array([[10,11],[12,13]])
kron produces:
In [125]: np.kron(A,B)
Out[125]:
array([[10, 11, 20, 22],
[12, 13, 24, 26],
[30, 33, 40, 44],
[36, 39, 48, 52]])
outer produces the same numbers, but with a different arangement:
In [126]: np.outer(A,B)
Out[126]:
array([[10, 11, 12, 13],
[20, 22, 24, 26],
[30, 33, 36, 39],
[40, 44, 48, 52]])
kron reshapes it to a combination of the shapes of A and B:
In [127]: np.outer(A,B).reshape(2,2,2,2)
Out[127]:
array([[[[10, 11],
[12, 13]],
[[20, 22],
[24, 26]]],
[[[30, 33],
[36, 39]],
[[40, 44],
[48, 52]]]])
it then recombines 4 dimensions into 2 with concatenate:
In [128]: np.concatenate(np.concatenate(_127, 1),1)
Out[128]:
array([[10, 11, 20, 22],
[12, 13, 24, 26],
[30, 33, 40, 44],
[36, 39, 48, 52]])
An alternative is to swap axes, and reshape:
In [129]: _127.transpose(0,2,1,3).reshape(4,4)
Out[129]:
array([[10, 11, 20, 22],
[12, 13, 24, 26],
[30, 33, 40, 44],
[36, 39, 48, 52]])
The first reshape and transpose produce a view, but the second reshape has to produce a copy. Concatenate makes a copy. But all those actions are done in compiled numpy code.
Defining functions:
def foo1(A,B):
temp = np.outer(A,B)
temp = temp.reshape(A.shape + B.shape)
return np.concatenate(np.concatenate(temp, 1), 1)
def foo2(A,B):
temp = np.outer(A,B)
nz = temp.shape
temp = temp.reshape(A.shape + B.shape)
return temp.transpose(0,2,1,3).reshape(nz)
testing:
In [141]: np.allclose(np.kron(A,B), foo1(A,B))
Out[141]: True
In [142]: np.allclose(np.kron(A,B), foo2(A,B))
Out[142]: True
timing:
In [143]: timeit np.kron(A,B)
42.4 µs ± 294 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
In [145]: timeit foo1(A,B)
26.3 µs ± 38.6 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
In [146]: timeit foo2(A,B)
13.8 µs ± 19.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
My code may need some generalization, but it demonstrates the validity of the approach.
===
With your kron:
In [150]: kron(A,B)
Out[150]:
array([[10., 11., 20., 22.],
[12., 13., 24., 26.],
[30., 33., 40., 44.],
[36., 39., 48., 52.]])
In [151]: timeit kron(A,B)
55.3 µs ± 1.59 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
edit
einsum can do both the outer and transpose:
In [265]: np.einsum('ij,kl->ikjl',A,B).reshape(4,4)
Out[265]:
array([[10, 11, 20, 22],
[12, 13, 24, 26],
[30, 33, 40, 44],
[36, 39, 48, 52]])
In [266]: timeit np.einsum('ij,kl->ikjl',A,B).reshape(4,4)
9.87 µs ± 33 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

Resources