This is a dput() sample of my data:
structure(list(ID = c("101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101"), IDA = c("1000",
"1279", "1392", "534", "835", "910", "748", "589", "675", "500",
"1243", "635", "1181", "791", "755", "1069", "640", "1229", "1856",
"116", "767", "1126", "863", "1141", "1858", "899", "5", "225",
"175", "1764", "1017", "497", "771", "41", "816", "1046", "439",
"930", "1350", "641", "1057", "1021", "503", "553", "1738", "1379",
"774", "442", "1113", "1503"), DATE = structure(c(1497315600,
1552352400, 1552957200, 1390438800, 1439427600, 1479776400, 1455757200,
1402534800, 1409187600, 1383008400, 1536022800, 1414630800, 1545094800,
1551142800, 1461805200, 1483405200, 1420506000, 1534813200, 1493600400,
1348448400, 1458176400, 1521075600, 1464656400, 1527555600, 1504573200,
1478134800, 1278378000, 1320886800, 1309395600, 1598576400, 1500512400,
1385600400, 1436403600, 1284426000, 1430960400, 1485824400, 1381971600,
1477962000, 1510023600, 1420509600, 1508806800, 1499302800, 1386205200,
1379466000, 1555290000, 1565226000, 1435798800, 1494896400, 1516064400,
1593478800), tzone = "UTC", class = c("POSIXct", "POSIXt")),
NR = c("CH-1000", " CH-1279", "CH-1392",
"CH-0534", "CH-0835", " CH-0910", "CH-0748",
"CH-0589", "CH-0675", "CH-0500", "CH-1243",
"CH-0635", "CH-1181", "CH-0791", "CH-0755",
"CH-1069", "CH-0640", "CH-1229", "CH-1856",
"CH-0116", "CH-0767", "CH-1126", "CH-0863",
"CH-1141", "CH-1858", "CH-0899", "CH-0005",
"CH-0225", "CH-0175", "CH-1764", "CH-1017",
"CH-0497", "CH-0771", "CH-0041", "CH-0816",
"CH-1046", "CH-0439", "CH-0930", "CH-1350",
"CH-0641", "CH-1057", "CH-1021", "CH-0503",
"CH-0553", "CH-1738", "CH-1379", "CH-0774",
"CH-0442", "CH-1113", "CH-1503"), PAT = c("101-1000",
"101-1279", "101-1392", "101-534", "101-835", "101-910",
"101-748", "101-589", "101-675", "101-500", "101-1243", "101-635",
"101-1181", "101-791", "101-755", "101-1069", "101-640",
"101-1229", "101-1856", "101-116", "101-767", "101-1126",
"101-863", "101-1141", "101-1858", "101-899", "101-5", "101-225",
"101-175", "101-1764", "101-1017", "101-497", "101-771",
"101-41", "101-816", "101-1046", "101-439", "101-930", "101-1350",
"101-641", "101-1057", "101-1021", "101-503", "101-553",
"101-1738", "101-1379", "101-774", "101-442", "101-1113",
"101-1503"), INT1 = c(NA, NA, NA, 280035, 280040, NA,
280040, 280040, 285030, 245040, NA, 280035, NA, NA, 280040,
NA, 220035, NA, NA, 280040, 280040, NA, 245005, NA, NA, 275005,
240070, 220035, 280040, NA, NA, 280040, 280040, 240005, 280040,
NA, 280040, 240005, 245040, 240030, NA, NA, 260010, NA, NA,
NA, 280040, NA, NA, NA), INT2 = c(NA, NA, NA, NA, NA,
NA, NA, 280040, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 240030, NA, 260005, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), INT3 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), INT4 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), INTX1 = c(NA, 280005, 220035, NA, NA, NA,
NA, NA, NA, NA, 280050, NA, 240080, 280050, NA, 240085, NA,
280050, 270010, NA, NA, 280050, NA, 280005, NA, NA, NA, NA,
NA, 275045, 280050, NA, NA, NA, NA, 245005, NA, NA, 245040,
NA, NA, 280050, NA, NA, 220035, 280050, NA, 255005, 280050,
220005), INTX2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 240085, NA, NA, NA, NA, NA, NA, NA, NA, 280050,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), INTX3 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), INTX4 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), KAT = c(1,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame"))
What I needed to do first was to recode some values from INT1:INT4 and INTX1:INTX4 and put them in new columns. To this end, I used pivot_longer as below:
longDATA <- DATA %>%
  pivot_longer(cols = c('INT1':'INTX4'),
               names_to = "INT", values_to = "Code")
Then I used the long data to mutate new variables as below:
longDATA1 <- longDATA %>% mutate(palm = case_when(Code == 210025 ~ 1))
longDATA2 <- longDATA1 %>% mutate(bio = case_when(Code == 210015 ~ '12.06.25',
                                                  Code == 210020 ~ '12.07.25',
                                                  Code == 275015 ~ '12.06.25',
                                                  Code == 275020 ~ '12.07.25'))
longDATA3 <- longDATA2 %>% mutate(EPX = case_when(Code == 280005 ~ 1, Code == 280010 ~ 1))
Then I needed to return the data to wide format, using the code below:
WideDATA <- longDATA3 %>% pivot_wider(names_from = INT, values_from = Code)
Below you can see the output, but it does not show the problem, as I have a huge dataset. I realized the number of rows has changed from 2480 (my initial data before the pivot_longer) to 2633 (the wide data). The added rows are created whenever a value has been recoded into one of the EPX, bio, or palm columns. For example, if a row has INT1 and INT2 filled with values other than NA (e.g., 280010 and 280040), EPX is assigned the value 1 by the earlier mutate. However, EPX = 1 and INT1 = 280010 end up in one row, while 280040 appears in another row, like the photo below.
[1]: https://i.stack.imgur.com/jjEea.png
I have spent a week trying to figure this out, but no luck. I would highly appreciate your advice.
structure(list(ID = c("101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101", "101",
"101", "101", "101", "101", "101", "101", "101", "101"), IDA = c("1000",
"1279", "1392", "534", "835", "910", "748", "589", "675", "500",
"1243", "635", "1181", "791", "755", "1069", "640", "1229", "1856",
"116", "767", "1126", "863", "1141", "1858", "899", "5", "225",
"175", "1764", "1017", "497", "771", "41", "816", "1046", "439",
"930", "1350", "641", "1057", "1021", "503", "553", "1738", "1379",
"774", "442", "1113", "1503"), DATE = structure(c(1497315600,
1552352400, 1552957200, 1390438800, 1439427600, 1479776400, 1455757200,
1402534800, 1409187600, 1383008400, 1536022800, 1414630800, 1545094800,
1551142800, 1461805200, 1483405200, 1420506000, 1534813200, 1493600400,
1348448400, 1458176400, 1521075600, 1464656400, 1527555600, 1504573200,
1478134800, 1278378000, 1320886800, 1309395600, 1598576400, 1500512400,
1385600400, 1436403600, 1284426000, 1430960400, 1485824400, 1381971600,
1477962000, 1510023600, 1420509600, 1508806800, 1499302800, 1386205200,
1379466000, 1555290000, 1565226000, 1435798800, 1494896400, 1516064400,
1593478800), tzone = "UTC", class = c("POSIXct", "POSIXt")),
NR = c("CH-1000", " CH-1279", "CH-1392",
"CH-0534", "CH-0835", " CH-0910", "CH-0748",
"CH-0589", "CH-0675", "CH-0500", "CH-1243",
"CH-0635", "CH-1181", "CH-0791", "CH-0755",
"CH-1069", "CH-0640", "CH-1229", "CH-1856",
"CH-0116", "CH-0767", "CH-1126", "CH-0863",
"CH-1141", "CH-1858", "CH-0899", "CH-0005",
"CH-0225", "CH-0175", "CH-1764", "CH-1017",
"CH-0497", "CH-0771", "CH-0041", "CH-0816",
"CH-1046", "CH-0439", "CH-0930", "CH-1350",
"CH-0641", "CH-1057", "CH-1021", "CH-0503",
"CH-0553", "CH-1738", "CH-1379", "CH-0774",
"CH-0442", "CH-1113", "CH-1503"), PAT = c("101-1000",
"101-1279", "101-1392", "101-534", "101-835", "101-910",
"101-748", "101-589", "101-675", "101-500", "101-1243", "101-635",
"101-1181", "101-791", "101-755", "101-1069", "101-640",
"101-1229", "101-1856", "101-116", "101-767", "101-1126",
"101-863", "101-1141", "101-1858", "101-899", "101-5", "101-225",
"101-175", "101-1764", "101-1017", "101-497", "101-771",
"101-41", "101-816", "101-1046", "101-439", "101-930", "101-1350",
"101-641", "101-1057", "101-1021", "101-503", "101-553",
"101-1738", "101-1379", "101-774", "101-442", "101-1113",
"101-1503"), palm = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), bio= c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), EPx = c(NA,
NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), INT1 = c(NA, NA, NA, 280035, 280040, NA,
280040, 280040, 285030, 245040, NA, 280035, NA, NA, 280040,
NA, 220035, NA, NA, 280040, 280040, NA, 245005, NA, NA, 275005,
240070, 220035, 280040, NA, NA, 280040, 280040, 240005, 280040,
NA, 280040, 240005, 245040, 240030, NA, NA, 260010, NA, NA,
NA, 280040, NA, NA, NA), INT2 = c(NA, NA, NA, NA, NA,
NA, NA, 280040, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 240030, NA, 260005, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), INT3 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), INT4 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), INTX1 = c(NA, 280005, 220035, NA, NA, NA,
NA, NA, NA, NA, 280050, NA, 240080, 280050, NA, 240085, NA,
280050, 270010, NA, NA, 280050, NA, 280005, NA, NA, NA, NA,
NA, 275045, 280050, NA, NA, NA, NA, 245005, NA, NA, 245040,
NA, NA, 280050, NA, NA, 220035, 280050, NA, 255005, 280050,
220005), INTX2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 240085, NA, NA, NA, NA, NA, NA, NA, NA, 280050,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), INTX3 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), INTX4 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), KAT = c(1,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame"))
I copied and pasted your code, and both the DATA and WideDATA objects have the same number of rows (10), just with the desired variables added in WideDATA. It's possible there is something in your full data set that is creating the additional rows. I see some NAs in the sample you shared, and it's worth noting that case_when doesn't treat those specially. If you need them handled as NA explicitly, you need to include is.na in a conditional statement. If sharing the full data set isn't possible, maybe randomly sampling would help.
e.g. dput(sample_n(DATA, 50))
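For example, a minimal sketch (reusing the palm recode from the question) of handling missing codes explicitly inside case_when:

# Sketch: make the NA case explicit instead of letting it fall through.
longDATA %>%
  mutate(palm = case_when(
    is.na(Code)    ~ NA_real_,  # explicit branch for missing codes
    Code == 210025 ~ 1,
    TRUE           ~ 0
  ))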
I'm adding this as an answer though I realize it may need further editing as I couldn't usefully share the following in a comment.
In the code below I mostly kept your examples intact, but put everything into a single piped chunk of code. It might be simpler to stick with if_else in cases where there's only a single condition, but there's no harm using case_when throughout if you prefer the syntax.
You'll note I include TRUE ~ ... in each. Without a way to evaluate general cases, you get NA whenever the explicit statements aren't true.
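For instance, the single-condition palm recode could also be written with if_else; a sketch (the missing argument controls what NA codes become):

# Sketch: single-condition recode; missing = 0 mirrors the TRUE ~ 0 fallback used below.
longDATA %>% mutate(palm = if_else(Code == 210025, 1, 0, missing = 0))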
Hopefully this lets you spot what is happening in your full data set, and if not please continue to update the question.
library(tidyverse)
DATA <- structure(list(ID = c("101", "101", "101", "101", "101", "101","101", "101", "101", "101"), IDA = c("1", "1", "2", "3", "4","5", "5", "1859", "1860", "1861"), DATE = structure(c(1300928400,1277946000, 1277946000, 1278550800, 1278550800, 1453770000, 1329958800,1506474000, 1485133200, 1485133200), tzone = "UTC", class = c("POSIXct","POSIXt")), NR = c("CH-0001", "CH-0001","CH-0002", "CH-0003", "CH-0004", "CH-0005","CH-0005", "CH-1859", "CH-1860", "CH-1861"), PAT = c("101-1", "101-1", "101-2", "101-3", "101-4", "101-5","101-5", "101-1859", "101-1860", "101-1861"), INT1 = c(245005,280040, 280040, 280040, 280040, 240040, 240040, NA, NA, NA),INT2 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), INT3 = c(NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_), INT4 = c(NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_), INTX1 = c(NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,NA_real_), INTX2 = c(NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), INTX173 = c(NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), INTX4 = c(NA_real_, NA_real_, NA_real_, NA_real_,NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), KAT = c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1)), row.names = c(NA,-10L), class = c("tbl_df", "tbl", "data.frame"))
longDATA <- DATA %>%
  pivot_longer(cols = c('INT1':'INTX4'),
               names_to = "INT", values_to = "Code") %>%
  mutate(
    palm = case_when(
      Code == 210025 ~ 1,
      TRUE ~ 0),
    bio = case_when(
      Code == 210015 ~ '12.06.25',
      Code == 210020 ~ '12.07.25',
      Code == 275015 ~ '12.06.25',
      Code == 275020 ~ '12.07.25',
      TRUE ~ ''
    ),
    EPX = case_when(
      Code == 280005 ~ 1,
      Code == 280010 ~ 1,
      TRUE ~ 0
    )
  ) %>%
  pivot_wider(names_from = INT,
              values_from = Code)
longDATA
#> # A tibble: 10 × 17
#> ID IDA DATE NR PAT KAT palm bio EPX INT1
#> <chr> <chr> <dttm> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 101 1 2011-03-24 01:00:00 CH-0001 101-1 0 0 "" 0 245005
#> 2 101 1 2010-07-01 01:00:00 CH-0001 101-1 0 0 "" 0 280040
#> 3 101 2 2010-07-01 01:00:00 CH-0002 101-2 0 0 "" 0 280040
#> 4 101 3 2010-07-08 01:00:00 CH-0003 101-3 0 0 "" 0 280040
#> 5 101 4 2010-07-08 01:00:00 CH-0004 101-4 0 0 "" 0 280040
#> 6 101 5 2016-01-26 01:00:00 CH-0005 101-5 0 0 "" 0 240040
#> 7 101 5 2012-02-23 01:00:00 CH-0005 101-5 0 0 "" 0 240040
#> 8 101 1859 2017-09-27 01:00:00 CH-1859 101-1… 1 0 "" 0 NA
#> 9 101 1860 2017-01-23 01:00:00 CH-1860 101-1… 1 0 "" 0 NA
#> 10 101 1861 2017-01-23 01:00:00 CH-1861 101-1… 1 0 "" 0 NA
#> # … with 7 more variables: INT2 <dbl>, INT3 <dbl>, INT4 <dbl>, INTX1 <dbl>,
#> # INTX2 <dbl>, INTX173 <dbl>, INTX4 <dbl>
Created on 2022-12-12 with reprex v2.0.2
Try this, and check the lines where I noted #fix:
library(dplyr)
library(tidyr)
longDATA <- DATA %>%
  pivot_longer(cols = c('INT1':'INTX4'),
               names_to = "INT", values_to = "Code")

longDATA1 <- longDATA %>% mutate(palm = case_when(Code == 210025 ~ 1,
                                                  TRUE ~ NA_real_)) #fix

longDATA2 <- longDATA1 %>% mutate(bio = case_when(Code == 210015 ~ '12.06.25',
                                                  Code == 210020 ~ '12.07.25',
                                                  Code == 275015 ~ '12.06.25',
                                                  Code == 275020 ~ '12.07.25',
                                                  TRUE ~ NA_character_)) #fix

longDATA3 <- longDATA2 %>% mutate(EPX = case_when(Code == 280005 ~ 1,
                                                  Code == 280010 ~ 1,
                                                  TRUE ~ NA_real_)) #fix

WideDATA <- longDATA3 %>% pivot_wider(id_cols = ID:KAT, #fix
                                      names_from = INT, values_from = Code)
#########
# Check #
#########
nrow(DATA)
#> [1] 50
nrow(WideDATA)
#> [1] 50
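As background on why id_cols matters here: by default, pivot_wider treats every column not named in names_from or values_from as part of the row identity, so the newly created palm, bio and EPX columns also define rows. Since those columns only receive a value on the single long row whose Code matched, rows belonging to the same record no longer agree on them, and the record gets split across several wide rows. Restricting id_cols to ID:KAT means only those original columns identify a wide row, which is why the row counts match again.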
I have a Shiny application with a login interface. I need to subset data based on the user login. Within the data, I have a column that marks whether a row belongs to that user. The column is called "memberstate" and essentially contains the login username. The code I am trying to use is borrowed from RStudio and is as follows:
user <- reactive({
  session$user
})

### Code to manage row level security
isManager <- reactive({
  if (user() == "manager"){
    return(TRUE)
  } else {
    return(FALSE)
  }
})

# Based on the logged in user, pull out only the data this user
# should be able to see.
data <- read.csv("data/DHA&PPTabletsClean.csv")

myData <- reactive({
  if (isManager()){
    # If a manager, show everything.
    return(data)
  } else {
    # If a Member State, only show their own data.
    return(data[data$memberstate == user(),])
  }
})
I then try to use MyData for plotting graphs and am getting the following error message:
"Error in Mydata: could not find function "Mydata""
I am a newbie to R Shiny. Kindly assist.
Part of the data is as follows:
dput(data)
structure(list(Brand = c("Malaril", "Malaril", "Malaril", "Malaril",
"Malaril", "Malaril", "Malaril", "Malaril", "Malaril", "Malaril",
"Malaril", "Malaril", "Malaril", "Malaril", "Malaril", "Malaril",
"Malaril", "Malaril", "Malaril", "Malaril", "Malaril", "Malaril",
"Malaril", "Malaril", "Malaril", "Malaril", "Malaril", "Malaril",
"Malaril", "Malaril", "Malaril", "Malaril", "Malaril", "Malaril"
), ActiveIngredient = c("Dihydroartemisinin", "Dihydroartemisinin",
"Dihydroartemisinin", "Dihydroartemisinin", "Dihydroartemisinin",
"Dihydroartemisinin", "Piperaquine Phosphate", "Piperaquine Phosphate",
"Piperaquine Phosphate", "Piperaquine Phosphate", "Piperaquine Phosphate",
"Piperaquine Phosphate", "Dihydroartemisinin", "Dihydroartemisinin",
"Dihydroartemisinin", "Dihydroartemisinin", "Dihydroartemisinin",
"Dihydroartemisinin", "Piperaquine Phosphate", "Piperaquine Phosphate",
"Piperaquine Phosphate", "Piperaquine Phosphate", "Piperaquine Phosphate",
"Piperaquine Phosphate", "Dihydroartemisinin", "Dihydroartemisinin",
"Dihydroartemisinin", "Dihydroartemisinin", "Dihydroartemisinin",
"Dihydroartemisinin", "Piperaquine Phosphate", "Piperaquine Phosphate",
"Piperaquine Phosphate", "Piperaquine Phosphate"), Assay = c(94.9,
94.9, 94.9, 94.9, 94.9, 94.9, 101.6, 101.6, 101.6, 101.6, 101.6,
101.6, 95, 95, 95, 95, 95, 95, 100.2, 100.2, 100.2, 100.2, 100.2,
100.2, 96.4, 96.4, 96.4, 96.4, 96.4, 96.4, 100.6, 100.6, 100.6,
100.6), Assayperc = c(0.949, 0.949, 0.949, 0.949, 0.949, 0.949,
1.016, 1.016, 1.016, 1.016, 1.016, 1.016, 0.95, 0.95, 0.95, 0.95,
0.95, 0.95, 1.002, 1.002, 1.002, 1.002, 1.002, 1.002, 0.965,
0.965, 0.965, 0.965, 0.965, 0.965, 1.006, 1.006, 1.006, 1.006
), AssayLL = c(90L, 90L, 90L, 90L, 90L, 90L, 93L, 93L, 93L, 93L,
93L, 93L, 90L, 90L, 90L, 90L, 90L, 90L, 93L, 93L, 93L, 93L, 93L,
93L, 90L, 90L, 90L, 90L, 90L, 90L, 93L, 93L, 93L, 93L), AssaypercLL = c(0.9,
0.9, 0.9, 0.9, 0.9, 0.9, 0.93, 0.93, 0.93, 0.93, 0.93, 0.93,
0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.93, 0.93, 0.93, 0.93, 0.93, 0.93,
0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.93, 0.93, 0.93, 0.93), AssayUL = c(110L,
110L, 110L, 110L, 110L, 110L, 107L, 107L, 107L, 107L, 107L, 107L,
110L, 110L, 110L, 110L, 110L, 110L, 107L, 107L, 107L, 107L, 107L,
107L, 110L, 110L, 110L, 110L, 110L, 110L, 107L, 107L, 107L, 107L
), AssaypercUL = c(1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.07, 1.07,
1.07, 1.07, 1.07, 1.07, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.07, 1.07,
1.07, 1.07, 1.07, 1.07, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.07, 1.07,
1.07, 1.07), DateManufacture = c("1/10/2017", "1/10/2017", "1/10/2017",
"1/10/2017", "1/10/2017", "1/10/2017", "1/10/2017", "1/10/2017",
"1/10/2017", "1/10/2017", "1/10/2017", "1/10/2017", "1/6/2018",
"1/6/2018", "1/6/2018", "1/6/2018", "1/6/2018", "1/6/2018", "1/6/2018",
"1/6/2018", "1/6/2018", "1/6/2018", "1/6/2018", "1/6/2018", "1/8/2018",
"1/8/2018", "1/8/2018", "1/8/2018", "1/8/2018", "1/8/2018", "1/8/2018",
"1/8/2018", "1/8/2018", "1/8/2018"), ExpiryDate = c("1/9/2019",
"1/9/2019", "1/9/2019", "1/9/2019", "1/9/2019", "1/9/2019", "1/9/2019",
"1/9/2019", "1/9/2019", "1/9/2019", "1/9/2019", "1/9/2019", "1/5/2020",
"1/5/2020", "1/5/2020", "1/5/2020", "1/5/2020", "1/5/2020", "1/5/2020",
"1/5/2020", "1/5/2020", "1/5/2020", "1/5/2020", "1/5/2020", "1/7/2020",
"1/7/2020", "1/7/2020", "1/7/2020", "1/7/2020", "1/7/2020", "1/7/2020",
"1/7/2020", "1/7/2020", "1/7/2020"), ShelfLifeYrs = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), ShelfLifeDysRecpt = c(-547L, -547L, -547L, -547L, -547L, -547L,
-547L, -547L, -547L, -547L, -547L, -547L, -304L, -304L, -304L,
-304L, -304L, -304L, -304L, -304L, -304L, -304L, -304L, -304L,
-243L, -243L, -243L, -243L, -243L, -243L, -243L, -243L, -243L,
-243L), DateReceiptSample = c("1/3/2021", "1/3/2021", "1/3/2021",
"1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021",
"1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021",
"1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021",
"1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021",
"1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021", "1/3/2021",
"1/3/2021"), COADateIssue = c("27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021", "27/5/2021",
"27/5/2021"), TestingOutcome = c("Pass", "Pass", "Pass", "Pass",
"Pass", "Pass", "Pass", "Pass", "Pass", "Pass", "Pass", "Pass",
"Fail", "Fail", "Fail", "Fail", "Fail", "Fail", "Fail", "Fail",
"Fail", "Fail", "Fail", "Fail", "Pass", "Pass", "Pass", "Pass",
"Pass", "Pass", "Pass", "Pass", "Pass", "Pass"), FailureReason = c("",
"", "", "", "", "", "", "", "", "", "", "", "Dihydroartemisinin Dissolution",
"Dihydroartemisinin Dissolution", "Dihydroartemisinin Dissolution",
"Dihydroartemisinin Dissolution", "Dihydroartemisinin Dissolution",
"Dihydroartemisinin Dissolution", "Dihydroartemisinin Dissolution",
"Dihydroartemisinin Dissolution", "Dihydroartemisinin Dissolution",
"Dihydroartemisinin Dissolution", "Dihydroartemisinin Dissolution",
"Dihydroartemisinin Dissolution", "", "", "", "", "", "", "",
"", "", ""), Dissolution = c(77L, 81L, 84L, 86L, 82L, 81L, 100L,
96L, 98L, 101L, 97L, 102L, 62L, 59L, 62L, 66L, 65L, 61L, 99L,
95L, 97L, 103L, 99L, 102L, 97L, 80L, 81L, 86L, 80L, 80L, 103L,
101L, 101L, 101L), Dissolutionperc = c(0.77, 0.81, 0.84, 0.86,
0.82, 0.81, 1, 0.96, 0.98, 1.01, 0.97, 1.02, 0.62, 0.59, 0.62,
0.66, 0.65, 0.61, 0.99, 0.95, 0.97, 1.03, 0.99, 1.02, 0.97, 0.8,
0.81, 0.86, 0.8, 0.8, 1.03, 1.01, 1.01, 1.01), DissolLL = c(70L,
70L, 70L, 70L, 70L, 70L, 80L, 80L, 80L, 80L, 80L, 80L, 70L, 70L,
70L, 70L, 70L, 70L, 80L, 80L, 80L, 80L, 80L, 80L, 70L, 70L, 70L,
70L, 70L, 70L, 80L, 80L, 80L, 80L), DissolutionpercLL = c(0.7,
0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.7, 0.7,
0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.7, 0.7, 0.7,
0.7, 0.7, 0.7, 0.8, 0.8, 0.8, 0.8), Mass = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), pH = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), Dosageform = c("Tablet", "Tablet", "Tablet", "Tablet", "Tablet",
"Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet",
"Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet",
"Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet",
"Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet", "Tablet",
"Tablet"), Therapeuticclass = c("Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial",
"Antimalarial", "Antimalarial", "Antimalarial", "Antimalarial"
), memberstate = c("", "", "", "", "", "ruvimbo", "ruvimbo",
"ruvimbo", "ruvimbo", "ruvimbo", "ruvimbo", "ruvimbo", "ruvimbo",
"ruvimbo", "ruvimbo", "ruvimbo", "ruvimbo", "ruvimbo", "ruvimbo",
"ruvimbo", "ruvimbo", "ruvimbo", "", "", "", "", "", "", "",
"", "", "", "", "")), class = "data.frame", row.names = c(NA,
-34L))
Regards
Chris
I am trying to run custom NER on my data using offset values. I tried to replicate this guide: https://huggingface.co/course/chapter7/2
I keep getting an error at this line:
_name = "label" if "label" in features[0].keys() else "labels"
DATA BEFORE tokenize_and_align_labels FUNCTIONS
{'texts': ['WASHINGTON USA WA DRIVER LICENSE BESSETTE Lamma 4d DL 73235766 9 Class AM to Iss 22/03/2021 Ab Exp 07130/2021 DOB 2/28/21 1 BESSETTE 2 GERALD 8 6930 NE Grandview Blvd, keyport, WA 86494 073076 12 Restrictions A 9a End P 16 Hgt 5\'-04" 15 Sex F 18 Eyes BLU 5 DD 73235766900000000000 Gerald Bessette', ] }
'tag_names': [
[
{'start': 281, 'end': 296, 'tag': 'PERSON_NAME', 'text': 'Gerald Bessette'},
{'start': 135, 'end': 141, 'tag': 'FIRST_NAME', 'text': 'GERALD'},
{'start': 124, 'end': 122, 'tag': 'LAST_NAME', 'text': 'BESSETTE'},
{'start': 81, 'end': 81, 'tag': 'ISSUE_DATE', 'text': '22/03/2021'},
{'start': 99, 'end': 109, 'tag': 'EXPIRY_DATE', 'text': '07130/2021'},
{'start': 114, 'end': 121, 'tag': 'DATE_OF_BIRTH', 'text': '2/28/21'},
{'start': 51, 'end': 59, 'tag': 'DRIVER_LICENSE_NUMBER', 'text': '73235766'},
{'start': 144, 'end': 185, 'tag': 'ADDRESS', 'text': '6930 NE Grandview Blvd, keyport, WA 86494'}
],
DATA AFTER tokenize_and_align_labels FUNCTIONS
{'input_ids':
[[0, 305, 8684, 2805, 9342, 10994, 26994, 42560, 39951, 163, 12147, 3935, 6433, 6887, 1916, 204, 417, 13925, 6521, 1922, 4390, 4280, 361,
4210, 3326, 7, 19285, 820, 73, 3933, 73, 844, 2146, 2060, 12806, 321, 5339, 541, 73, 844, 2146, 14010, 387, 132, 73, 2517, 73, 2146, 112,
163, 12147, 3935, 6433, 132, 272, 39243, 495, 290, 5913, 541, 12462, 2374, 5877, 12543, 6, 762, 3427, 6, 9342, 290, 4027, 6405, 13470, 541,
5067, 316, 40950, 2485, 83, 361, 102, 4680, 221, 545, 289, 19377, 195, 32269, 3387, 113, 379, 15516, 274, 504, 26945, 12413, 791, 195, 27932,
6521, 1922, 4390, 36400, 45947, 151, 14651, 163, 3361, 3398, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
'attention_mask':
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'offset_mapping': [[(0, 0), (0, 1), (1, 10), (11, 14), (15, 17), (18, 20), (20, 24), (25, 28), (28, 32), (33, 34), (34, 37), (37, 39), (39, 41),
(42, 45), (45, 47), (48, 49), (49, 50), (51, 53), (54, 56), (56, 58), (58, 60), (60, 62), (63, 64), (65, 70), (71, 73),
(74, 76), (77, 80), (81, 83), (83, 84), (84, 86), (86, 87), (87, 89), (89, 91), (92, 94), (95, 98), (99, 100), (100, 102),
(102, 104), (104, 105), (105, 107), (107, 109), (110, 112), (112, 113), (114, 115), (115, 116), (116, 118), (118, 119),
(119, 121), (122, 123), (124, 125), (125, 128), (128, 130), (130, 132), (133, 134), (135, 136), (136, 140), (140, 141),
(142, 143), (144, 146), (146, 148), (149, 151), (152, 157), (157, 161), (162, 166), (166, 167), (168, 171), (171, 175),
(175, 176), (177, 179), (180, 181), (181, 183), (183, 185), (186, 188), (188, 190), (190, 192), (193, 195), (196, 204),
(204, 208), (209, 210), (211, 212), (212, 213), (214, 217), (218, 219), (220, 222), (223, 224), (224, 226), (227, 228),
(228, 230), (230, 232), (232, 233), (234, 236), (237, 240), (241, 242), (243, 245), (246, 250), (251, 253), (253, 254),
(255, 256), (257, 259), (260, 262), (262, 264), (264, 266), (266, 269), (269, 277), (277, 280), (281, 287), (288, 289),
(289, 292), (292, 296), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]
'labels': [[24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 2, 10, 10, 18, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 3, 11, 11, 11, 11, 19, 24, 24, 1, 9, 9, 9, 17, 24, 24, 24, 24, 24, 24, 4, 12, 20, 24, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
16, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 7, 15, 15, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24],
My Code:
import transformers
from transformers import AutoTokenizer
from transformers import AutoTokenizer,BertModel,BertTokenizer
from transformers import RobertaModel,RobertaConfig,RobertaForTokenClassification
from transformers import TrainingArguments, Trainer
# from transformers.trainer import get_tpu_sampler
from transformers.trainer_pt_utils import get_tpu_sampler
from transformers.data.data_collator import DataCollator, InputDataClass
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
import torch
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler
from torchcrf import CRF
import dataclasses
import logging
import warnings
import tqdm
import os
import numpy as np
from typing import List, Union, Dict
os.environ["WANDB_DISABLED"] = "true"
print(transformers.__version__)
import evaluate
metric = evaluate.load("seqeval")
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) #add_prefix_space=True
def isin(a, b):
    return a[1] > b[0] and a[0] < b[1]

def tokenize_and_align_labels(examples, label2id, max_length=256):
    tokenized_inputs = tokenizer(examples["texts"], truncation=True, padding='max_length', max_length=max_length, return_offsets_mapping=True)
    print("tokenization done")
    labels = []
    for i, label_idx_for_single_input in enumerate(tqdm.tqdm(examples["tag_names"])):
        # print(i, label_idx_for_single_input)
        labels_for_single_input = ['O' for _ in range(max_length)]
        # print(labels_for_single_input)
        text_offsets = tokenized_inputs['offset_mapping'][i]
        # print("text_offsets", text_offsets)
        for entity in label_idx_for_single_input:
            # print("entity", entity)
            tag = entity['tag']
            # print("tag", tag)
            tag_offset = [entity['start'], entity['end']]
            # print("tag_offset", tag_offset)
# text_offsets [(0, 0), (0, 1), (1, 10), (11, 14), (15, 17), (18, 20), (20, 24), (25, 28), (28, 32), (33, 34), (34, 37), (37, 39), (39, 41), (42, 45), (45, 47), (48, 49), (49, 50), (51, 53), (54, 56), (56, 58), (58, 60), (60, 62), (63, 64), (65, 70), (71, 73), (74, 76), (77, 80), (81, 83), (83, 84), (84, 86), (86, 87), (87, 89), (89, 91), (92, 94), (95, 98), (99, 100), (100, 102), (102, 104), (104, 105), (105, 107), (107, 109), (110, 112), (112, 113), (114, 115), (115, 116), (116, 118), (118, 119), (119, 121), (122, 123), (124, 125), (125, 128), (128, 130), (130, 132), (133, 134), (135, 136), (136, 140), (140, 141), (142, 143), (144, 146), (146, 148), (149, 151), (152, 157), (157, 161), (162, 166), (166, 167), (168, 171), (171, 175), (175, 176), (177, 179), (180, 181), (181, 183), (183, 185), (186, 188), (188, 190), (190, 192), (193, 195), (196, 204), (204, 208), (209, 210), (211, 212), (212, 213), (214, 217), (218, 219), (220, 222), (223, 224), (224, 226), (227, 228), (228, 230), (230, 232), (232, 233), (234, 236), (237, 240), (241, 242), (243, 245), (246, 250), (251, 253), (253, 254), (255, 256), (257, 259), (260, 262), (262, 264), (264, 266), (266, 269), (269, 277), (277, 280), (281, 287), (288, 289), (289, 292), (292, 296), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]
# entity {'start': 281, 'end': 296, 'tag': 'PERSON_NAME', 'text': 'Gerald Bessette'}
# tag PERSON_NAME
# tag_offset [281, 296]
            affected_token_ids = [j for j in range(max_length) if isin(tag_offset, text_offsets[j])]
            # print("affected_token_ids", affected_token_ids)
            if len(affected_token_ids) < 1:
                # print('len(affected_token_ids) < 1')
                continue
            if any(labels_for_single_input[j] != 'O' for j in affected_token_ids):
                # print('entity overlap! skipping')
                continue
            for j in affected_token_ids:
                labels_for_single_input[j] = 'I_' + tag
            labels_for_single_input[affected_token_ids[-1]] = 'L_' + tag
            labels_for_single_input[affected_token_ids[0]] = 'B_' + tag
        label_ids = [label2id[x] for x in labels_for_single_input]
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    # print(tokenized_inputs.keys())
    return tokenized_inputs
import json
data = []
with open('data.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))

l = []
for k, v in data[0].items():
    l.append({'text': k, 'spans': v})
train_set = [
    [
        x['text'],
        [{'start': y["start"], 'end': y["end"], 'tag': y["label"], 'text': y["ngram"]} for y in x['spans']]
    ] for x in l
]
## count labels in dataset
from collections import Counter
e = []
for x in train_set:
    for y in x[1]:
        e.append(y['tag'])
Counter(e).most_common()
## get label list
ori_label_list = []
for line in train_set:
    ori_label_list += [entity['tag'] for entity in line[1]]
ori_label_list = sorted(list(set(ori_label_list)))
label_list = []
for prefix in 'BIL':
    label_list += [prefix + '_' + x for x in ori_label_list]
label_list += ['O']
label_list = sorted(list(set(label_list)))
print(label_list)
print(len(label_list))
label2id = {n:i for i,n in enumerate(label_list)}
id2label= {str(i):n for i,n in enumerate(label_list)}
# id2label = {str(i): label for i, label in enumerate(label_names)}
# label2id = {v: k for k, v in id2label.items()}
train_examples ={'texts':[x[0] for x in train_set],'tag_names':[x[1] for x in train_set]}
train_examples = tokenize_and_align_labels(train_examples,label2id)
# train_examples = train_examples.map(tokenize_and_align_labels(label2id),batched=True)
print("here")
print(train_examples.keys())
print(len(train_examples['labels']))
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'])
# 775
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# collator=data_collator(train_examples)
# def compute_metrics(eval_preds):
# logits, labels = eval_preds
# predictions = np.argmax(logits, axis=-1)
#
# # Remove ignored index (special tokens) and convert to labels
# true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
# true_predictions = [
# [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
# for prediction, label in zip(predictions, labels)
# ]
# all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
# return {
# "precision": all_metrics["overall_precision"],
# "recall": all_metrics["overall_recall"],
# "f1": all_metrics["overall_f1"],
# "accuracy": all_metrics["overall_accuracy"],
# }
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint,id2label=id2label,label2id=label2id,)
print(model.config.num_labels)
args = TrainingArguments(
    "bert-finetuned-ner",
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_examples,
    # eval_dataset=train_examples,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
    tokenizer=tokenizer)
trainer.train()
ERROR
_name = "label" if "label" in features[0].keys() else "labels"
AttributeError: 'tokenizers.Encoding' object has no attribute 'keys'
I think the object tokenized_inputs that you create and return in tokenize_and_align_labels is likely to be a tokenizers.Encoding object, not a dict or Dataset object (check this by printing type(myobject) when in doubt), and therefore it won't have keys.
You should apply your Tokenizer to your examples using the map function of Dataset, as in this example from the documentation.
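For illustration, a minimal sketch (assuming the raw train_examples dict of 'texts'/'tag_names', plus the label2id and tokenize_and_align_labels objects from the question) of wrapping the examples in a datasets.Dataset and tokenizing through map:

from datasets import Dataset

# Build a Dataset from the plain dict of lists ('texts', 'tag_names').
train_ds = Dataset.from_dict(train_examples)

# Apply the question's function batch-wise; fn_kwargs passes the extra argument.
train_ds = train_ds.map(
    tokenize_and_align_labels,
    batched=True,
    fn_kwargs={"label2id": label2id},
    remove_columns=["texts", "tag_names"],  # drop the raw text columns afterwards
)

# train_ds (a Dataset, not a BatchEncoding) can then be passed to Trainer as train_dataset.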
I have tried to iterate in order to explore the Pearson correlation between 2 variables and 7 more variables.
My df:
df<-structure(list(d_peso1_v01 = structure(c(-8, -0.5, -13, -0.7,
-10.2, -9, -4.3, -1.6, 1.8, -11.3, -10.3, -4.6, 1.2, -2.8, -9.2
), format.spss = "F8.2", display_width = 13L), d_cintura1_v01 = structure(c(-6.5,
-3.5, -10, -2, -7, -3, -3, -4, -4.5, -9.5, -15.5, -3, 1, -4,
-12), format.spss = "F8.2", display_width = 16L), d_huglucagon_v01 = structure(c(-106.06,
NA, -75.38, 27.5, -325.38, -26.12, -104.26, 28.66, NA, -11.12,
-60.05, -76.38, -36.21, NA, -270.02), format.spss = "F8.2", display_width = 18L),
d_huinsulin_v01 = structure(c(-26.29, NA, -143.44, -410.55,
84.51, -121.56, -52.36, -151.83, NA, -42, -43.69, -82.96,
-51.27, NA, -163.12), format.spss = "F8.2", display_width = 17L),
d_huvisfatin_v01 = structure(c(-541.93, NA, -750.38, -611.9,
0, 139.61, -343.58, -149.2, NA, -91.54, -212.47, -844.05,
-353.86, NA, -1749.96), format.spss = "F8.2", display_width = 18L),
d_hupai1_v01 = structure(c(-785.4, NA, 115.96, -867.31, -10.84,
-1634, -331.21, 396.05, NA, -424.5, -143.09, 429.39, 799.11,
NA, -633.44), format.spss = "F8.2", display_width = 13L),
d_hucpeptide_v01 = structure(c(-189.33, NA, -612.6, -1250.86,
110.03, -614.69, -119.31, -305.55, NA, -104.55, -38.74, -411.38,
-65.48, NA, -143.75), format.spss = "F8.2", display_width = 18L),
d_huleptin_v01 = structure(c(-3145.34, NA, -5038.03, -2069.79,
-357.79, -1004.4, -1253.38, 365.69, NA, -2102.93, -1454.6,
-3380.95, -760.69, NA, -6078.46), format.spss = "F8.2", display_width = 16L),
d_hughrelin_v01 = structure(c(-290.46, NA, -898.76, -726.4,
-217.49, 41.13, 93.89, 436.93, NA, 12.85, -221.54, -134,
-200.15, NA, 261.3), format.spss = "F8.2", display_width = 18L),
d_hba1c_v01 = structure(c(0.02, NA, -0.26, -0.17, -1.05,
-0.41, -0.47, -0.21, NA, -0.14, -0.14, -0.43, 0.61, NA, -1.33
), format.spss = "F8.2", display_width = 13L), grupo_int_v00 = structure(c(2L,
1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L), .Label = c("A",
"B"), label = "Grupo de intervención", class = "factor")), class = "data.frame", row.names = c(NA,
-15L))
I have performed all the steps to do it for the whole database, but I want to subset according to grupo_int_v00, which is a factor differentiating the 2 treatments (A or B).
my_data <- dat[, c("d_peso1_v01", "d_cintura1_v01", "d_huglucagon_v01", "d_huinsulin_v01", "d_huvisfatin_v01", "d_hupai1_v01", "d_hucpeptide_v01", "d_huleptin_v01", "d_hughrelin_v01", "d_hba1c_v01", "grupo_int_v00")]
peso_pearson_01 <- lapply(my_data, function(x) {cor.test(x, my_data$d_peso1_v01, method = "pearson")})
cintura_pearson_01 <- lapply(my_data, function(x) {cor.test(x, my_data$d_cintura1_v01, method = "pearson")})
cintura_peso_01 <- do.call(c, list(peso_pearson_01, cintura_pearson_01))
max <- max(sapply(cintura_peso_01, length))
cintura_peso_01 <- do.call(rbind, lapply(cintura_peso_01, function(z) c(z, rep(NA, max-length(z)))))
How can I insert grupo_int_v00 in the syntax?
peso_pearson_01 <- lapply(my_data, function(x) {subset(dat$grupo_int_v00 == "A"), cor.test(x, my_data$d_peso1_v01, method = "pearson")})
Error: unexpected ',' in "lapply(my_data, function(x) {subset(dat$grupo_int_v00 == "A"),"
Thank you!
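One way to restrict the correlations to a single treatment group is to subset the data frame first and then run the same lapply on that subset; a minimal sketch (reusing the column names above):

# Sketch: keep only group A and drop the grouping factor before correlating.
my_data_A <- subset(my_data, grupo_int_v00 == "A", select = -grupo_int_v00)
peso_pearson_A <- lapply(my_data_A, function(x) {
  cor.test(x, my_data_A$d_peso1_v01, method = "pearson")
})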
I am using the logic below to calculate the average timedelta over a Python list of timestamps.
from datetime import datetime,timedelta
def entry_rate(entry_timestamps):
    entry_deltas = [d1 - d2 for d1,
                    d2 in zip(entry_timestamps[1:], entry_timestamps[:-1])]
    average_timedelta = (sum(entry_deltas, timedelta(
        0)) / len(entry_deltas)).total_seconds() if len(entry_deltas) != 0 else -1.0
    return average_timedelta
Here is the input to the function:
entry_timestamps = [datetime.datetime(2019, 11, 13, 7, 36, 21), datetime.datetime(2019, 11, 13, 7, 40, 53), datetime.datetime(2019, 11, 13, 7, 45, 25), datetime.datetime(2019, 11, 13, 7, 49, 58), datetime.datetime(2019, 11, 13, 7, 54, 30), datetime.datetime(2019, 11, 13, 7, 58, 32), datetime.datetime(2019, 11, 13, 8, 2, 34), datetime.datetime(2019, 11, 13, 8, 6, 36), datetime.datetime(2019, 11, 13, 8, 10, 38), datetime.datetime(2019, 11, 13, 8, 14, 40), datetime.datetime(2019, 11, 13, 8, 18, 42), datetime.datetime(2019, 11, 13, 8, 22, 44), datetime.datetime(2019, 11, 13, 8, 26, 46), datetime.datetime(2019, 11, 13, 8, 29, 18)]
However, I am getting the below error.
in prepare_llog(llog_pd)
    288
    289 result_df['entry_rate'] =\
--> 290     result_df['entry_timestamps'].apply(lambda x: compute_entry_rate(x))
    291
    292

~/miniconda3/envs/jupyter_21f8c25de0/lib/python3.6/site-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
   4043         else:
   4044             values = self.astype(object).values
-> 4045             mapped = lib.map_infer(values, f, convert=convert_dtype)
   4046
   4047         if len(mapped) and isinstance(mapped[0], Series):

pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()

in <lambda>(x)
    288
    289 result_df['entry_rate'] =\
--> 290     result_df['entry_timestamps'].apply(lambda x: compute_entry_rate(x))
    291
    292

in compute_entry_rate(entry_timestamps)
     49
     50 average_timedelta = (sum(entry_deltas, timedelta(
---> 51     0)) / len(entry_deltas)).total_seconds() if len(entry_deltas) != 0 else -1.0
     52     return average_timedelta
     53

TypeError: _() takes 1 positional argument but 2 were given
Can anyone please suggest how to fix this?
You can do this with pandas like this:
import datetime
import pandas as pd

S = pd.Series([datetime.datetime(2019, 11, 13, 7, 36, 21),
datetime.datetime(2019, 11, 13, 7, 40, 53),
datetime.datetime(2019, 11, 13, 7, 45, 25),
datetime.datetime(2019, 11, 13, 7, 49, 58),
datetime.datetime(2019, 11, 13, 7, 54, 30),
datetime.datetime(2019, 11, 13, 7, 58, 32),
datetime.datetime(2019, 11, 13, 8, 2, 34),
datetime.datetime(2019, 11, 13, 8, 6, 36),
datetime.datetime(2019, 11, 13, 8, 10, 38),
datetime.datetime(2019, 11, 13, 8, 14, 40),
datetime.datetime(2019, 11, 13, 8, 18, 42),
datetime.datetime(2019, 11, 13, 8, 22, 44),
datetime.datetime(2019, 11, 13, 8, 26, 46),
datetime.datetime(2019, 11, 13, 8, 29, 18)])
S.diff().mean().total_seconds()
Output:
244.384615384
Whereas your function entry_rate returns, without errors:
entry_rate(entry_timestamps)
Output:
244.384615
Timings:
Using entry_rate:
5.16 µs ± 193 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Using pandas functions:
389 µs ± 22.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)