Error message when joining two datasets

Hi there,
I’m an epidemiologist producing an epidemiological situation report on a MERS outbreak. At the data cleaning stage, I got an error message while joining two datasets. I can’t seem to figure out what’s wrong, even after consulting the help function in R and the Epi R Handbook.

Would be grateful for your assistance with this. Please see sample code below:

# Load packages
pacman::p_load(rio,      # import and export of dataset
               here,     # defining relative filepath
               janitor,  # data cleaning
               skimr,    # review imported dataset
               outbreaks,# to access dataset
               apyramid, # age-sex pyramid
               epikit,   # creating age categories
               datapasta, #recreating dataset
               reprex,   # creating reproducible examples
               lubridate,# cleaning of dates
               tidyverse # data manipulation, presentation
)

# Load/import data
# NOTE(review): called with only `package =` and no dataset name, `data()`
# appears to just list the data sets available in "outbreaks" rather than
# load one -- confirm intent. The data frames below are typed in by hand.
data(package = "outbreaks")

# Five MERS case records. Every dt_* column is entered as a character string
# in "YYYY-MM-DD" form, not as a Date.
mers_cases <- data.frame(
  stringsAsFactors = FALSE,
  id = c("SK_122", "SK_3", "SK_34", "SK_71", "SK_53"),
  dt_report = c("2015-06-10","2015-05-20",
                "2015-06-03","2015-06-07","2015-06-06"),
  dt_onset = c("2015-06-02","2015-05-20",
               "2015-05-20","2015-06-04","2015-05-29"),
  dt_death = c(NA, "2015-06-04", NA, NA, NA),
  dt_diag = c("2015-06-10","2015-05-21",
              "2015-06-04","2015-06-07","2015-06-06")
)

# Contact data: an integer column diff_dt_onset (presumably a difference in
# days -- confirm) and the exposure setting stored as a factor.
mers_contacts <-  data.frame(
  diff_dt_onset = c(15L, 22L, 3L, 14L, 14L),
  exposure = as.factor(c("Hospital room",
                         "Hospital room","Hospital room","Hospital room",
                         "Emergency room"))
)

# Data cleaning
# Rebuilds the case records inline (same values as mers_cases above), renames
# the dt_* columns to date_*, then tries to derive delay variables from them.
mers_clean <- data.frame(
  stringsAsFactors = FALSE,
  id = c("SK_122", "SK_3", "SK_34", "SK_71", "SK_53"),
  dt_report = c("2015-06-10","2015-05-20",
                "2015-06-03","2015-06-07","2015-06-06"),
  dt_onset = c("2015-06-02","2015-05-20",
               "2015-05-20","2015-06-04","2015-05-29"),
  dt_death = c(NA, "2015-06-04", NA, NA, NA),
  dt_diag = c("2015-06-10","2015-05-21",
              "2015-06-04","2015-06-07","2015-06-06")
) %>%  

  # rename columns (new_name = old_name)
  rename(
         date_onset = dt_onset,
         date_report = dt_report,
         date_diag = dt_diag,
         date_death = dt_death 
         ) %>% 
  
  # create new variables from the key dates.
  # The date_* columns are still character strings at this point, so `-`
  # fails with the "non-numeric argument to binary operator" error shown below.
  # Note: diff_onset and delay_rep are computed from the same expression.
  mutate(diff_onset = date_report - date_onset,
         delay_rep = date_report - date_onset,
         delay_diag = date_diag - date_onset,
         time_to_death = date_death - date_onset)
#> Error in `mutate()`:
#> ℹ In argument: `diff_onset = date_report - date_onset`.
#> Caused by error in `date_report - date_onset`:
#> ! non-numeric argument to binary operator
#> Backtrace:
#>      ▆
#>   1. ├─... %>% ...
#>   2. ├─dplyr::mutate(...)
#>   3. ├─dplyr:::mutate.data.frame(...)
#>   4. │ └─dplyr:::mutate_cols(.data, dplyr_quosures(...), by)
#>   5. │   ├─base::withCallingHandlers(...)
#>   6. │   └─dplyr:::mutate_col(dots[[i]], data, mask, new_columns)
#>   7. │     └─mask$eval_all_mutate(quo)
#>   8. │       └─dplyr (local) eval()
#>   9. └─base::.handleSimpleError(...)
#>  10.   └─dplyr (local) h(simpleError(msg, call))
#>  11.     └─rlang::abort(message, class = error_class, parent = parent, call = error_call)
         
  
# Joining contacts dataset
# Inner join of the cleaned cases to an inline copy of the contacts data,
# matching diff_onset (from mers_clean) to diff_dt_onset.
# The assignment to mers_clean above stopped with an error, so mers_clean was
# never created -- hence the "object 'mers_clean' not found" error below.

joined <-mers_clean %>%  # baseline dataset
         inner_join(data.frame(
           diff_dt_onset = c(15L, 22L, 3L, 14L, 14L),
           exposure = as.factor(c("Hospital room",
                                  "Hospital room","Hospital room","Hospital room",
                                  "Emergency room"))
         ), # dataset to be joined
                    by = c(diff_onset = "diff_dt_onset")) 
#> Error in eval(expr, envir, enclos): object 'mers_clean' not found

F.I

1 Like

Hello,

You cannot subtract your dates from one another because they are stored as character strings. You first need to convert them to the Date class; the lubridate package offers tools for this (for example `ymd()`). Once the columns are dates, the subtraction returns a time difference (`difftime`), which you then need to convert to numeric so that it matches the integer column in your contact data. Please see below for an example:

# loading packages
library(tidyverse)

# creating fake data
# Case records: one row per case. The dt_* columns are deliberately left as
# character strings ("YYYY-MM-DD") so that the cleaning step has to parse them.
mers_cases <- data.frame(
  id = c("SK_122", "SK_3", "SK_34", "SK_71", "SK_53"),
  dt_report = c("2015-06-10", "2015-05-20", "2015-06-03",
                "2015-06-07", "2015-06-06"),
  dt_onset = c("2015-06-02", "2015-05-20", "2015-05-20",
               "2015-06-04", "2015-05-29"),
  # only the second case has a recorded death date
  dt_death = c(NA, "2015-06-04", rep(NA, 3L)),
  dt_diag = c("2015-06-10", "2015-05-21", "2015-06-04",
              "2015-06-07", "2015-06-06"),
  stringsAsFactors = FALSE
)

# Contact records: an integer diff_dt_onset column plus the exposure setting
# as a factor (four "Hospital room" rows followed by one "Emergency room").
mers_contacts <- data.frame(
  diff_dt_onset = c(15L, 22L, 3L, 14L, 14L),
  exposure = factor(
    rep(c("Hospital room", "Emergency room"), times = c(4L, 1L))
  )
)

# cleaning data
# 1. rename the dt_* columns to date_*
# 2. parse every date column from character ("YYYY-MM-DD") to Date
# 3. derive the delays as plain numbers of days, so that diff_onset is numeric
#    like mers_contacts$diff_dt_onset and the two can be used as a join key
mers_cases_clean <- mers_cases |>
  rename(
    date_onset = dt_onset,
    date_report = dt_report,
    date_diag = dt_diag,
    date_death = dt_death
  ) |>
  mutate(
    # lubridate:: is spelled out so this also works where library(tidyverse)
    # does not attach lubridate (tidyverse releases before 2.0.0)
    across(.cols = contains("date"), .fns = lubridate::ymd),
    # Date - Date returns a difftime; as.numeric(units = "days") turns it into
    # a bare number of days
    diff_onset = as.numeric(date_report - date_onset, units = "days"),
    # same quantity as diff_onset; both names are kept because the original
    # question defines both
    delay_rep = as.numeric(date_report - date_onset, units = "days"),
    delay_diag = as.numeric(date_diag - date_onset, units = "days"),
    # NA for cases without a recorded death date
    time_to_death = as.numeric(date_death - date_onset, units = "days")
  )

# joining data
# Inner join: keeps only the case rows whose diff_onset equals a
# diff_dt_onset value in mers_contacts (one output row per matching pair).
joined <- inner_join(
  mers_cases_clean,
  mers_contacts,
  by = c("diff_onset" = "diff_dt_onset")
)

Created on 2024-03-09 with reprex v2.1.0

Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.3.1 (2023-06-16)
#>  os       macOS Ventura 13.6.3
#>  system   x86_64, darwin20
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       America/Toronto
#>  date     2024-03-09
#>  pandoc   3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version date (UTC) lib source
#>  cli           3.6.2   2023-12-11 [1] CRAN (R 4.3.0)
#>  colorspace    2.1-0   2023-01-23 [1] CRAN (R 4.3.0)
#>  digest        0.6.34  2024-01-11 [1] RSPM (R 4.3.0)
#>  dplyr       * 1.1.4   2023-11-17 [1] CRAN (R 4.3.0)
#>  evaluate      0.23    2023-11-01 [1] CRAN (R 4.3.0)
#>  fansi         1.0.6   2023-12-08 [1] CRAN (R 4.3.0)
#>  fastmap       1.1.1   2023-02-24 [1] CRAN (R 4.3.0)
#>  forcats     * 1.0.0   2023-01-29 [1] CRAN (R 4.3.0)
#>  fs            1.6.3   2023-07-20 [1] CRAN (R 4.3.0)
#>  generics      0.1.3   2022-07-05 [1] CRAN (R 4.3.0)
#>  ggplot2     * 3.5.0   2024-02-23 [1] RSPM (R 4.3.0)
#>  glue          1.7.0   2024-01-09 [1] RSPM (R 4.3.0)
#>  gtable        0.3.4   2023-08-21 [1] CRAN (R 4.3.0)
#>  hms           1.1.3   2023-03-21 [1] CRAN (R 4.3.0)
#>  htmltools     0.5.7   2023-11-03 [1] CRAN (R 4.3.0)
#>  knitr         1.45    2023-10-30 [1] CRAN (R 4.3.0)
#>  lifecycle     1.0.4   2023-11-07 [1] CRAN (R 4.3.0)
#>  lubridate   * 1.9.3   2023-09-27 [1] CRAN (R 4.3.0)
#>  magrittr      2.0.3   2022-03-30 [1] CRAN (R 4.3.0)
#>  munsell       0.5.0   2018-06-12 [1] CRAN (R 4.3.0)
#>  pillar        1.9.0   2023-03-22 [1] CRAN (R 4.3.0)
#>  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.3.0)
#>  purrr       * 1.0.2   2023-08-10 [1] CRAN (R 4.3.0)
#>  R.cache       0.16.0  2022-07-21 [1] CRAN (R 4.3.0)
#>  R.methodsS3   1.8.2   2022-06-13 [1] CRAN (R 4.3.0)
#>  R.oo          1.26.0  2024-01-24 [1] RSPM (R 4.3.0)
#>  R.utils       2.12.3  2023-11-18 [1] CRAN (R 4.3.0)
#>  R6            2.5.1   2021-08-19 [1] CRAN (R 4.3.0)
#>  readr       * 2.1.5   2024-01-10 [1] RSPM (R 4.3.0)
#>  reprex        2.1.0   2024-01-11 [1] RSPM (R 4.3.0)
#>  rlang         1.1.3   2024-01-10 [1] RSPM (R 4.3.0)
#>  rmarkdown     2.25    2023-09-18 [1] CRAN (R 4.3.0)
#>  rstudioapi    0.15.0  2023-07-07 [1] CRAN (R 4.3.0)
#>  scales        1.3.0   2023-11-28 [1] CRAN (R 4.3.0)
#>  sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.3.0)
#>  stringi       1.8.3   2023-12-11 [1] CRAN (R 4.3.0)
#>  stringr     * 1.5.1   2023-11-14 [1] CRAN (R 4.3.0)
#>  styler        1.10.2  2023-08-29 [1] CRAN (R 4.3.0)
#>  tibble      * 3.2.1   2023-03-20 [1] CRAN (R 4.3.0)
#>  tidyr       * 1.3.1   2024-01-24 [1] RSPM (R 4.3.0)
#>  tidyselect    1.2.0   2022-10-10 [1] CRAN (R 4.3.0)
#>  tidyverse   * 2.0.0   2023-02-22 [1] CRAN (R 4.3.0)
#>  timechange    0.3.0   2024-01-18 [1] RSPM (R 4.3.0)
#>  tzdb          0.4.0   2023-05-12 [1] CRAN (R 4.3.0)
#>  utf8          1.2.4   2023-10-22 [1] CRAN (R 4.3.0)
#>  vctrs         0.6.5   2023-12-01 [1] CRAN (R 4.3.0)
#>  withr         3.0.0   2024-01-16 [1] RSPM (R 4.3.0)
#>  xfun          0.42    2024-02-08 [1] RSPM (R 4.3.0)
#>  yaml          2.3.8   2023-12-11 [1] CRAN (R 4.3.0)
#> 
#>  [1] /Users/timothychisamore/Library/R/x86_64/4.3/library
#>  [2] /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────

All the best,

Tim

Hi Tim,

Ah, I missed that. So: convert the columns to Date class first, do the subtraction, and then convert the resulting differences to numeric so they match the contacts data before joining.

Thanks for your reply, this is very helpful.

Best wishes,
Friday

1 Like