Time to respond very long when I'm using forloop

aureolngako · July 5, 2023, 4:13pm

Hello Community,
This is the code I’m working on. The real dataset has more than 20000 rows. so when I run this code it takes hours (My machine have 24GO of RAM). How can I arrange this to be faster ? Or is there any other code I can use ? Thank you
Aureol

# Load the packages to be used -----------------------------------------------

pacman::p_load (tidyverse,here,rio,reprex)



# Generate the data without covid database -----------------------


demo_data <-   data.frame(
  region = c("Adamaoua",
             "Centre","Est",
             "Extreme Nord",
             "Littoral","Nord",
             "Nord-Ouest","Ouest","Sud",
             "Sud-Ouest",
             "Adamaoua","Centre",
             "Est","Extreme Nord",
             "Littoral","Nord",
             "Nord-Ouest",
             "Ouest","Sud",
             "Sud-Ouest","Adamaoua",
             "Centre","Est",
             "Extreme Nord","Littoral",
             "Nord","Nord-Ouest",
             "Ouest","Sud",
             "Sud-Ouest",
             "Adamaoua","Centre","Est",
             "Extreme Nord",
             "Littoral","Nord",
             "Nord-Ouest","Ouest",
             "Sud","Sud-Ouest",
             "Adamaoua","Centre",
             "Est",
             "Extreme Nord","Littoral",
             "Nord","Nord-Ouest",
             "Ouest","Sud",
             "Sud-Ouest"),
  date = c("2021-01-02",
           "2021-01-02",
           "2021-01-02",
           "2021-01-02","2021-01-02",
           "2021-01-02",
           "2021-01-02","2021-01-02",
           "2021-01-02",
           "2021-01-02","2021-01-03",
           "2021-01-03",
           "2021-01-03",
           "2021-01-03","2021-01-03",
           "2021-01-03",
           "2021-01-03","2021-01-03",
           "2021-01-03",
           "2021-01-03","2021-01-04",
           "2021-01-04",
           "2021-01-04",
           "2021-01-04","2021-01-04",
           "2021-01-04",
           "2021-01-04","2021-01-04",                                                                                                 "2021-01-04",
           "2021-01-04","2021-01-05",                                                                                                        "2021-01-05",
           "2021-01-05",                                                                                                        "2021-01-05","2021-01-05",
           "2021-01-05",
           "2021-01-05","2021-01-05",
           "2021-01-05",
           "2021-01-05","2021-01-06",
           "2021-01-06",
           "2021-01-06",
           "2021-01-06","2021-01-06",
           "2021-01-06",
           "2021-01-06","2021-01-06",
           "2021-01-06",
           "2021-01-06"),
  population = c(1345934,
                 4846001.81701392,
                 1146981,4824522.05380686,
                 3987222,
                 2964767.81627007,
                 2244288.13462958,2113367,
                 818190,1862687,1345934,
                 4846001.81701392,
                 1146981,
                 4824522.05380686,3987222,
                 2964767.81627007,
                 2244288.13462958,2113367,
                 818190,1862687,
                 1518189,4965861,
                 1360451,4967788,4277464,
                 2996271,1868031,
                 2327807,894878,
                 1899941,1518189,
                 4965861,1360451,4967788,
                 4277464,2996271,
                 1868031,2327807,
                 894878,1899941,
                 1518189,4965861,1360451,
                 4967788,4277464,
                 2996271,1868031,
                 2327807,894878,
                 1899941),
  cases = c(2,181,0,2,
            20,10,0,0,0,0,0,
            9,0,0,100,83,
            0,0,110,0,2,107,
            0,3,0,0,0,0,0,
            0,0,82,3,8,20,
            0,0,0,0,0,4,
            105,0,1,63,16,0,
            2,0,8),
  deaths = c(0,20,0,1,0,
             0,0,1,0,0,0,
             0,0,0,0,0,0,0,
             0,0,0,0,0,0,0,
             0,0,0,0,0,0,
             0,0,0,0,0,0,0,
             0,0,0,0,0,0,0,
             0,0,0,0,0)
)


## Function to compute the news cases and new deaths ---------------------------

for (i in c(1:nrow(demo_data))) {
  # Create the covid_subset and make sure that the date are ordered
  
  covid_subset <- demo_data[order(demo_data$date),]
  
  covid_subset$new_cases <- covid_subset$cases[1] 
  covid_subset$new_deaths <- covid_subset$deaths[1]
  
  # Compute the number of new cases and new deaths from the second row in the dataset
  
  for (j in 2:nrow(covid_subset)) {
    covid_subset$new_cases[j] = covid_subset$cases[j] - covid_subset$cases[j-1]
    covid_subset$new_deaths[j] = covid_subset$deaths[j] - covid_subset$deaths[j-1]
  }
  
  # set negative new case or death counts to 0 
  
  covid_subset$new_cases[covid_subset$new_cases<0] <-  0
  covid_subset$new_deaths[covid_subset$new_deaths<0] <-  0
  
  # Fold into the main data set 
  
  demo_data$new_cases[i]   <-  covid_subset$new_cases[i]
  demo_data$new_deaths[i] <-  covid_subset$new_deaths[i] 
  
}

^{Created on 2023-07-05 with reprex v2.0.2}

Session info

sessionInfo()
#> R version 4.3.1 (2023-06-16 ucrt)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 11 x64 (build 22000)
#> 
#> Matrix products: default
#> 
#> 
#> locale:
#> [1] LC_COLLATE=English_United States.utf8 
#> [2] LC_CTYPE=English_United States.utf8   
#> [3] LC_MONETARY=English_United States.utf8
#> [4] LC_NUMERIC=C                          
#> [5] LC_TIME=English_United States.utf8    
#> 
#> time zone: America/Los_Angeles
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] reprex_2.0.2    rio_0.5.29      here_1.0.1      lubridate_1.9.2
#>  [5] forcats_1.0.0   stringr_1.5.0   dplyr_1.1.2     purrr_1.0.1    
#>  [9] readr_2.1.4     tidyr_1.3.0     tibble_3.2.1    ggplot2_3.4.2  
#> [13] tidyverse_2.0.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] utf8_1.2.3        generics_0.1.3    stringi_1.7.12    hms_1.1.3        
#>  [5] digest_0.6.32     magrittr_2.0.3    evaluate_0.21     grid_4.3.1       
#>  [9] timechange_0.2.0  fastmap_1.1.1     cellranger_1.1.0  rprojroot_2.0.3  
#> [13] zip_2.3.0         fansi_1.0.4       scales_1.2.1      cli_3.6.1        
#> [17] rlang_1.1.1       munsell_0.5.0     withr_2.5.0       yaml_2.3.7       
#> [21] tools_4.3.1       tzdb_0.4.0        colorspace_2.1-0  pacman_0.5.1     
#> [25] curl_5.0.1        vctrs_0.6.3       R6_2.5.1          lifecycle_1.0.3  
#> [29] fs_1.6.2          foreign_0.8-84    pkgconfig_2.0.3   pillar_1.9.0     
#> [33] openxlsx_4.2.5.2  gtable_0.3.3      glue_1.6.2        data.table_1.14.8
#> [37] Rcpp_1.0.10       haven_2.5.2       xfun_0.39         tidyselect_1.2.0 
#> [41] rstudioapi_0.14   knitr_1.43        htmltools_0.5.5   rmarkdown_2.23   
#> [45] compiler_4.3.1    readxl_1.4.2

machupovirus · July 5, 2023, 10:23pm

aureolngako:

demo_data <-   data.frame(
  region = c("Adamaoua",
             "Centre","Est",
             "Extreme Nord",
             "Littoral","Nord",
             "Nord-Ouest","Ouest","Sud",
             "Sud-Ouest",
             "Adamaoua","Centre",
             "Est","Extreme Nord",
             "Littoral","Nord",
             "Nord-Ouest",
             "Ouest","Sud",
             "Sud-Ouest","Adamaoua",
             "Centre","Est",
             "Extreme Nord","Littoral",
             "Nord","Nord-Ouest",
             "Ouest","Sud",
             "Sud-Ouest",
             "Adamaoua","Centre","Est",
             "Extreme Nord",
             "Littoral","Nord",
             "Nord-Ouest","Ouest",
             "Sud","Sud-Ouest",
             "Adamaoua","Centre",
             "Est",
             "Extreme Nord","Littoral",
             "Nord","Nord-Ouest",
             "Ouest","Sud",
             "Sud-Ouest"),
  date = c("2021-01-02",
           "2021-01-02",
           "2021-01-02",
           "2021-01-02","2021-01-02",
           "2021-01-02",
           "2021-01-02","2021-01-02",
           "2021-01-02",
           "2021-01-02","2021-01-03",
           "2021-01-03",
           "2021-01-03",
           "2021-01-03","2021-01-03",
           "2021-01-03",
           "2021-01-03","2021-01-03",
           "2021-01-03",
           "2021-01-03","2021-01-04",
           "2021-01-04",
           "2021-01-04",
           "2021-01-04","2021-01-04",
           "2021-01-04",
           "2021-01-04","2021-01-04",                                                                                                 "2021-01-04",
           "2021-01-04","2021-01-05",                                                                                                        "2021-01-05",
           "2021-01-05",                                                                                                        "2021-01-05","2021-01-05",
           "2021-01-05",
           "2021-01-05","2021-01-05",
           "2021-01-05",
           "2021-01-05","2021-01-06",
           "2021-01-06",
           "2021-01-06",
           "2021-01-06","2021-01-06",
           "2021-01-06",
           "2021-01-06","2021-01-06",
           "2021-01-06",
           "2021-01-06"),
  population = c(1345934,
                 4846001.81701392,
                 1146981,4824522.05380686,
                 3987222,
                 2964767.81627007,
                 2244288.13462958,2113367,
                 818190,1862687,1345934,
                 4846001.81701392,
                 1146981,
                 4824522.05380686,3987222,
                 2964767.81627007,
                 2244288.13462958,2113367,
                 818190,1862687,
                 1518189,4965861,
                 1360451,4967788,4277464,
                 2996271,1868031,
                 2327807,894878,
                 1899941,1518189,
                 4965861,1360451,4967788,
                 4277464,2996271,
                 1868031,2327807,
                 894878,1899941,
                 1518189,4965861,1360451,
                 4967788,4277464,
                 2996271,1868031,
                 2327807,894878,
                 1899941),
  cases = c(2,181,0,2,
            20,10,0,0,0,0,0,
            9,0,0,100,83,
            0,0,110,0,2,107,
            0,3,0,0,0,0,0,
            0,0,82,3,8,20,
            0,0,0,0,0,4,
            105,0,1,63,16,0,
            2,0,8),
  deaths = c(0,20,0,1,0,
             0,0,1,0,0,0,
             0,0,0,0,0,0,0,
             0,0,0,0,0,0,0,
             0,0,0,0,0,0,
             0,0,0,0,0,0,0,
             0,0,0,0,0,0,0,
             0,0,0,0,0)
)

Hello,

Do you think you could briefly explain what you are trying to do with the data? I see you’ve provided some fake data and code which is great, but knowing a bit more about the problem itself would help when coming up with potential alternative approaches.

All the best,

Tim

aureolngako · July 8, 2023, 12:27pm

Thank you Tim. I’m trying to calculate new COVID-19 cases and deaths for a particular day, based on the previous day cases. The original database was unable to capture new cases and deaths by days.

machupovirus · July 8, 2023, 3:10pm

Hello,

In your data it appears that the cumulative counts are presented by date and location, however, you only sort by date in your code. This would mean that when you take differences it won’t necessarily be differencing the cumulative counts in the same location on successive days. Please let me know if I am understanding the problem correctly and I can provide some code based on how I would solve things.

All the best,

Tim

aureolngako · July 25, 2023, 10:50am

The cumulative total is only calculated by date here (National level), but the code written for calculation by location and date worked well. The function I used works well for a small data set, but takes longer to process for a large data set.
Thank you

machupovirus · July 26, 2023, 11:22pm

Hi,

The code you provided is differencing the cumulative counts by date only, however, you should be differencing by region and date if you are interested in deriving the incident counts by date and region.

Here is an example of how I would achieve this:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tibble)

demo_data <- data.frame(
  region = c(
    "Adamaoua",
    "Centre",
    "Est",
    "Extreme Nord",
    "Littoral",
    "Nord",
    "Nord-Ouest",
    "Ouest",
    "Sud",
    "Sud-Ouest",
    "Adamaoua",
    "Centre",
    "Est",
    "Extreme Nord",
    "Littoral",
    "Nord",
    "Nord-Ouest",
    "Ouest",
    "Sud",
    "Sud-Ouest",
    "Adamaoua",
    "Centre",
    "Est",
    "Extreme Nord",
    "Littoral",
    "Nord",
    "Nord-Ouest",
    "Ouest",
    "Sud",
    "Sud-Ouest",
    "Adamaoua",
    "Centre",
    "Est",
    "Extreme Nord",
    "Littoral",
    "Nord",
    "Nord-Ouest",
    "Ouest",
    "Sud",
    "Sud-Ouest",
    "Adamaoua",
    "Centre",
    "Est",
    "Extreme Nord",
    "Littoral",
    "Nord",
    "Nord-Ouest",
    "Ouest",
    "Sud",
    "Sud-Ouest"
  ),
  date = c(
    "2021-01-02",
    "2021-01-02",
    "2021-01-02",
    "2021-01-02",
    "2021-01-02",
    "2021-01-02",
    "2021-01-02",
    "2021-01-02",
    "2021-01-02",
    "2021-01-02",
    "2021-01-03",
    "2021-01-03",
    "2021-01-03",
    "2021-01-03",
    "2021-01-03",
    "2021-01-03",
    "2021-01-03",
    "2021-01-03",
    "2021-01-03",
    "2021-01-03",
    "2021-01-04",
    "2021-01-04",
    "2021-01-04",
    "2021-01-04",
    "2021-01-04",
    "2021-01-04",
    "2021-01-04",
    "2021-01-04",
    "2021-01-04",
    "2021-01-04",
    "2021-01-05",
    "2021-01-05",
    "2021-01-05",
    "2021-01-05",
    "2021-01-05",
    "2021-01-05",
    "2021-01-05",
    "2021-01-05",
    "2021-01-05",
    "2021-01-05",
    "2021-01-06",
    "2021-01-06",
    "2021-01-06",
    "2021-01-06",
    "2021-01-06",
    "2021-01-06",
    "2021-01-06",
    "2021-01-06",
    "2021-01-06",
    "2021-01-06"
  ),
  population = c(
    1345934,
    4846001.81701392,
    1146981,
    4824522.05380686,
    3987222,
    2964767.81627007,
    2244288.13462958,
    2113367,
    818190,
    1862687,
    1345934,
    4846001.81701392,
    1146981,
    4824522.05380686,
    3987222,
    2964767.81627007,
    2244288.13462958,
    2113367,
    818190,
    1862687,
    1518189,
    4965861,
    1360451,
    4967788,
    4277464,
    2996271,
    1868031,
    2327807,
    894878,
    1899941,
    1518189,
    4965861,
    1360451,
    4967788,
    4277464,
    2996271,
    1868031,
    2327807,
    894878,
    1899941,
    1518189,
    4965861,
    1360451,
    4967788,
    4277464,
    2996271,
    1868031,
    2327807,
    894878,
    1899941
  ),
  cases = c(
    2,
    181,
    0,
    2,
    20,
    10,
    0,
    0,
    0,
    0,
    0,
    9,
    0,
    0,
    100,
    83,
    0,
    0,
    110,
    0,
    2,
    107,
    0,
    3,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    82,
    3,
    8,
    20,
    0,
    0,
    0,
    0,
    0,
    4,
    105,
    0,
    1,
    63,
    16,
    0,
    2,
    0,
    8
  ),
  deaths = c(
    0,
    20,
    0,
    1,
    0,
    0,
    0,
    1,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0
  )
)

demo_data |>
  mutate(
    # differencing cases and deaths
    diff_cases = cases - lag(x = cases, n = 1L, default = 0, order_by = date),
    diff_deaths = deaths - lag(x = deaths, n = 1L, default = 0, order_by = date),
    # making negative values 0
    diff_cases = if_else(diff_cases < 0, 0, diff_cases),
    diff_deaths = if_else(diff_deaths < 0, 0, diff_deaths)
  )
#>          region       date population cases deaths diff_cases diff_deaths
#> 1      Adamaoua 2021-01-02    1345934     2      0          2           0
#> 2        Centre 2021-01-02    4846002   181     20        179          20
#> 3           Est 2021-01-02    1146981     0      0          0           0
#> 4  Extreme Nord 2021-01-02    4824522     2      1          2           1
#> 5      Littoral 2021-01-02    3987222    20      0         18           0
#> 6          Nord 2021-01-02    2964768    10      0          0           0
#> 7    Nord-Ouest 2021-01-02    2244288     0      0          0           0
#> 8         Ouest 2021-01-02    2113367     0      1          0           1
#> 9           Sud 2021-01-02     818190     0      0          0           0
#> 10    Sud-Ouest 2021-01-02    1862687     0      0          0           0
#> 11     Adamaoua 2021-01-03    1345934     0      0          0           0
#> 12       Centre 2021-01-03    4846002     9      0          9           0
#> 13          Est 2021-01-03    1146981     0      0          0           0
#> 14 Extreme Nord 2021-01-03    4824522     0      0          0           0
#> 15     Littoral 2021-01-03    3987222   100      0        100           0
#> 16         Nord 2021-01-03    2964768    83      0          0           0
#> 17   Nord-Ouest 2021-01-03    2244288     0      0          0           0
#> 18        Ouest 2021-01-03    2113367     0      0          0           0
#> 19          Sud 2021-01-03     818190   110      0        110           0
#> 20    Sud-Ouest 2021-01-03    1862687     0      0          0           0
#> 21     Adamaoua 2021-01-04    1518189     2      0          2           0
#> 22       Centre 2021-01-04    4965861   107      0        105           0
#> 23          Est 2021-01-04    1360451     0      0          0           0
#> 24 Extreme Nord 2021-01-04    4967788     3      0          3           0
#> 25     Littoral 2021-01-04    4277464     0      0          0           0
#> 26         Nord 2021-01-04    2996271     0      0          0           0
#> 27   Nord-Ouest 2021-01-04    1868031     0      0          0           0
#> 28        Ouest 2021-01-04    2327807     0      0          0           0
#> 29          Sud 2021-01-04     894878     0      0          0           0
#> 30    Sud-Ouest 2021-01-04    1899941     0      0          0           0
#> 31     Adamaoua 2021-01-05    1518189     0      0          0           0
#> 32       Centre 2021-01-05    4965861    82      0         82           0
#> 33          Est 2021-01-05    1360451     3      0          0           0
#> 34 Extreme Nord 2021-01-05    4967788     8      0          5           0
#> 35     Littoral 2021-01-05    4277464    20      0         12           0
#> 36         Nord 2021-01-05    2996271     0      0          0           0
#> 37   Nord-Ouest 2021-01-05    1868031     0      0          0           0
#> 38        Ouest 2021-01-05    2327807     0      0          0           0
#> 39          Sud 2021-01-05     894878     0      0          0           0
#> 40    Sud-Ouest 2021-01-05    1899941     0      0          0           0
#> 41     Adamaoua 2021-01-06    1518189     4      0          4           0
#> 42       Centre 2021-01-06    4965861   105      0        101           0
#> 43          Est 2021-01-06    1360451     0      0          0           0
#> 44 Extreme Nord 2021-01-06    4967788     1      0          1           0
#> 45     Littoral 2021-01-06    4277464    63      0         62           0
#> 46         Nord 2021-01-06    2996271    16      0          0           0
#> 47   Nord-Ouest 2021-01-06    1868031     0      0          0           0
#> 48        Ouest 2021-01-06    2327807     2      0          2           0
#> 49          Sud 2021-01-06     894878     0      0          0           0
#> 50    Sud-Ouest 2021-01-06    1899941     8      0          8           0

^{Created on 2023-07-26 with reprex v2.0.2}

Session info

sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-apple-darwin20 (64-bit)
#> Running under: macOS Ventura 13.4.1
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> time zone: America/Toronto
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] tibble_3.2.1 dplyr_1.1.2 
#> 
#> loaded via a namespace (and not attached):
#>  [1] vctrs_0.6.3       cli_3.6.1         knitr_1.43        rlang_1.1.1      
#>  [5] xfun_0.39         purrr_1.0.1       styler_1.10.1     generics_0.1.3   
#>  [9] glue_1.6.2        htmltools_0.5.5   fansi_1.0.4       rmarkdown_2.23   
#> [13] R.cache_0.16.0    evaluate_0.21     fastmap_1.1.1     yaml_2.3.7       
#> [17] lifecycle_1.0.3   compiler_4.3.1    fs_1.6.3          pkgconfig_2.0.3  
#> [21] rstudioapi_0.15.0 R.oo_1.25.0       R.utils_2.12.2    digest_0.6.33    
#> [25] R6_2.5.1          tidyselect_1.2.0  utf8_1.2.3        reprex_2.0.2     
#> [29] pillar_1.9.0      magrittr_2.0.3    R.methodsS3_1.8.2 tools_4.3.1      
#> [33] withr_2.5.0

However, I am somewhat confused by your data as it seems like you do not have cumulative counts (they are not monotonically increasing) and you are ignoring region - that being said, this code agrees with the results from your code.

All the best,

Tim