Hello Community,
This is the code I’m working on. The real dataset has more than 20000 rows. so when I run this code it takes hours (My machine have 24GO of RAM). How can I arrange this to be faster ? Or is there any other code I can use ? Thank you
Aureol
# Load the packages to be used -----------------------------------------------
pacman::p_load (tidyverse,here,rio,reprex)
# Generate the data without covid database -----------------------
demo_data <- data.frame(
region = c("Adamaoua",
"Centre","Est",
"Extreme Nord",
"Littoral","Nord",
"Nord-Ouest","Ouest","Sud",
"Sud-Ouest",
"Adamaoua","Centre",
"Est","Extreme Nord",
"Littoral","Nord",
"Nord-Ouest",
"Ouest","Sud",
"Sud-Ouest","Adamaoua",
"Centre","Est",
"Extreme Nord","Littoral",
"Nord","Nord-Ouest",
"Ouest","Sud",
"Sud-Ouest",
"Adamaoua","Centre","Est",
"Extreme Nord",
"Littoral","Nord",
"Nord-Ouest","Ouest",
"Sud","Sud-Ouest",
"Adamaoua","Centre",
"Est",
"Extreme Nord","Littoral",
"Nord","Nord-Ouest",
"Ouest","Sud",
"Sud-Ouest"),
date = c("2021-01-02",
"2021-01-02",
"2021-01-02",
"2021-01-02","2021-01-02",
"2021-01-02",
"2021-01-02","2021-01-02",
"2021-01-02",
"2021-01-02","2021-01-03",
"2021-01-03",
"2021-01-03",
"2021-01-03","2021-01-03",
"2021-01-03",
"2021-01-03","2021-01-03",
"2021-01-03",
"2021-01-03","2021-01-04",
"2021-01-04",
"2021-01-04",
"2021-01-04","2021-01-04",
"2021-01-04",
"2021-01-04","2021-01-04", "2021-01-04",
"2021-01-04","2021-01-05", "2021-01-05",
"2021-01-05", "2021-01-05","2021-01-05",
"2021-01-05",
"2021-01-05","2021-01-05",
"2021-01-05",
"2021-01-05","2021-01-06",
"2021-01-06",
"2021-01-06",
"2021-01-06","2021-01-06",
"2021-01-06",
"2021-01-06","2021-01-06",
"2021-01-06",
"2021-01-06"),
population = c(1345934,
4846001.81701392,
1146981,4824522.05380686,
3987222,
2964767.81627007,
2244288.13462958,2113367,
818190,1862687,1345934,
4846001.81701392,
1146981,
4824522.05380686,3987222,
2964767.81627007,
2244288.13462958,2113367,
818190,1862687,
1518189,4965861,
1360451,4967788,4277464,
2996271,1868031,
2327807,894878,
1899941,1518189,
4965861,1360451,4967788,
4277464,2996271,
1868031,2327807,
894878,1899941,
1518189,4965861,1360451,
4967788,4277464,
2996271,1868031,
2327807,894878,
1899941),
cases = c(2,181,0,2,
20,10,0,0,0,0,0,
9,0,0,100,83,
0,0,110,0,2,107,
0,3,0,0,0,0,0,
0,0,82,3,8,20,
0,0,0,0,0,4,
105,0,1,63,16,0,
2,0,8),
deaths = c(0,20,0,1,0,
0,0,1,0,0,0,
0,0,0,0,0,0,0,
0,0,0,0,0,0,0,
0,0,0,0,0,0,
0,0,0,0,0,0,0,
0,0,0,0,0,0,0,
0,0,0,0,0)
)
## Function to compute the news cases and new deaths ---------------------------
for (i in c(1:nrow(demo_data))) {
# Create the covid_subset and make sure that the date are ordered
covid_subset <- demo_data[order(demo_data$date),]
covid_subset$new_cases <- covid_subset$cases[1]
covid_subset$new_deaths <- covid_subset$deaths[1]
# Compute the number of new cases and new deaths from the second row in the dataset
for (j in 2:nrow(covid_subset)) {
covid_subset$new_cases[j] = covid_subset$cases[j] - covid_subset$cases[j-1]
covid_subset$new_deaths[j] = covid_subset$deaths[j] - covid_subset$deaths[j-1]
}
# set negative new case or death counts to 0
covid_subset$new_cases[covid_subset$new_cases<0] <- 0
covid_subset$new_deaths[covid_subset$new_deaths<0] <- 0
# Fold into the main data set
demo_data$new_cases[i] <- covid_subset$new_cases[i]
demo_data$new_deaths[i] <- covid_subset$new_deaths[i]
}
Created on 2023-07-05 with reprex v2.0.2
Session info
sessionInfo()
#> R version 4.3.1 (2023-06-16 ucrt)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 11 x64 (build 22000)
#>
#> Matrix products: default
#>
#>
#> locale:
#> [1] LC_COLLATE=English_United States.utf8
#> [2] LC_CTYPE=English_United States.utf8
#> [3] LC_MONETARY=English_United States.utf8
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.utf8
#>
#> time zone: America/Los_Angeles
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] reprex_2.0.2 rio_0.5.29 here_1.0.1 lubridate_1.9.2
#> [5] forcats_1.0.0 stringr_1.5.0 dplyr_1.1.2 purrr_1.0.1
#> [9] readr_2.1.4 tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.2
#> [13] tidyverse_2.0.0
#>
#> loaded via a namespace (and not attached):
#> [1] utf8_1.2.3 generics_0.1.3 stringi_1.7.12 hms_1.1.3
#> [5] digest_0.6.32 magrittr_2.0.3 evaluate_0.21 grid_4.3.1
#> [9] timechange_0.2.0 fastmap_1.1.1 cellranger_1.1.0 rprojroot_2.0.3
#> [13] zip_2.3.0 fansi_1.0.4 scales_1.2.1 cli_3.6.1
#> [17] rlang_1.1.1 munsell_0.5.0 withr_2.5.0 yaml_2.3.7
#> [21] tools_4.3.1 tzdb_0.4.0 colorspace_2.1-0 pacman_0.5.1
#> [25] curl_5.0.1 vctrs_0.6.3 R6_2.5.1 lifecycle_1.0.3
#> [29] fs_1.6.2 foreign_0.8-84 pkgconfig_2.0.3 pillar_1.9.0
#> [33] openxlsx_4.2.5.2 gtable_0.3.3 glue_1.6.2 data.table_1.14.8
#> [37] Rcpp_1.0.10 haven_2.5.2 xfun_0.39 tidyselect_1.2.0
#> [41] rstudioapi_0.14 knitr_1.43 htmltools_0.5.5 rmarkdown_2.23
#> [45] compiler_4.3.1 readxl_1.4.2