Group_by and summarize not working

Echo_Bajador · November 13, 2023, 3:37am

Hi,

I used the following script to generate (1) weekly no.of Rt-PCR tests and (2) weekly no.of Rt-PCR tests per 1,000 population per region. Unfortunately, I cannot figure out why I keep on encountering error for the object “weekly_counts” - particularly “error in summarize(., Weeklytest = sum(n_samples)) : argument “by” is missing, with no default” even if I have a “group_by” command prior to summarize.

pacman:: p_load(
rio,
here,
janitor,
lubridate,
tidyverse,
ISOweek,
ggplot2,
data.table
)

test ← import(here(“data”, “raw”, “sample_testing.csv”)) %>%
clean_names() %>%
mutate(report_date=mdy(report_date)) %>%
mutate(report_date = format(report_date, “%m-%d-%y”)) %>%
rename(
n_samples = daily_output_samples_tested,
date_report = report_date) %>%
select(region,
n_samples,
date_report,) %>%
mutate(date_report =as.Date(date_report, “%m-%d-%y”)) %>%
mutate(
date_week = floor_date(date_report, week_start = 1, unit = “week”),
week_report = week(date_week),
year_report = year(date_week)
)

weekly_counts ← test %>%
group_by(date_week) %>% #Error in summarize(., Weeklytest = sum(n_samples)) : argument “by” is missing, with no default
summarize(Weeklytest= sum(n_samples))

test1k ← weekly_counts %>%
mutate(“Tests per 1000 pop’n” = (n_samples/112892781)*1000) %>%
rename (“No.of RT-PCR samples tested” = n_samples)

I hope you could enlighten me.
Thank you.

Respectfully,
Echo

lnielsen · November 13, 2023, 2:29pm

Hello @Echo_Bajador, understanding the root cause of the error would be easier with a closer look at your data. Your syntax and function usage seem correct. The issue might be related to the date_week or n_samples variables. If possible, sharing a snippet of your dataset would greatly facilitate troubleshooting. You can do that by pasting the output of dput(head(test)) . However, if your data contains sensitive information, you can still help by pasting the output of str(test$date_week) and str(test$n_samples) . This will provide valuable insights without compromising privacy.

machupovirus · November 13, 2023, 2:39pm

Hello,

As @lnielsen stated, posing your question using the reprex package would be much more informative. There is a post on this forum providing further details on how to do so.

All the best,

Tim

Echo_Bajador · November 13, 2023, 11:28pm

Hi thank you, I tried to close the project and rerun everything, it worked(group_by and summarize), except now that I have a different problem with the ggplot.
The code runs but it is not “reading” correctly the x-axis dates (date_weeks) and y-axis tests per 1k pop’n (Tests1k).

Here is my script:

pacman:: p_load(
  rio,
  here,
  janitor,
  lubridate,
  tidyverse,
  ISOweek,
  ggplot2,
  data.table,
  reprex
)

test <-  import(here("data", "raw", "sample_testing.csv")) %>%
  clean_names() %>%
  mutate(report_date=mdy(report_date)) %>%
  mutate(report_date = format(report_date, "%m-%d-%y")) %>%
  rename(
n_samples = daily_output_samples_tested,
date_report = report_date) %>%
select(region,
n_samples,
date_report,) %>%
  mutate(date_report =as.Date(date_report, "%m-%d-%y")) %>%
mutate(
date_week = floor_date(date_report, week_start = 1, unit = "week"),
week_report = week(date_week),
year_report = year(date_week))

                    
#weekly tests                                        
weekly_counts <-  test %>%
group_by(date_week, week_report, year_report, region) %>% 
summarize(Weeklytest= sum(n_samples)) 


#test per 1k popn
test1k <-  weekly_counts %>%
  mutate("Tests1k" = (Weeklytest/112892781)*1000) %>%
  rename ("No.of RT-PCR samples tested" = Weeklytest) 

#rounded off test per 1k popn
xa <- (test1k["Tests1k"]) 
x1 <- round(xa, digits = 2)
testround <- bind_cols(test1k, x1) 



##problem with ggplot
ggplot(testround) +
  geom_area(
    aes(x = date_week, y = "Tests1k7"), fill="#3281B5") +
  facet_wrap(~region, nrow = 6, scales = "free_y")  +
  xlab("") + 
  ylab("Number of Rt-PCR tests conducted per 1k pop'n") +   
  scale_x_date(date_breaks = "1 month", 
               labels = function (x){
                 month_labels <- format(x, "%b")
                 year_labels <-  ifelse(format(x, "%m") == "06", format(x, "%Y"), "")
                 paste(month_labels, "\n", year_labels)
               }) + 
  theme_minimal() +
  theme(axis.text=element_text(size = 8),
        axis.text.x = element_text(vjust = 0.5, hjust=0.2), 
        axis.title= element_text(size = 10,face= "bold"))

dput(head(testround))
#> Error in eval(expr, envir, enclos): object 'testround' not found
structure(list(date_week = structure(c(18351, 18351, 18351, 18351, 
18351, 18358), class = "Date"), week_report = c(13L, 13L, 13L, 
13L, 13L, 14L), year_report = c(2020L, 2020L, 2020L, 2020L, 2020L, 
2020L), region = c("Cordillera Administrative Region (CAR)", 
"National Capital Region (NCR)", "Region VI: Western Visayas", 
"Region VII: Central Visayas", "Region XI: Davao Region", "Cordillera Administrative Region (CAR)"
), `No.of RT-PCR samples tested` = c(472L, 3692L, 385L, 348L, 
388L, 1030L), Tests1k...6 = c(0.00418095821379403, 0.0327035968756939, 
0.00341031549218369, 0.00308257088644136, 0.00343688937913577, 
0.00912370118688103), Tests1k...7 = c(0, 0.03, 0, 0, 0, 0.01)), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), groups = structure(list(
    date_week = structure(c(18351, 18358), class = "Date"), week_report = 13:14, 
    year_report = c(2020L, 2020L), .rows = structure(list(1:5, 
        6L), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", 
    "list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-2L), .drop = TRUE))
#>    date_week week_report year_report                                 region
#> 1 2020-03-30          13        2020 Cordillera Administrative Region (CAR)
#> 2 2020-03-30          13        2020          National Capital Region (NCR)
#> 3 2020-03-30          13        2020             Region VI: Western Visayas
#> 4 2020-03-30          13        2020            Region VII: Central Visayas
#> 5 2020-03-30          13        2020                Region XI: Davao Region
#> 6 2020-04-06          14        2020 Cordillera Administrative Region (CAR)
#>   No.of RT-PCR samples tested Tests1k...6 Tests1k...7
#> 1                         472 0.004180958        0.00
#> 2                        3692 0.032703597        0.03
#> 3                         385 0.003410315        0.00
#> 4                         348 0.003082571        0.00
#> 5                         388 0.003436889        0.00
#> 6                        1030 0.009123701        0.01

Thank you very much.

Regards,
Echo

lnielsen · November 13, 2023, 11:55pm

Thanks for sharing your data.
It seems that the problem with your ggplot code is on this line

geom_area(
    aes(x = date_week, y = "Tests1k7"), fill="#3281B5")

When referring to variables inside aes(), it’s essential not to enclose them in quotation marks.

Please attempt the following modification and let me know if it resolves the issue:

geom_area(
    aes(x = date_week, y = Tests1k7), fill="#3281B5")

Feel free to reach out if you encounter any further difficulties.

Echo_Bajador · November 14, 2023, 3:07am

Oh the speech marks hehe. It works.
Thank you.