Hello, I am facing the same issue and I can't seem to get past it.
Please help.
Here is my code.
# Load the daily bike-sharing data from the Excel workbook.
# NOTE(review): read_xlsx() is presumably readxl's reader, so library(readxl)
# must be attached before this line -- confirm.
bike_df <- read_xlsx("1657875746_day.xlsx")
head(bike_df, 5)
# attach(bike_df) was removed on purpose: it puts a *copy* of the columns on
# the search path, and that copy goes stale as soon as bike_df is modified
# below. Columns are always accessed through bike_df instead.
# Create new dataset excluding casual and registered variables
bike_df <- subset(bike_df, select = -c(casual, registered))
head(bike_df, 5)
# Dimension of dataset
dim(bike_df)
# Summary of the dataset
summary(bike_df)
# Structure of dataset
str(bike_df)
# Rename the columns.
# The new names are assigned by position, so this assumes bike_df has exactly
# these 14 columns in this order at this point -- verify against str() above.
names(bike_df) <- c(
  "rec_id", "datetime", "season", "year", "month", "holiday", "weekday",
  "workingday", "weather_condition", "temp", "atemp", "humidity",
  "windspeed", "total_count"
)
# Read the data
head(bike_df, 5)
# Typecast the datetime column to Date and the coded numeric attributes
# to categorical (factor) columns.
bike_df$datetime <- as.Date(bike_df$datetime)
# Columns that hold category codes rather than measurements.
factor_cols <- c(
  "year", "month", "season", "holiday", "weekday", "workingday",
  "weather_condition"
)
# Convert them all in one step instead of one assignment per column.
bike_df[factor_cols] <- lapply(bike_df[factor_cols], as.factor)
# Missing values in dataset: one row per column of bike_df, holding the
# number of NA entries in that column.
# vapply() iterates over the columns directly; apply() would first coerce the
# whole data frame (dates, factors, numbers) into a character matrix.
missing_val <- data.frame(
  missing_val = vapply(bike_df, function(x) sum(is.na(x)), integer(1))
)
missing_val
# Exploratory plots of total_count against the categorical attributes.
# Each ggplot() call is a top-level expression, so it prints when the script
# is run interactively.

# Column plot for season wise monthly distribution of counts
ggplot(bike_df, aes(x = month, y = total_count, fill = season)) +
  theme_bw() +
  geom_col() +
  labs(
    x = "Month", y = "Total_Count",
    title = "Season wise monthly distribution of counts"
  )

# Column plot for weekday wise monthly distribution of counts
ggplot(bike_df, aes(x = month, y = total_count, fill = weekday)) +
  theme_bw() +
  geom_col() +
  labs(
    x = "Month", y = "Total_Count",
    title = "Weekday wise monthly distribution of counts"
  )

# Violin plot for Yearly wise distribution of counts
ggplot(bike_df, aes(x = year, y = total_count, fill = year)) +
  geom_violin() +
  theme_bw() +
  labs(
    x = "Year", y = "Total_Count",
    title = "Yearly wise distribution of counts"
  )

# Column plot for holiday wise distribution of counts
ggplot(bike_df, aes(x = holiday, y = total_count, fill = season)) +
  geom_col() +
  theme_bw() +
  labs(
    x = "holiday", y = "Total_Count",
    title = "Holiday wise distribution of counts"
  )

# Column plot for workingday wise distribution of counts
ggplot(bike_df, aes(x = workingday, y = total_count, fill = season)) +
  geom_col() +
  theme_bw() +
  labs(
    x = "workingday", y = "Total_Count",
    title = "Workingday wise distribution of counts"
  )

# Column plot for weather_condition distribution of counts
ggplot(bike_df, aes(x = weather_condition, y = total_count, fill = season)) +
  geom_col() +
  theme_bw() +
  labs(
    x = "Weather_condition", y = "total_count",
    title = "Weather_condition distribution of counts"
  )
# Boxplot for total_count outliers.
# The outlier values found by boxplot.stats() are shown as the plot subtitle.
# collapse = ", " joins them into a single string; without it paste() returns
# one string per outlier and `sub` receives a multi-element vector.
par(mfrow = c(1, 1)) # single plot: 1 row, 1 column
boxplot(
  bike_df$total_count,
  main = "Total_count",
  sub = paste(boxplot.stats(bike_df$total_count)$out, collapse = ", ")
)
# Box plots for outliers, drawn on a 2 x 2 grid
par(mfrow = c(2, 2))
# Box plot for temp outliers
boxplot(
  bike_df$temp,
  main = "Temp",
  sub = paste(boxplot.stats(bike_df$temp)$out, collapse = ", ")
)
# Box plot for humidity outliers
boxplot(
  bike_df$humidity,
  main = "Humidity",
  sub = paste(boxplot.stats(bike_df$humidity)$out, collapse = ", ")
)
# Box plot for windspeed outliers
boxplot(
  bike_df$windspeed,
  main = "Windspeed",
  sub = paste(boxplot.stats(bike_df$windspeed)$out, collapse = ", ")
)
# Load the DMwR2 library
library(DMwR2)
# Create subset for windspeed and humidity variable
wind_hum <- subset(bike_df, select = c("windspeed", "humidity"))
# Column names of wind_hum
cnames <- colnames(wind_hum)
# Replace the outliers of each column with NA.
# Columns are extracted with [[i]] rather than [, i]: if bike_df is a tibble
# (which is what readxl returns), wind_hum[, i] is a one-column tibble instead
# of a numeric vector, and boxplot.stats() / %in% do not work on it.
# [[i]] returns the plain vector for both tibbles and base data frames.
for (i in cnames) {
  val <- boxplot.stats(wind_hum[[i]])$out # outlier values
  wind_hum[[i]][wind_hum[[i]] %in% val] <- NA # replace outliers with NA
}
This last piece of code doesn't seem to work.
Please suggest how to get past it.