Regression Model In R Programming
Sachin Sharma
10/7/2021
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(naniar)
library(dplyr)
library(datasets)
library(tinytex)
library(DT)
data("mtcars")
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
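Since the DT package is loaded above, the full data set can also be browsed as an interactive table in the knitted HTML (an optional sketch, not needed for the rest of the analysis):
# Optional: interactive, searchable table of the full mtcars data
datatable(mtcars, options = list(pageLength = 10))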
Transform certain variables into factors
mtcars$cyl <- factor(mtcars$cyl)
mtcars$am <- factor(mtcars$am,labels=c("Automatic","Manual"))
mtcars$vs <- factor(mtcars$vs)
mtcars$gear <- factor(mtcars$gear)
mtcars$carb <- factor(mtcars$carb)
boxplot(mpg ~ am, data = mtcars, col = (c("purple","red")), ylab = "Miles Per Gallon", xlab = "Type of Transmission", main = "MPG Vs AM")
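For reference, the same comparison can be drawn with ggplot2, which is already attached via the tidyverse (a sketch using the same variables):
# Equivalent boxplot of mpg by transmission type with ggplot2
ggplot(mtcars, aes(x = am, y = mpg, fill = am)) +
  geom_boxplot() +
  labs(x = "Type of Transmission", y = "Miles Per Gallon", title = "MPG Vs AM") +
  theme(legend.position = "none")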
aggregate(mpg~am, data = mtcars, mean)
## am mpg
## 1 Automatic 17.14737
## 2 Manual 24.39231
Difference of MPG between Automatic and Manual
24.39231 - 17.14737
## [1] 7.24494
Therefore, we can see that manual cars have an MPG roughly 7.245 higher than automatic cars.
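The same difference can be computed directly from the aggregated means rather than typed by hand (a small sketch using the aggregate() result above):
# Difference of group means computed from the aggregate() result
group_means <- aggregate(mpg ~ am, data = mtcars, mean)
diff(group_means$mpg)   # Manual minus Automatic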
We can now use a t-test to check whether this difference is statistically significant.
automatic_car <- mtcars[mtcars$am == "Automatic",]
manual_car <- mtcars[mtcars$am == "Manual",]
t.test(automatic_car$mpg, manual_car$mpg)
##
## Welch Two Sample t-test
##
## data: automatic_car$mpg and manual_car$mpg
## t = -3.7671, df = 18.332, p-value = 0.001374
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -11.280194 -3.209684
## sample estimates:
## mean of x mean of y
## 17.14737 24.39231
We can see that the p-value is 0.001374, so the difference is statistically significant. To quantify this with a regression model, we can use the following code:
model_1 <- lm(mpg ~ am, data = mtcars)
summary(model_1)
##
## Call:
## lm(formula = mpg ~ am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3923 -3.0923 -0.2974 3.2439 9.5077
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.147 1.125 15.247 1.13e-15 ***
## amManual 7.245 1.764 4.106 0.000285 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.902 on 30 degrees of freedom
## Multiple R-squared: 0.3598, Adjusted R-squared: 0.3385
## F-statistic: 16.86 on 1 and 30 DF, p-value: 0.000285
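The amManual coefficient (7.245) reproduces the difference in group means computed earlier. To attach an uncertainty interval to this estimate (a quick sketch):
# 95% confidence interval for the transmission effect in the simple model
confint(model_1, "amManual", level = 0.95)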
Let's use a correlation plot to check how the other variables correlate with mpg.
Before plotting, we will check the structure of the data:
df_1 <- subset(mtcars, select = c(mpg,cyl,disp,hp,drat,wt,qsec,vs))
head(df_1)
## mpg cyl disp hp drat wt qsec vs
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1
str(df_1)
## 'data.frame': 32 obs. of 8 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 2 ...
Here we can see that the cyl and vs columns are factors; we will now convert them to numeric so we can compute and plot the correlations.
# Convert the factor columns back to numeric; going via as.character() keeps the
# original values ("4", "6", "8") rather than the internal level codes
df_1$cyl <- as.character(df_1$cyl)
df_1$cyl <- as.numeric(df_1$cyl)
df_1$vs <- as.character(df_1$vs)
df_1$vs <- as.numeric(df_1$vs)
# Now we can check the structure of the data again
str(df_1)
## 'data.frame': 32 obs. of 8 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
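For reference, the same conversion could be written in a single mutate() call (a sketch equivalent to the as.character()/as.numeric() steps above, using dplyr, which is already attached):
# One-step alternative: convert the factor columns back to numeric
df_1 <- df_1 %>%
  mutate(across(c(cyl, vs), ~ as.numeric(as.character(.x))))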
Now that all the columns are numeric, we can plot the correlations with ggcorrplot and corrplot:
library(ggcorrplot)
r <- cor(df_1)
ggcorrplot(r,method = "circle", type = c("upper"), legend.title = "Corrplot MTCARS")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
library(corrplot)
## corrplot 0.90 loaded
r <- cor(df_1)
corrplot(r, method = "circle")
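To read the correlations with mpg directly, rather than judging circle sizes from the plots, the relevant row of the correlation matrix can be sorted (a quick sketch using the matrix r computed above):
# Correlations of each remaining variable with mpg, strongest first in absolute value
r_mpg <- r["mpg", colnames(r) != "mpg"]
r_mpg[order(abs(r_mpg), decreasing = TRUE)]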
model_2 <- lm(mpg~am + cyl + disp + hp + wt, data = mtcars)
anova(model_1, model_2)
## Analysis of Variance Table
##
## Model 1: mpg ~ am
## Model 2: mpg ~ am + cyl + disp + hp + wt
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 30 720.90
## 2 25 150.41 5 570.49 18.965 8.637e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Here the p-value is 8.637e-08, so model_2 is a significant improvement over the simple model_1.
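As an additional cross-check (a quick sketch, not part of the original comparison), information criteria point the same way; lower values indicate the better-fitting model:
# Compare the two models by AIC and BIC (lower is better)
AIC(model_1, model_2)
BIC(model_1, model_2)
Next, the standard residual diagnostic plots for model_2: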
par(mfrow = c(2,2))
plot(model_2)
Now we will check the summary of model_2:
summary(model_2)
##
## Call:
## lm(formula = mpg ~ am + cyl + disp + hp + wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9374 -1.3347 -0.3903 1.1910 5.0757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.864276 2.695416 12.564 2.67e-12 ***
## amManual 1.806099 1.421079 1.271 0.2155
## cyl6 -3.136067 1.469090 -2.135 0.0428 *
## cyl8 -2.717781 2.898149 -0.938 0.3573
## disp 0.004088 0.012767 0.320 0.7515
## hp -0.032480 0.013983 -2.323 0.0286 *
## wt -2.738695 1.175978 -2.329 0.0282 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.453 on 25 degrees of freedom
## Multiple R-squared: 0.8664, Adjusted R-squared: 0.8344
## F-statistic: 27.03 on 6 and 25 DF, p-value: 8.861e-10
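Once cyl, disp, hp and wt are adjusted for, the amManual coefficient drops from 7.245 to about 1.81 and is no longer statistically significant (p = 0.2155), so most of the apparent transmission effect is explained by the other variables. A confidence interval for the adjusted effect can be obtained the same way as before (a sketch):
# 95% confidence interval for the transmission effect after adjustment
confint(model_2, "amManual", level = 0.95)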