Predicting-age-of-abalone-using-Multiple-Linear-Regression.R

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(ggplot2))
#install.packages('corrplot')
suppressPackageStartupMessages(library(corrplot))
#install.packages('moments')
suppressPackageStartupMessages(library(moments))
#install.packages('GGally')
suppressPackageStartupMessages(library(GGally))
#install.packages('faraway')
suppressPackageStartupMessages(library(faraway))
#install.packages('olsrr')
suppressPackageStartupMessages(library(olsrr))
#install.packages('lmtest')
suppressPackageStartupMessages(library(MASS))
#install.packages('MASS')
#install.packages('ggfortify')
suppressPackageStartupMessages(library(ggfortify))
#install.packages('broom')
suppressPackageStartupMessages(library(broom))
#install.packages('jtools')
#install.packages('huxtable')
suppressPackageStartupMessages(library(huxtable))
suppressPackageStartupMessages(library(jtools))
#install.packages('cowplot')
suppressPackageStartupMessages(library(cowplot))






abalone <- readr::read_csv('C:/personal files/data analytics/docs/git proj/abalone age prediction/Data-Science-Project/abalone.csv', show_col_types = FALSE)


head(abalone)

sex	length	diameter	height	whole_wt	shucked_wt	viscera_wt	shell_wt	rings	age
M	0.455	0.365	0.095	0.514	0.224	0.101	0.15	15	16.5
M	0.35	0.265	0.09	0.226	0.0995	0.0485	0.07	7	8.5
F	0.53	0.42	0.135	0.677	0.257	0.141	0.21	9	10.5
M	0.44	0.365	0.125	0.516	0.215	0.114	0.155	10	11.5
I	0.33	0.255	0.08	0.205	0.0895	0.0395	0.055	7	8.5
I	0.425	0.3	0.095	0.351	0.141	0.0775	0.12	8	9.5

str(abalone)

## spec_tbl_df [4,177 x 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ sex       : chr [1:4177] "M" "M" "F" "M" ...
##  $ length    : num [1:4177] 0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
##  $ diameter  : num [1:4177] 0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
##  $ height    : num [1:4177] 0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
##  $ whole_wt  : num [1:4177] 0.514 0.226 0.677 0.516 0.205 ...
##  $ shucked_wt: num [1:4177] 0.2245 0.0995 0.2565 0.2155 0.0895 ...
##  $ viscera_wt: num [1:4177] 0.101 0.0485 0.1415 0.114 0.0395 ...
##  $ shell_wt  : num [1:4177] 0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
##  $ rings     : num [1:4177] 15 7 9 10 7 8 20 16 9 19 ...
##  $ age       : num [1:4177] 16.5 8.5 10.5 11.5 8.5 9.5 21.5 17.5 10.5 20.5 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   sex = col_character(),
##   ..   length = col_double(),
##   ..   diameter = col_double(),
##   ..   height = col_double(),
##   ..   whole_wt = col_double(),
##   ..   shucked_wt = col_double(),
##   ..   viscera_wt = col_double(),
##   ..   shell_wt = col_double(),
##   ..   rings = col_double(),
##   ..   age = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

dim(abalone)

## [1] 4177   10

##Convering sex variable to factor type
abalone$sex = as.factor(abalone$sex)
glimpse(abalone)

## Rows: 4,177
## Columns: 10
## $ sex        <fct> M, M, F, M, I, I, F, F, M, F, F, M, M, F, F, M, I, F, M, M,~
## $ length     <dbl> 0.455, 0.350, 0.530, 0.440, 0.330, 0.425, 0.530, 0.545, 0.4~
## $ diameter   <dbl> 0.365, 0.265, 0.420, 0.365, 0.255, 0.300, 0.415, 0.425, 0.3~
## $ height     <dbl> 0.095, 0.090, 0.135, 0.125, 0.080, 0.095, 0.150, 0.125, 0.1~
## $ whole_wt   <dbl> 0.5140, 0.2255, 0.6770, 0.5160, 0.2050, 0.3515, 0.7775, 0.7~
## $ shucked_wt <dbl> 0.2245, 0.0995, 0.2565, 0.2155, 0.0895, 0.1410, 0.2370, 0.2~
## $ viscera_wt <dbl> 0.1010, 0.0485, 0.1415, 0.1140, 0.0395, 0.0775, 0.1415, 0.1~
## $ shell_wt   <dbl> 0.150, 0.070, 0.210, 0.155, 0.055, 0.120, 0.330, 0.260, 0.1~
## $ rings      <dbl> 15, 7, 9, 10, 7, 8, 20, 16, 9, 19, 14, 10, 11, 10, 10, 12, ~
## $ age        <dbl> 16.5, 8.5, 10.5, 11.5, 8.5, 9.5, 21.5, 17.5, 10.5, 20.5, 15~

summary(abalone)

##  sex          length         diameter          height          whole_wt     
##  F:1307   Min.   :0.075   Min.   :0.0550   Min.   :0.0000   Min.   :0.0020  
##  I:1342   1st Qu.:0.450   1st Qu.:0.3500   1st Qu.:0.1150   1st Qu.:0.4415  
##  M:1528   Median :0.545   Median :0.4250   Median :0.1400   Median :0.7995  
##           Mean   :0.524   Mean   :0.4079   Mean   :0.1395   Mean   :0.8287  
##           3rd Qu.:0.615   3rd Qu.:0.4800   3rd Qu.:0.1650   3rd Qu.:1.1530  
##           Max.   :0.815   Max.   :0.6500   Max.   :1.1300   Max.   :2.8255  
##    shucked_wt       viscera_wt        shell_wt          rings       
##  Min.   :0.0010   Min.   :0.0005   Min.   :0.0015   Min.   : 1.000  
##  1st Qu.:0.1860   1st Qu.:0.0935   1st Qu.:0.1300   1st Qu.: 8.000  
##  Median :0.3360   Median :0.1710   Median :0.2340   Median : 9.000  
##  Mean   :0.3594   Mean   :0.1806   Mean   :0.2388   Mean   : 9.934  
##  3rd Qu.:0.5020   3rd Qu.:0.2530   3rd Qu.:0.3290   3rd Qu.:11.000  
##  Max.   :1.4880   Max.   :0.7600   Max.   :1.0050   Max.   :29.000  
##       age       
##  Min.   : 2.50  
##  1st Qu.: 9.50  
##  Median :10.50  
##  Mean   :11.43  
##  3rd Qu.:12.50  
##  Max.   :30.50

# seperating categorical variable
abl_cat =abalone %>% 
dplyr::select(sex)  %>% 
 group_by(sex)


#analysing proportion of data on the basis of sex
sex_prop =abl_cat %>%
summarise(count_n=n())%>%
mutate(prop.= paste0(round(count_n/sum(count_n)*100 , 2) , "%" ))
print(sex_prop)

## # A tibble: 3 x 3
##   sex   count_n prop. 
##   <fct>   <int> <chr> 
## 1 F        1307 31.29%
## 2 I        1342 32.13%
## 3 M        1528 36.58%

# This is a chart, switch to the DataCamp editor to view and configure it.


missing_data= function(x) {
    name_var= c()
    missing_values = c()
    for (i in 1: x ){
     name_var[i]=(names(abalone[i]))
     missing_values[i] =(sum(is.na(abalone[i])))
     }
  return(data.frame(name_var, missing_values ))
    
}

as.data.frame(lapply((ncol(abalone)) , missing_data))

name_var	missing_values
sex	0
length	0
diameter	0
height	0
whole_wt	0
shucked_wt	0
viscera_wt	0
shell_wt	0
rings	0
age	0

dup_data= sum(duplicated(abalone))
paste0("There are " ,dup_data , " " , "full duplicates in the dataset")

## [1] "There are 0 full duplicates in the dataset"

ggplot(stack(abalone), aes(x = ind, y = values , color= ind) )+
  geom_boxplot()+
  labs(title ="Boxplot")

## Warning in stack.data.frame(abalone): non-vector columns will be ignored

##standarization of data.
std_num_abl= function(y) {
    
    standarization =(y - mean(y)) / sd(y)
    return(standarization)
}

abl_num=as.data.frame(lapply(abalone[, -1],std_num_abl ))
ggplot(stack(abl_num), aes(x = ind, y = values , color= ind) )+
  geom_boxplot()+
  labs(title ="Boxplot")

abl_sum=as.data.frame(sapply(abalone[,-1], summary))
abl_sum

length	diameter	height	whole_wt	shucked_wt	viscera_wt	shell_wt	rings	age
0.075	0.055	0	0.002	0.001	0.0005	0.0015	1	2.5
0.45	0.35	0.115	0.442	0.186	0.0935	0.13	8	9.5
0.545	0.425	0.14	0.799	0.336	0.171	0.234	9	10.5
0.524	0.408	0.14	0.829	0.359	0.181	0.239	9.93	11.4
0.615	0.48	0.165	1.15	0.502	0.253	0.329	11	12.5
0.815	0.65	1.13	2.83	1.49	0.76	1	29	30.5

abl_skew = as.data.frame(sapply (abalone[,-1], skewness))
abl_kur = as.data.frame (sapply(abalone[,-1], kurtosis))
abl_var = as.data.frame(sapply(abalone[,-1] , var))

as.data.frame(cbind(var = abl_var, skew= abl_skew , abl_kur))

sapply(abalone[, -1], var)	sapply(abalone[, -1], skewness)	sapply(abalone[, -1], kurtosis)
0.0144	-0.64	3.06
0.00985	-0.609	2.95
0.00175	3.13	78.9
0.24	0.531	2.97
0.0493	0.719	3.59
0.012	0.592	3.08
0.0194	0.621	3.53
10.4	1.11	5.33
10.4	1.11	5.33

#REMOVING OUTLIER 
abl_numerical = as.data.frame(abalone[,-1])

v= list()

abl_outliers=for (i in 1:ncol(abl_num)){
    
     name_out= (names(abl_numerical))[i]
    abliout = boxplot.stats(abl_numerical[,i])$out
    v[[(names(abl_numerical))[i]]]=  boxplot.stats(abl_numerical[,i])$out
    len_out = length(abliout)
    print(c(name_out ,paste("number of outliers" , len_out)))
    print(sort(abliout))

}

## [1] "length"                "number of outliers 49"
##  [1] 0.075 0.110 0.130 0.130 0.135 0.140 0.140 0.150 0.155 0.155 0.155 0.160
## [13] 0.160 0.160 0.160 0.165 0.165 0.165 0.165 0.165 0.170 0.170 0.170 0.175
## [25] 0.175 0.175 0.175 0.175 0.180 0.180 0.180 0.180 0.185 0.185 0.185 0.185
## [37] 0.190 0.190 0.190 0.190 0.195 0.195 0.195 0.200 0.200 0.200 0.200 0.200
## [49] 0.200
## [1] "diameter"              "number of outliers 59"
##  [1] 0.055 0.090 0.095 0.100 0.100 0.105 0.105 0.105 0.105 0.110 0.110 0.110
## [13] 0.110 0.115 0.115 0.120 0.120 0.120 0.120 0.120 0.125 0.125 0.125 0.125
## [25] 0.125 0.125 0.125 0.130 0.130 0.130 0.130 0.130 0.130 0.130 0.130 0.135
## [37] 0.135 0.135 0.135 0.135 0.140 0.140 0.140 0.145 0.145 0.145 0.145 0.145
## [49] 0.150 0.150 0.150 0.150 0.150 0.150 0.150 0.150 0.150 0.150 0.150
## [1] "height"                "number of outliers 29"
##  [1] 0.000 0.000 0.010 0.015 0.015 0.020 0.020 0.025 0.025 0.025 0.025 0.025
## [13] 0.030 0.030 0.030 0.030 0.030 0.030 0.035 0.035 0.035 0.035 0.035 0.035
## [25] 0.250 0.250 0.250 0.515 1.130
## [1] "whole_wt"              "number of outliers 30"
##  [1] 2.2205 2.2260 2.2305 2.2355 2.2360 2.2385 2.2500 2.2550 2.2635 2.2695
## [11] 2.2730 2.3020 2.3235 2.3305 2.3330 2.3810 2.3810 2.3980 2.4925 2.4990
## [21] 2.5050 2.5085 2.5155 2.5260 2.5480 2.5500 2.5550 2.6570 2.7795 2.8255
## [1] "shucked_wt"            "number of outliers 48"
##  [1] 0.9815 0.9840 0.9895 0.9915 0.9925 0.9955 1.0050 1.0070 1.0120 1.0135
## [11] 1.0170 1.0260 1.0265 1.0300 1.0465 1.0465 1.0515 1.0615 1.0685 1.0705
## [21] 1.0715 1.0745 1.0815 1.0830 1.0950 1.1055 1.1075 1.1090 1.1155 1.1280
## [31] 1.1330 1.1335 1.1455 1.1455 1.1465 1.1495 1.1565 1.1705 1.1945 1.1965
## [41] 1.2320 1.2395 1.2395 1.2455 1.2530 1.3485 1.3510 1.4880
## [1] "viscera_wt"            "number of outliers 26"
##  [1] 0.4925 0.4985 0.5005 0.5090 0.5115 0.5120 0.5130 0.5145 0.5185 0.5190
## [11] 0.5195 0.5195 0.5225 0.5235 0.5250 0.5260 0.5265 0.5410 0.5410 0.5500
## [21] 0.5640 0.5745 0.5750 0.5900 0.6415 0.7600
## [1] "shell_wt"              "number of outliers 35"
##  [1] 0.6300 0.6350 0.6380 0.6420 0.6430 0.6460 0.6500 0.6550 0.6570 0.6585
## [11] 0.6600 0.6650 0.6650 0.6745 0.6750 0.6785 0.6850 0.6855 0.6900 0.7000
## [21] 0.7100 0.7100 0.7250 0.7250 0.7250 0.7260 0.7600 0.7800 0.7975 0.8150
## [31] 0.8500 0.8850 0.8850 0.8970 1.0050
## [1] "rings"                  "number of outliers 278"
##   [1]  1  2  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3 16 16 16 16 16 16 16 16
##  [26] 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
##  [51] 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
##  [76] 16 16 16 16 16 16 16 16 16 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17
## [101] 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17
## [126] 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 18 18 18 18 18 18 18 18
## [151] 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18
## [176] 18 18 18 18 18 18 18 18 18 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19
## [201] 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 20 20 20 20 20 20 20 20 20
## [226] 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 21 21 21 21 21 21 21 21
## [251] 21 21 21 21 21 21 22 22 22 22 22 22 23 23 23 23 23 23 23 23 23 24 24 25 26
## [276] 27 27 29
## [1] "age"                    "number of outliers 278"
##   [1]  2.5  3.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5
##  [16]  4.5  4.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5
##  [31] 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5
##  [46] 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5
##  [61] 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5
##  [76] 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 18.5 18.5 18.5 18.5 18.5 18.5
##  [91] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5
## [106] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5
## [121] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5
## [136] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5
## [151] 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5
## [166] 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5
## [181] 19.5 19.5 19.5 19.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5
## [196] 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5
## [211] 20.5 20.5 20.5 20.5 20.5 20.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5
## [226] 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5
## [241] 21.5 21.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5
## [256] 22.5 23.5 23.5 23.5 23.5 23.5 23.5 24.5 24.5 24.5 24.5 24.5 24.5 24.5 24.5
## [271] 24.5 25.5 25.5 26.5 27.5 28.5 28.5 30.5

abl_num_noout= abalone %>% filter( !length  %in% v$length , !diameter  %in% v$diameter ,!height  %in% v$height ,
                         !whole_wt %in% v$whole_wt , !shell_wt %in% v$shell_wt ,!shucked_wt %in% v$shucked_wt ,
                         !viscera_wt %in% v$viscera_wt, !rings %in% v$rings)


head(abl_num_noout)

sex	length	diameter	height	whole_wt	shucked_wt	viscera_wt	shell_wt	rings	age
M	0.455	0.365	0.095	0.514	0.224	0.101	0.15	15	16.5
M	0.35	0.265	0.09	0.226	0.0995	0.0485	0.07	7	8.5
F	0.53	0.42	0.135	0.677	0.257	0.141	0.21	9	10.5
M	0.44	0.365	0.125	0.516	0.215	0.114	0.155	10	11.5
I	0.33	0.255	0.08	0.205	0.0895	0.0395	0.055	7	8.5
I	0.425	0.3	0.095	0.351	0.141	0.0775	0.12	8	9.5

paste("Total no of rows with outliers removed" ,dim(abl_numerical)[1]-dim(abl_num_noout)[1] , "hence dimension of cleaned dataset ", dim(abl_num_noout)[1] ,"x", dim(abl_num_noout)[2])

## [1] "Total no of rows with outliers removed 396 hence dimension of cleaned dataset  3781 x 10"

std_num_abl= function(y) {
    
    standarization =(y - mean(y)) / sd(y)
    return(standarization)
}

 


x= as.data.frame(lapply(abl_num_noout[,-1] ,std_num_abl ))

abl.cs = cbind(abl_num_noout[,1],x  )

summary(abl.cs)

##  sex          length           diameter           height        
##  F:1160   Min.   :-2.8340   Min.   :-2.7093   Min.   :-2.75671  
##  I:1261   1st Qu.:-0.6370   1st Qu.:-0.6501   1st Qu.:-0.77397  
##  M:1360   Median : 0.1252   Median : 0.1627   Median : 0.07578  
##           Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##           3rd Qu.: 0.7977   3rd Qu.: 0.7588   3rd Qu.: 0.78390  
##           Max.   : 2.1428   Max.   : 2.1135   Max.   : 2.90826  
##     whole_wt          shucked_wt        viscera_wt          shell_wt       
##  Min.   :-1.68568   Min.   :-1.6240   Min.   :-1.71107   Min.   :-1.73484  
##  1st Qu.:-0.80756   1st Qu.:-0.8181   1st Qu.:-0.82282   1st Qu.:-0.82296  
##  Median :-0.05762   Median :-0.1007   Median :-0.09743   Median :-0.04949  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.73167   3rd Qu.: 0.7101   3rd Qu.: 0.69212   3rd Qu.: 0.71991  
##  Max.   : 3.00287   Max.   : 3.0098   Max.   : 3.13973   Max.   : 3.24792  
##      rings              age         
##  Min.   :-2.3303   Min.   :-2.3303  
##  1st Qu.:-0.6139   1st Qu.:-0.6139  
##  Median :-0.1848   Median :-0.1848  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6734   3rd Qu.: 0.6734  
##  Max.   : 2.3899   Max.   : 2.3899

GGally::ggpairs(abl.cs, aes( color = sex ,alpha = 0.8 ), title = "Pairs plot for abalone dataset"  )+ ggplot2::theme_grey(base_size = 2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

abalone_corr = abl.cs[,c(-1, -9)]

corrplot(cor(abalone_corr) ,type = "lower" ,main="\nCorrelation matrix" , addCoef.col = 'red', number.cex=1.0 , 
        , tl.srt = 45)

par(mfrow = c(2, 2 ))
age_wholewt = ggplot (abl.cs , aes(whole_wt , age , colour = sex))+
geom_point()+geom_smooth (method  = "lm")+
labs(title =  "Relation between age vs whole_wt") 

age_shuckedwt = ggplot (abl.cs , aes(shucked_wt , age , colour = sex))+
geom_point()+geom_smooth (method  = "lm")+
labs(title =  "Relation between age vs shucked_wt")

age_shellwt = ggplot (abl.cs , aes(shell_wt , age , colour = sex))+
geom_point()+geom_smooth (method  = "lm")+
labs(title =  "Relation between age vs shell_wt")

plot_grid(age_wholewt , age_shellwt , age_shuckedwt , labels ="AUTO" )

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

set.seed(90)
#Splitting data
abl.cs = abl.cs  %>%  
mutate (split = sample(c(0,1) , size = nrow(abl.cs) , replace =TRUE, prob = c(0.30 , 0.70 )))
str(abl.cs)

## 'data.frame':    3781 obs. of  11 variables:
##  $ sex       : Factor w/ 3 levels "F","I","M": 3 3 1 3 2 2 3 1 3 3 ...
##  $ length    : num  -0.5922 -1.5338 0.0803 -0.7267 -1.7131 ...
##  $ diameter  : num  -0.433 -1.517 0.163 -0.433 -1.626 ...
##  $ height    : num  -1.1988 -1.3405 -0.0658 -0.3491 -1.6237 ...
##  $ whole_wt  : num  -0.625 -1.274 -0.259 -0.621 -1.32 ...
##  $ shucked_wt: num  -0.604 -1.219 -0.447 -0.649 -1.268 ...
##  $ viscera_wt: num  -0.719 -1.237 -0.319 -0.591 -1.326 ...
##  $ shell_wt  : num  -0.619 -1.271 -0.131 -0.579 -1.393 ...
##  $ rings     : num  2.39 -1.043 -0.185 0.244 -1.043 ...
##  $ age       : num  2.39 -1.043 -0.185 0.244 -1.043 ...
##  $ split     : num  1 0 1 0 1 1 1 1 0 1 ...

train  = subset(abl.cs , split== 1)
test = subset (abl.cs , split == 0)

ggplot(abl.cs , aes(as.factor(split) , fill= sex ))+
geom_bar(start = "identity" ,position = "dodge" )+

theme_minimal()+
ggtitle("Splitting data into test and train")+
xlab("Split dataset")

## Warning: Ignoring unknown parameters: start

# The OLS we are aiming for is :age of abalone which is determined by the count of rings hence we can say that age is dependent on the no of rings which is 1.5 * rings . Hence we can excluse the age variable for now in the model and consider as ring as explanatory variable i.e Regressing Rings~ sex + length + diameter + height +whole_wt+  vicera_wt + shucked_wt + shell_wt


ols1 = lm(rings ~ sex + length , data = train)
ols2 = lm(rings ~ sex +length + diameter, data = train)
ols3 = lm(rings ~ sex + length + diameter + height, data = train)
ols4 = lm(rings ~ sex + length + diameter +height + whole_wt, data = train)
ols5 = lm(rings ~ sex + length + diameter + height +whole_wt + shucked_wt, data = train)
ols6 = lm(rings ~ sex + length + diameter + height +whole_wt + shucked_wt, data = train)
ols7 = lm(rings ~ sex + length + diameter + height +whole_wt + shucked_wt + viscera_wt , data = train)
ols8 = lm(rings ~ sex + length + diameter + height +whole_wt + shucked_wt +viscera_wt + shell_wt, data = train)

export_summs(ols1 , ols2 , ols3 , ols4 , ols5 , ols6 , ols7 , ols8)

	Model 1	Model 2	Model 3	Model 4	Model 5	Model 6	Model 7	Model 8
(Intercept)	0.18 ***	0.16 ***	0.13 ***	0.14 ***	0.09 ***	0.09 ***	0.10 ***	0.10 ***
	(0.03)	(0.03)	(0.03)	(0.03)	(0.03)	(0.03)	(0.03)	(0.03)
sexI	-0.50 ***	-0.45 ***	-0.37 ***	-0.40 ***	-0.32 ***	-0.32 ***	-0.33 ***	-0.34 ***
	(0.04)	(0.04)	(0.04)	(0.04)	(0.04)	(0.04)	(0.04)	(0.04)
sexM	-0.05	-0.04	-0.02	-0.01	0.03	0.03	0.03	0.03
	(0.04)	(0.04)	(0.04)	(0.04)	(0.03)	(0.03)	(0.03)	(0.03)
length	0.47 ***	-0.21 *	-0.33 ***	-0.20 *	-0.04	-0.04	-0.01	0.02
	(0.02)	(0.09)	(0.09)	(0.09)	(0.09)	(0.09)	(0.09)	(0.09)
diameter		0.70 ***	0.50 ***	0.57 ***	0.47 ***	0.47 ***	0.44 ***	0.36 ***
		(0.09)	(0.09)	(0.09)	(0.09)	(0.09)	(0.09)	(0.09)
height			0.37 ***	0.44 ***	0.31 ***	0.31 ***	0.31 ***	0.27 ***
			(0.04)	(0.04)	(0.03)	(0.03)	(0.03)	(0.04)
whole_wt				-0.28 ***	1.01 ***	1.01 ***	1.24 ***	0.68 ***
				(0.05)	(0.08)	(0.08)	(0.10)	(0.13)
shucked_wt					-1.25 ***	-1.25 ***	-1.29 ***	-1.08 ***
					(0.06)	(0.06)	(0.06)	(0.07)
viscera_wt							-0.21 ***	-0.13 *
							(0.05)	(0.05)
shell_wt								0.37 ***
								(0.06)
N	2661	2661	2661	2661	2661	2661	2661	2661
R2	0.38	0.39	0.42	0.42	0.50	0.50	0.51	0.51
* p < 0.001; p < 0.01; * p < 0.05.

sum_model =export_summs(ols1 , ols2 , ols3 , ols4 , ols5 , ols6 , ols7 , ols8)

R =sum_model  %>% filter(names ==c("R2" ) )  
Rsq=as.numeric(R)

## Warning: NAs introduced by coercion

Rsq=na.omit(as.numeric(R))

## Warning in na.omit(as.numeric(R)): NAs introduced by coercion

sum_model_col=colnames(sum_model)
r2_vs_model=data.frame(x1=sum_model_col[-1] , y1 =Rsq)
r2_vs_model

x1	y1
Model 1	0.379
Model 2	0.392
Model 3	0.416
Model 4	0.424
Model 5	0.502
Model 6	0.502
Model 7	0.505
Model 8	0.512

#viz
ggplot(r2_vs_model , aes (x1 , y1, colour= "red" , group = 1))+geom_point() +geom_line() +labs(title =" Rsq vs ADDITIVE MODELS" ,x = "Models" , y= "R^2")

## lets see a summary of selected model
summary(ols8)

## 
## Call:
## lm(formula = rings ~ sex + length + diameter + height + whole_wt + 
##     shucked_wt + viscera_wt + shell_wt, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78920 -0.47274 -0.09323  0.36280  2.99538 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.09893    0.02587   3.824 0.000134 ***
## sexI        -0.33772    0.04067  -8.305  < 2e-16 ***
## sexM         0.02968    0.03357   0.884 0.376739    
## length       0.01643    0.08582   0.191 0.848190    
## diameter     0.36022    0.08665   4.157 3.32e-05 ***
## height       0.27265    0.03539   7.705 1.84e-14 ***
## whole_wt     0.68289    0.13447   5.079 4.07e-07 ***
## shucked_wt  -1.07726    0.07153 -15.060  < 2e-16 ***
## viscera_wt  -0.12958    0.05450  -2.378 0.017494 *  
## shell_wt     0.37409    0.06275   5.962 2.82e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7001 on 2651 degrees of freedom
## Multiple R-squared:  0.5116, Adjusted R-squared:  0.5099 
## F-statistic: 308.5 on 9 and 2651 DF,  p-value: < 2.2e-16

#VIF (variance of inflation) test
as.matrix(faraway::vif(ols8))

##                 [,1]
## sexI        1.991842
## sexM        1.407564
## length     39.153577
## diameter   39.936756
## height      6.613850
## whole_wt   95.417651
## shucked_wt 26.854677
## viscera_wt 15.718910
## shell_wt   20.879258

#check for variablity in highly correlated variable .
ols_correlations(ols8)

Zero-order	Partial	Part
-0.472	-0.159	-0.113
0.206	0.0172	0.012
0.586	0.00372	0.0026
0.602	0.0805	0.0564
0.618	0.148	0.105
0.56	0.0982	0.0689
0.467	-0.281	-0.204
0.546	-0.0461	-0.0323
0.623	0.115	0.0809

##model without whole_wt variable
ols8_model1 =lm(rings ~ sex + length + diameter + height + shucked_wt +viscera_wt + shell_wt, data = train)
summary(ols8_model1)

## 
## Call:
## lm(formula = rings ~ sex + length + diameter + height + shucked_wt + 
##     viscera_wt + shell_wt, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.73098 -0.47329 -0.08827  0.36729  3.00320 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.10440    0.02597   4.020 5.98e-05 ***
## sexI        -0.35081    0.04077  -8.604  < 2e-16 ***
## sexM         0.02578    0.03371   0.765    0.445    
## length       0.02751    0.08619   0.319    0.750    
## diameter     0.36192    0.08705   4.158 3.32e-05 ***
## height       0.27593    0.03555   7.763 1.18e-14 ***
## shucked_wt  -0.78225    0.04194 -18.652  < 2e-16 ***
## viscera_wt   0.03362    0.04422   0.760    0.447    
## shell_wt     0.59630    0.04518  13.198  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7033 on 2652 degrees of freedom
## Multiple R-squared:  0.5068, Adjusted R-squared:  0.5053 
## F-statistic: 340.7 on 8 and 2652 DF,  p-value: < 2.2e-16

####Variance inflation factor of the additive model without the Whole_weight
as.matrix(faraway::vif(ols8_model1))

##                 [,1]
## sexI        1.983840
## sexM        1.406829
## length     39.128250
## diameter   39.936157
## height      6.611648
## shucked_wt  9.145049
## viscera_wt 10.253948
## shell_wt   10.725643

ols_correlations(ols8_model1)

Zero-order	Partial	Part
-0.472	-0.165	-0.117
0.206	0.0148	0.0104
0.586	0.0062	0.00435
0.602	0.0805	0.0567
0.618	0.149	0.106
0.467	-0.341	-0.254
0.546	0.0148	0.0104
0.623	0.248	0.18

##model without whoe_wt and diameter variable
ols8_model2 =lm(rings ~ sex + length +height + shucked_wt +viscera_wt + shell_wt, data = train)

####Variance inflation factor of the additive model without the Whole_weight and diameter
as.matrix(faraway::vif(ols8_model2))

##                 [,1]
## sexI        1.961260
## sexM        1.405048
## length     10.215753
## height      6.530751
## shucked_wt  9.137632
## viscera_wt 10.212159
## shell_wt   10.252933

summary(ols8_model2)

## 
## Call:
## lm(formula = rings ~ sex + length + height + shucked_wt + viscera_wt + 
##     shell_wt, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7335 -0.4753 -0.1042  0.3650  3.0309 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.11230    0.02598   4.323 1.60e-05 ***
## sexI        -0.36889    0.04066  -9.072  < 2e-16 ***
## sexM         0.02079    0.03380   0.615    0.538    
## length       0.33555    0.04417   7.596 4.21e-14 ***
## height       0.29228    0.03544   8.248 2.51e-16 ***
## shucked_wt  -0.77729    0.04205 -18.485  < 2e-16 ***
## viscera_wt   0.02188    0.04427   0.494    0.621    
## shell_wt     0.63574    0.04431  14.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7055 on 2653 degrees of freedom
## Multiple R-squared:  0.5036, Adjusted R-squared:  0.5023 
## F-statistic: 384.5 on 7 and 2653 DF,  p-value: < 2.2e-16

anova(ols8_model2 , ols8)

Res.Df	RSS	Df	Sum of Sq	F	Pr(>F)
2.65e+03	1.32e+03
2.65e+03	1.3e+03	2	21.2	21.6	4.86e-10

summary (ols8_model2)

## 
## Call:
## lm(formula = rings ~ sex + length + height + shucked_wt + viscera_wt + 
##     shell_wt, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7335 -0.4753 -0.1042  0.3650  3.0309 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.11230    0.02598   4.323 1.60e-05 ***
## sexI        -0.36889    0.04066  -9.072  < 2e-16 ***
## sexM         0.02079    0.03380   0.615    0.538    
## length       0.33555    0.04417   7.596 4.21e-14 ***
## height       0.29228    0.03544   8.248 2.51e-16 ***
## shucked_wt  -0.77729    0.04205 -18.485  < 2e-16 ***
## viscera_wt   0.02188    0.04427   0.494    0.621    
## shell_wt     0.63574    0.04431  14.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7055 on 2653 degrees of freedom
## Multiple R-squared:  0.5036, Adjusted R-squared:  0.5023 
## F-statistic: 384.5 on 7 and 2653 DF,  p-value: < 2.2e-16

##After removing significant multicoliniearity we can vicera_wt becomes insignificant and hence can be removed

ols8_model3 =lm(rings ~ sex + length +height + shucked_wt + shell_wt, data = train)
summary(ols8_model3)

## 
## Call:
## lm(formula = rings ~ sex + length + height + shucked_wt + shell_wt, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7481 -0.4729 -0.1040  0.3689  3.0339 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.11314    0.02592   4.365 1.32e-05 ***
## sexI        -0.37097    0.04044  -9.173  < 2e-16 ***
## sexM         0.02038    0.03378   0.603    0.546    
## length       0.33836    0.04380   7.725 1.58e-14 ***
## height       0.29458    0.03512   8.387  < 2e-16 ***
## shucked_wt  -0.76771    0.03731 -20.576  < 2e-16 ***
## shell_wt     0.64207    0.04241  15.139  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7054 on 2654 degrees of freedom
## Multiple R-squared:  0.5036, Adjusted R-squared:  0.5024 
## F-statistic: 448.7 on 6 and 2654 DF,  p-value: < 2.2e-16

par(mfrow = c(2,2))
plot(ols8_model3)

library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

lmtest::dwtest(ols8_model3)

## 
##  Durbin-Watson test
## 
## data:  ols8_model3
## DW = 1.4215, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

## Detecting leverages and influencers
autoplot(ols8_model3, which =4:6,  nrow =3,  ncol =1
)

names(summary(ols8_model3))

##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled"

ols8_model3 %>%
   augment() %>% 
   dplyr::select(rings , .hat , .cooksd) %>% 
   arrange(desc(.cooksd))  %>% 
   head()

rings	.hat	.cooksd
-1.47	0.00692	0.0152
-0.614	0.00891	0.0148
-1.9	0.0215	0.00987
1.53	0.00846	0.00889
1.1	0.0182	0.00837
2.39	0.00409	0.00674

## removing the most influencial data

train_1= train  %>% 
filter( rings != c(-0.6139 ,-1.4721 , 1.1025 )) 


ols8_model4 =lm(rings ~ sex + length +height + shucked_wt + shell_wt, data = train_1)
summary(ols8_model4)

## 
## Call:
## lm(formula = rings ~ sex + length + height + shucked_wt + shell_wt, 
##     data = train_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7481 -0.4729 -0.1040  0.3689  3.0339 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.11314    0.02592   4.365 1.32e-05 ***
## sexI        -0.37097    0.04044  -9.173  < 2e-16 ***
## sexM         0.02038    0.03378   0.603    0.546    
## length       0.33836    0.04380   7.725 1.58e-14 ***
## height       0.29458    0.03512   8.387  < 2e-16 ***
## shucked_wt  -0.76771    0.03731 -20.576  < 2e-16 ***
## shell_wt     0.64207    0.04241  15.139  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7054 on 2654 degrees of freedom
## Multiple R-squared:  0.5036, Adjusted R-squared:  0.5024 
## F-statistic: 448.7 on 6 and 2654 DF,  p-value: < 2.2e-16

## Drop the variables from the test dataset as per the train model.
test_1 = test %>% dplyr::select(sex ,length ,height , shucked_wt , shell_wt, rings)
train_1 = train %>%  dplyr:: select(sex ,length ,height , shucked_wt , shell_wt, rings)
set.seed(900)
## Take random observation fro test data
test_5 =sample_n(test_1 , 10)


## predicting avalone rings
no.rings_pred. =  predict(ols8_model4 , test_5 , interval = "prediction")
no.rings_obs. = abl.cs$rings

## Destandarization of ring variable
targetmean = mean(abl_num_noout$rings)
targetsd = sd(abl_num_noout$rings)
unscaledtest.obs = round(no.rings_obs. *targetsd + targetmean , 0)
unscaledtest.pred = round (no.rings_pred. *targetsd + targetmean , 0)
obs.age = unscaledtest.obs * 1.5
pred.age = unscaledtest.pred * 1.5


cbind(observed_no.rings = unscaledtest.obs ,predicted =unscaledtest.pred ,observed_age = obs.age , pred.age ) %>%  head(10)

## Warning in cbind(observed_no.rings = unscaledtest.obs, predicted =
## unscaledtest.pred, : number of rows of result is not a multiple of vector length
## (arg 1)

##    observed_no.rings fit lwr upr observed_age  fit  lwr  upr
## 1                 15  10   7  13         22.5 15.0 10.5 19.5
## 2                  7  10   6  13         10.5 15.0  9.0 19.5
## 3                  9  11   8  14         13.5 16.5 12.0 21.0
## 4                 10   9   5  12         15.0 13.5  7.5 18.0
## 5                  7  12   8  15         10.5 18.0 12.0 22.5
## 6                  8   8   4  11         12.0 12.0  6.0 16.5
## 7                  9   9   6  13         13.5 13.5  9.0 19.5
## 8                 14  12   8  15         21.0 18.0 12.0 22.5
## 9                 10  10   7  13         15.0 15.0 10.5 19.5
## 10                11   8   4  11         16.5 12.0  6.0 16.5

#exp(predict(ols8_model3, newdata=new_data, interval="confidence"))

rmse_trainols8_model=sqrt(mean((train$rings  - predict(ols8 , train))^2))
rmse_testols8_model = sqrt(mean((test$rings  - predict(ols8 , test))^2))


rmse_trainols8_model4=sqrt(mean((train_1$rings  - predict(ols8_model4 , train_1))^2))
rmse_testols8_model4 = sqrt(mean((test_1$rings  - predict(ols8_model4 , test_1))^2))


data.frame(model =c("ols8","ols8_model4") ,rmse_train = c(rmse_trainols8_model ,rmse_trainols8_model4) , rmse_test =c(rmse_testols8_model ,rmse_trainols8_model4 ))

model	rmse_train	rmse_test
ols8	0.699	0.707
ols8_model4	0.704	0.704

Predicting-age-of-abalone-using-Multiple-Linear-Regression.R

anjal

2022-10-26