suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(ggplot2))
#install.packages('corrplot')
suppressPackageStartupMessages(library(corrplot))
#install.packages('moments')
suppressPackageStartupMessages(library(moments))
#install.packages('GGally')
suppressPackageStartupMessages(library(GGally))
#install.packages('faraway')
suppressPackageStartupMessages(library(faraway))
#install.packages('olsrr')
suppressPackageStartupMessages(library(olsrr))
#install.packages('lmtest')
suppressPackageStartupMessages(library(MASS))
#install.packages('MASS')
#install.packages('ggfortify')
suppressPackageStartupMessages(library(ggfortify))
#install.packages('broom')
suppressPackageStartupMessages(library(broom))
#install.packages('jtools')
#install.packages('huxtable')
suppressPackageStartupMessages(library(huxtable))
suppressPackageStartupMessages(library(jtools))
#install.packages('cowplot')
suppressPackageStartupMessages(library(cowplot))






abalone <- readr::read_csv('C:/personal files/data analytics/docs/git proj/abalone age prediction/Data-Science-Project/abalone.csv', show_col_types = FALSE)


head(abalone)
sexlengthdiameterheightwhole_wtshucked_wtviscera_wtshell_wtringsage
M0.4550.3650.0950.5140.224 0.101 0.15 1516.5
M0.35 0.2650.09 0.2260.09950.04850.07 78.5
F0.53 0.42 0.1350.6770.257 0.141 0.21 910.5
M0.44 0.3650.1250.5160.215 0.114 0.1551011.5
I0.33 0.2550.08 0.2050.08950.03950.05578.5
I0.4250.3  0.0950.3510.141 0.07750.12 89.5
str(abalone)
## spec_tbl_df [4,177 x 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ sex       : chr [1:4177] "M" "M" "F" "M" ...
##  $ length    : num [1:4177] 0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
##  $ diameter  : num [1:4177] 0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
##  $ height    : num [1:4177] 0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
##  $ whole_wt  : num [1:4177] 0.514 0.226 0.677 0.516 0.205 ...
##  $ shucked_wt: num [1:4177] 0.2245 0.0995 0.2565 0.2155 0.0895 ...
##  $ viscera_wt: num [1:4177] 0.101 0.0485 0.1415 0.114 0.0395 ...
##  $ shell_wt  : num [1:4177] 0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
##  $ rings     : num [1:4177] 15 7 9 10 7 8 20 16 9 19 ...
##  $ age       : num [1:4177] 16.5 8.5 10.5 11.5 8.5 9.5 21.5 17.5 10.5 20.5 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   sex = col_character(),
##   ..   length = col_double(),
##   ..   diameter = col_double(),
##   ..   height = col_double(),
##   ..   whole_wt = col_double(),
##   ..   shucked_wt = col_double(),
##   ..   viscera_wt = col_double(),
##   ..   shell_wt = col_double(),
##   ..   rings = col_double(),
##   ..   age = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
dim(abalone)
## [1] 4177   10
##Convering sex variable to factor type
abalone$sex = as.factor(abalone$sex)
glimpse(abalone)
## Rows: 4,177
## Columns: 10
## $ sex        <fct> M, M, F, M, I, I, F, F, M, F, F, M, M, F, F, M, I, F, M, M,~
## $ length     <dbl> 0.455, 0.350, 0.530, 0.440, 0.330, 0.425, 0.530, 0.545, 0.4~
## $ diameter   <dbl> 0.365, 0.265, 0.420, 0.365, 0.255, 0.300, 0.415, 0.425, 0.3~
## $ height     <dbl> 0.095, 0.090, 0.135, 0.125, 0.080, 0.095, 0.150, 0.125, 0.1~
## $ whole_wt   <dbl> 0.5140, 0.2255, 0.6770, 0.5160, 0.2050, 0.3515, 0.7775, 0.7~
## $ shucked_wt <dbl> 0.2245, 0.0995, 0.2565, 0.2155, 0.0895, 0.1410, 0.2370, 0.2~
## $ viscera_wt <dbl> 0.1010, 0.0485, 0.1415, 0.1140, 0.0395, 0.0775, 0.1415, 0.1~
## $ shell_wt   <dbl> 0.150, 0.070, 0.210, 0.155, 0.055, 0.120, 0.330, 0.260, 0.1~
## $ rings      <dbl> 15, 7, 9, 10, 7, 8, 20, 16, 9, 19, 14, 10, 11, 10, 10, 12, ~
## $ age        <dbl> 16.5, 8.5, 10.5, 11.5, 8.5, 9.5, 21.5, 17.5, 10.5, 20.5, 15~
summary(abalone)
##  sex          length         diameter          height          whole_wt     
##  F:1307   Min.   :0.075   Min.   :0.0550   Min.   :0.0000   Min.   :0.0020  
##  I:1342   1st Qu.:0.450   1st Qu.:0.3500   1st Qu.:0.1150   1st Qu.:0.4415  
##  M:1528   Median :0.545   Median :0.4250   Median :0.1400   Median :0.7995  
##           Mean   :0.524   Mean   :0.4079   Mean   :0.1395   Mean   :0.8287  
##           3rd Qu.:0.615   3rd Qu.:0.4800   3rd Qu.:0.1650   3rd Qu.:1.1530  
##           Max.   :0.815   Max.   :0.6500   Max.   :1.1300   Max.   :2.8255  
##    shucked_wt       viscera_wt        shell_wt          rings       
##  Min.   :0.0010   Min.   :0.0005   Min.   :0.0015   Min.   : 1.000  
##  1st Qu.:0.1860   1st Qu.:0.0935   1st Qu.:0.1300   1st Qu.: 8.000  
##  Median :0.3360   Median :0.1710   Median :0.2340   Median : 9.000  
##  Mean   :0.3594   Mean   :0.1806   Mean   :0.2388   Mean   : 9.934  
##  3rd Qu.:0.5020   3rd Qu.:0.2530   3rd Qu.:0.3290   3rd Qu.:11.000  
##  Max.   :1.4880   Max.   :0.7600   Max.   :1.0050   Max.   :29.000  
##       age       
##  Min.   : 2.50  
##  1st Qu.: 9.50  
##  Median :10.50  
##  Mean   :11.43  
##  3rd Qu.:12.50  
##  Max.   :30.50
# seperating categorical variable
abl_cat =abalone %>% 
dplyr::select(sex)  %>% 
 group_by(sex)


#analysing proportion of data on the basis of sex
sex_prop =abl_cat %>%
summarise(count_n=n())%>%
mutate(prop.= paste0(round(count_n/sum(count_n)*100 , 2) , "%" ))
print(sex_prop)
## # A tibble: 3 x 3
##   sex   count_n prop. 
##   <fct>   <int> <chr> 
## 1 F        1307 31.29%
## 2 I        1342 32.13%
## 3 M        1528 36.58%
# This is a chart, switch to the DataCamp editor to view and configure it.


missing_data= function(x) {
    name_var= c()
    missing_values = c()
    for (i in 1: x ){
     name_var[i]=(names(abalone[i]))
     missing_values[i] =(sum(is.na(abalone[i])))
     }
  return(data.frame(name_var, missing_values ))
    
}

as.data.frame(lapply((ncol(abalone)) , missing_data))
name_varmissing_values
sex0
length0
diameter0
height0
whole_wt0
shucked_wt0
viscera_wt0
shell_wt0
rings0
age0
dup_data= sum(duplicated(abalone))
paste0("There are " ,dup_data , " " , "full duplicates in the dataset")
## [1] "There are 0 full duplicates in the dataset"
ggplot(stack(abalone), aes(x = ind, y = values , color= ind) )+
  geom_boxplot()+
  labs(title ="Boxplot")
## Warning in stack.data.frame(abalone): non-vector columns will be ignored

##standarization of data.
std_num_abl= function(y) {
    
    standarization =(y - mean(y)) / sd(y)
    return(standarization)
}

abl_num=as.data.frame(lapply(abalone[, -1],std_num_abl ))
ggplot(stack(abl_num), aes(x = ind, y = values , color= ind) )+
  geom_boxplot()+
  labs(title ="Boxplot")

abl_sum=as.data.frame(sapply(abalone[,-1], summary))
abl_sum
lengthdiameterheightwhole_wtshucked_wtviscera_wtshell_wtringsage
0.0750.0550    0.0020.0010.00050.00151   2.5
0.45 0.35 0.1150.4420.1860.09350.13  8   9.5
0.5450.4250.14 0.7990.3360.171 0.234 9   10.5
0.5240.4080.14 0.8290.3590.181 0.239 9.9311.4
0.6150.48 0.1651.15 0.5020.253 0.329 11   12.5
0.8150.65 1.13 2.83 1.49 0.76  1     29   30.5
abl_skew = as.data.frame(sapply (abalone[,-1], skewness))
abl_kur = as.data.frame (sapply(abalone[,-1], kurtosis))
abl_var = as.data.frame(sapply(abalone[,-1] , var))

as.data.frame(cbind(var = abl_var, skew= abl_skew , abl_kur))
sapply(abalone[, -1], var)sapply(abalone[, -1], skewness)sapply(abalone[, -1], kurtosis)
0.0144 -0.64 3.06
0.00985-0.6092.95
0.001753.13 78.9 
0.24   0.5312.97
0.0493 0.7193.59
0.012  0.5923.08
0.0194 0.6213.53
10.4    1.11 5.33
10.4    1.11 5.33
#REMOVING OUTLIER 
abl_numerical = as.data.frame(abalone[,-1])

v= list()

abl_outliers=for (i in 1:ncol(abl_num)){
    
     name_out= (names(abl_numerical))[i]
    abliout = boxplot.stats(abl_numerical[,i])$out
    v[[(names(abl_numerical))[i]]]=  boxplot.stats(abl_numerical[,i])$out
    len_out = length(abliout)
    print(c(name_out ,paste("number of outliers" , len_out)))
    print(sort(abliout))

}
## [1] "length"                "number of outliers 49"
##  [1] 0.075 0.110 0.130 0.130 0.135 0.140 0.140 0.150 0.155 0.155 0.155 0.160
## [13] 0.160 0.160 0.160 0.165 0.165 0.165 0.165 0.165 0.170 0.170 0.170 0.175
## [25] 0.175 0.175 0.175 0.175 0.180 0.180 0.180 0.180 0.185 0.185 0.185 0.185
## [37] 0.190 0.190 0.190 0.190 0.195 0.195 0.195 0.200 0.200 0.200 0.200 0.200
## [49] 0.200
## [1] "diameter"              "number of outliers 59"
##  [1] 0.055 0.090 0.095 0.100 0.100 0.105 0.105 0.105 0.105 0.110 0.110 0.110
## [13] 0.110 0.115 0.115 0.120 0.120 0.120 0.120 0.120 0.125 0.125 0.125 0.125
## [25] 0.125 0.125 0.125 0.130 0.130 0.130 0.130 0.130 0.130 0.130 0.130 0.135
## [37] 0.135 0.135 0.135 0.135 0.140 0.140 0.140 0.145 0.145 0.145 0.145 0.145
## [49] 0.150 0.150 0.150 0.150 0.150 0.150 0.150 0.150 0.150 0.150 0.150
## [1] "height"                "number of outliers 29"
##  [1] 0.000 0.000 0.010 0.015 0.015 0.020 0.020 0.025 0.025 0.025 0.025 0.025
## [13] 0.030 0.030 0.030 0.030 0.030 0.030 0.035 0.035 0.035 0.035 0.035 0.035
## [25] 0.250 0.250 0.250 0.515 1.130
## [1] "whole_wt"              "number of outliers 30"
##  [1] 2.2205 2.2260 2.2305 2.2355 2.2360 2.2385 2.2500 2.2550 2.2635 2.2695
## [11] 2.2730 2.3020 2.3235 2.3305 2.3330 2.3810 2.3810 2.3980 2.4925 2.4990
## [21] 2.5050 2.5085 2.5155 2.5260 2.5480 2.5500 2.5550 2.6570 2.7795 2.8255
## [1] "shucked_wt"            "number of outliers 48"
##  [1] 0.9815 0.9840 0.9895 0.9915 0.9925 0.9955 1.0050 1.0070 1.0120 1.0135
## [11] 1.0170 1.0260 1.0265 1.0300 1.0465 1.0465 1.0515 1.0615 1.0685 1.0705
## [21] 1.0715 1.0745 1.0815 1.0830 1.0950 1.1055 1.1075 1.1090 1.1155 1.1280
## [31] 1.1330 1.1335 1.1455 1.1455 1.1465 1.1495 1.1565 1.1705 1.1945 1.1965
## [41] 1.2320 1.2395 1.2395 1.2455 1.2530 1.3485 1.3510 1.4880
## [1] "viscera_wt"            "number of outliers 26"
##  [1] 0.4925 0.4985 0.5005 0.5090 0.5115 0.5120 0.5130 0.5145 0.5185 0.5190
## [11] 0.5195 0.5195 0.5225 0.5235 0.5250 0.5260 0.5265 0.5410 0.5410 0.5500
## [21] 0.5640 0.5745 0.5750 0.5900 0.6415 0.7600
## [1] "shell_wt"              "number of outliers 35"
##  [1] 0.6300 0.6350 0.6380 0.6420 0.6430 0.6460 0.6500 0.6550 0.6570 0.6585
## [11] 0.6600 0.6650 0.6650 0.6745 0.6750 0.6785 0.6850 0.6855 0.6900 0.7000
## [21] 0.7100 0.7100 0.7250 0.7250 0.7250 0.7260 0.7600 0.7800 0.7975 0.8150
## [31] 0.8500 0.8850 0.8850 0.8970 1.0050
## [1] "rings"                  "number of outliers 278"
##   [1]  1  2  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3 16 16 16 16 16 16 16 16
##  [26] 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
##  [51] 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
##  [76] 16 16 16 16 16 16 16 16 16 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17
## [101] 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17
## [126] 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 18 18 18 18 18 18 18 18
## [151] 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18
## [176] 18 18 18 18 18 18 18 18 18 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19
## [201] 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 20 20 20 20 20 20 20 20 20
## [226] 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 21 21 21 21 21 21 21 21
## [251] 21 21 21 21 21 21 22 22 22 22 22 22 23 23 23 23 23 23 23 23 23 24 24 25 26
## [276] 27 27 29
## [1] "age"                    "number of outliers 278"
##   [1]  2.5  3.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5
##  [16]  4.5  4.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5
##  [31] 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5
##  [46] 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5
##  [61] 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5
##  [76] 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 18.5 18.5 18.5 18.5 18.5 18.5
##  [91] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5
## [106] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5
## [121] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5
## [136] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5
## [151] 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5
## [166] 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5
## [181] 19.5 19.5 19.5 19.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5
## [196] 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5
## [211] 20.5 20.5 20.5 20.5 20.5 20.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5
## [226] 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5
## [241] 21.5 21.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5
## [256] 22.5 23.5 23.5 23.5 23.5 23.5 23.5 24.5 24.5 24.5 24.5 24.5 24.5 24.5 24.5
## [271] 24.5 25.5 25.5 26.5 27.5 28.5 28.5 30.5
abl_num_noout= abalone %>% filter( !length  %in% v$length , !diameter  %in% v$diameter ,!height  %in% v$height ,
                         !whole_wt %in% v$whole_wt , !shell_wt %in% v$shell_wt ,!shucked_wt %in% v$shucked_wt ,
                         !viscera_wt %in% v$viscera_wt, !rings %in% v$rings)


head(abl_num_noout)
sexlengthdiameterheightwhole_wtshucked_wtviscera_wtshell_wtringsage
M0.4550.3650.0950.5140.224 0.101 0.15 1516.5
M0.35 0.2650.09 0.2260.09950.04850.07 78.5
F0.53 0.42 0.1350.6770.257 0.141 0.21 910.5
M0.44 0.3650.1250.5160.215 0.114 0.1551011.5
I0.33 0.2550.08 0.2050.08950.03950.05578.5
I0.4250.3  0.0950.3510.141 0.07750.12 89.5
paste("Total no of rows with outliers removed" ,dim(abl_numerical)[1]-dim(abl_num_noout)[1] , "hence dimension of cleaned dataset ", dim(abl_num_noout)[1] ,"x", dim(abl_num_noout)[2])
## [1] "Total no of rows with outliers removed 396 hence dimension of cleaned dataset  3781 x 10"
std_num_abl= function(y) {
    
    standarization =(y - mean(y)) / sd(y)
    return(standarization)
}

 


x= as.data.frame(lapply(abl_num_noout[,-1] ,std_num_abl ))

abl.cs = cbind(abl_num_noout[,1],x  )

summary(abl.cs)
##  sex          length           diameter           height        
##  F:1160   Min.   :-2.8340   Min.   :-2.7093   Min.   :-2.75671  
##  I:1261   1st Qu.:-0.6370   1st Qu.:-0.6501   1st Qu.:-0.77397  
##  M:1360   Median : 0.1252   Median : 0.1627   Median : 0.07578  
##           Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##           3rd Qu.: 0.7977   3rd Qu.: 0.7588   3rd Qu.: 0.78390  
##           Max.   : 2.1428   Max.   : 2.1135   Max.   : 2.90826  
##     whole_wt          shucked_wt        viscera_wt          shell_wt       
##  Min.   :-1.68568   Min.   :-1.6240   Min.   :-1.71107   Min.   :-1.73484  
##  1st Qu.:-0.80756   1st Qu.:-0.8181   1st Qu.:-0.82282   1st Qu.:-0.82296  
##  Median :-0.05762   Median :-0.1007   Median :-0.09743   Median :-0.04949  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.73167   3rd Qu.: 0.7101   3rd Qu.: 0.69212   3rd Qu.: 0.71991  
##  Max.   : 3.00287   Max.   : 3.0098   Max.   : 3.13973   Max.   : 3.24792  
##      rings              age         
##  Min.   :-2.3303   Min.   :-2.3303  
##  1st Qu.:-0.6139   1st Qu.:-0.6139  
##  Median :-0.1848   Median :-0.1848  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6734   3rd Qu.: 0.6734  
##  Max.   : 2.3899   Max.   : 2.3899
GGally::ggpairs(abl.cs, aes( color = sex ,alpha = 0.8 ), title = "Pairs plot for abalone dataset"  )+ ggplot2::theme_grey(base_size = 2) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

abalone_corr = abl.cs[,c(-1, -9)]

corrplot(cor(abalone_corr) ,type = "lower" ,main="\nCorrelation matrix" , addCoef.col = 'red', number.cex=1.0 , 
        , tl.srt = 45)

par(mfrow = c(2, 2 ))
age_wholewt = ggplot (abl.cs , aes(whole_wt , age , colour = sex))+
geom_point()+geom_smooth (method  = "lm")+
labs(title =  "Relation between age vs whole_wt") 

age_shuckedwt = ggplot (abl.cs , aes(shucked_wt , age , colour = sex))+
geom_point()+geom_smooth (method  = "lm")+
labs(title =  "Relation between age vs shucked_wt")

age_shellwt = ggplot (abl.cs , aes(shell_wt , age , colour = sex))+
geom_point()+geom_smooth (method  = "lm")+
labs(title =  "Relation between age vs shell_wt")

plot_grid(age_wholewt , age_shellwt , age_shuckedwt , labels ="AUTO" )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

set.seed(90)
#Splitting data
abl.cs = abl.cs  %>%  
mutate (split = sample(c(0,1) , size = nrow(abl.cs) , replace =TRUE, prob = c(0.30 , 0.70 )))
str(abl.cs)
## 'data.frame':    3781 obs. of  11 variables:
##  $ sex       : Factor w/ 3 levels "F","I","M": 3 3 1 3 2 2 3 1 3 3 ...
##  $ length    : num  -0.5922 -1.5338 0.0803 -0.7267 -1.7131 ...
##  $ diameter  : num  -0.433 -1.517 0.163 -0.433 -1.626 ...
##  $ height    : num  -1.1988 -1.3405 -0.0658 -0.3491 -1.6237 ...
##  $ whole_wt  : num  -0.625 -1.274 -0.259 -0.621 -1.32 ...
##  $ shucked_wt: num  -0.604 -1.219 -0.447 -0.649 -1.268 ...
##  $ viscera_wt: num  -0.719 -1.237 -0.319 -0.591 -1.326 ...
##  $ shell_wt  : num  -0.619 -1.271 -0.131 -0.579 -1.393 ...
##  $ rings     : num  2.39 -1.043 -0.185 0.244 -1.043 ...
##  $ age       : num  2.39 -1.043 -0.185 0.244 -1.043 ...
##  $ split     : num  1 0 1 0 1 1 1 1 0 1 ...
train  = subset(abl.cs , split== 1)
test = subset (abl.cs , split == 0)

ggplot(abl.cs , aes(as.factor(split) , fill= sex ))+
geom_bar(start = "identity" ,position = "dodge" )+

theme_minimal()+
ggtitle("Splitting data into test and train")+
xlab("Split dataset")
## Warning: Ignoring unknown parameters: start

# The OLS we are aiming for is :age of abalone which is determined by the count of rings hence we can say that age is dependent on the no of rings which is 1.5 * rings . Hence we can excluse the age variable for now in the model and consider as ring as explanatory variable i.e Regressing Rings~ sex + length + diameter + height +whole_wt+  vicera_wt + shucked_wt + shell_wt


ols1 = lm(rings ~ sex + length , data = train)
ols2 = lm(rings ~ sex +length + diameter, data = train)
ols3 = lm(rings ~ sex + length + diameter + height, data = train)
ols4 = lm(rings ~ sex + length + diameter +height + whole_wt, data = train)
ols5 = lm(rings ~ sex + length + diameter + height +whole_wt + shucked_wt, data = train)
ols6 = lm(rings ~ sex + length + diameter + height +whole_wt + shucked_wt, data = train)
ols7 = lm(rings ~ sex + length + diameter + height +whole_wt + shucked_wt + viscera_wt , data = train)
ols8 = lm(rings ~ sex + length + diameter + height +whole_wt + shucked_wt +viscera_wt + shell_wt, data = train)

export_summs(ols1 , ols2 , ols3 , ols4 , ols5 , ols6 , ols7 , ols8)
Model 1Model 2Model 3Model 4Model 5Model 6Model 7Model 8
(Intercept)0.18 ***0.16 ***0.13 ***0.14 ***0.09 ***0.09 ***0.10 ***0.10 ***
(0.03)   (0.03)   (0.03)   (0.03)   (0.03)   (0.03)   (0.03)   (0.03)   
sexI-0.50 ***-0.45 ***-0.37 ***-0.40 ***-0.32 ***-0.32 ***-0.33 ***-0.34 ***
(0.04)   (0.04)   (0.04)   (0.04)   (0.04)   (0.04)   (0.04)   (0.04)   
sexM-0.05    -0.04    -0.02    -0.01    0.03    0.03    0.03    0.03    
(0.04)   (0.04)   (0.04)   (0.04)   (0.03)   (0.03)   (0.03)   (0.03)   
length0.47 ***-0.21 *  -0.33 ***-0.20 *  -0.04    -0.04    -0.01    0.02    
(0.02)   (0.09)   (0.09)   (0.09)   (0.09)   (0.09)   (0.09)   (0.09)   
diameter       0.70 ***0.50 ***0.57 ***0.47 ***0.47 ***0.44 ***0.36 ***
       (0.09)   (0.09)   (0.09)   (0.09)   (0.09)   (0.09)   (0.09)   
height              0.37 ***0.44 ***0.31 ***0.31 ***0.31 ***0.27 ***
              (0.04)   (0.04)   (0.03)   (0.03)   (0.03)   (0.04)   
whole_wt                     -0.28 ***1.01 ***1.01 ***1.24 ***0.68 ***
                     (0.05)   (0.08)   (0.08)   (0.10)   (0.13)   
shucked_wt                            -1.25 ***-1.25 ***-1.29 ***-1.08 ***
                            (0.06)   (0.06)   (0.06)   (0.07)   
viscera_wt                                          -0.21 ***-0.13 *  
                                          (0.05)   (0.05)   
shell_wt                                                 0.37 ***
                                                 (0.06)   
N2661       2661       2661       2661       2661       2661       2661       2661       
R20.38    0.39    0.42    0.42    0.50    0.50    0.51    0.51    
*** p < 0.001; ** p < 0.01; * p < 0.05.
sum_model =export_summs(ols1 , ols2 , ols3 , ols4 , ols5 , ols6 , ols7 , ols8)

R =sum_model  %>% filter(names ==c("R2" ) )  
Rsq=as.numeric(R)
## Warning: NAs introduced by coercion
Rsq=na.omit(as.numeric(R))
## Warning in na.omit(as.numeric(R)): NAs introduced by coercion
sum_model_col=colnames(sum_model)
r2_vs_model=data.frame(x1=sum_model_col[-1] , y1 =Rsq)
r2_vs_model
x1y1
Model 10.379
Model 20.392
Model 30.416
Model 40.424
Model 50.502
Model 60.502
Model 70.505
Model 80.512
#viz
ggplot(r2_vs_model , aes (x1 , y1, colour= "red" , group = 1))+geom_point() +geom_line() +labs(title =" Rsq vs ADDITIVE MODELS" ,x = "Models" , y= "R^2")

## lets see a summary of selected model
summary(ols8)
## 
## Call:
## lm(formula = rings ~ sex + length + diameter + height + whole_wt + 
##     shucked_wt + viscera_wt + shell_wt, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.78920 -0.47274 -0.09323  0.36280  2.99538 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.09893    0.02587   3.824 0.000134 ***
## sexI        -0.33772    0.04067  -8.305  < 2e-16 ***
## sexM         0.02968    0.03357   0.884 0.376739    
## length       0.01643    0.08582   0.191 0.848190    
## diameter     0.36022    0.08665   4.157 3.32e-05 ***
## height       0.27265    0.03539   7.705 1.84e-14 ***
## whole_wt     0.68289    0.13447   5.079 4.07e-07 ***
## shucked_wt  -1.07726    0.07153 -15.060  < 2e-16 ***
## viscera_wt  -0.12958    0.05450  -2.378 0.017494 *  
## shell_wt     0.37409    0.06275   5.962 2.82e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7001 on 2651 degrees of freedom
## Multiple R-squared:  0.5116, Adjusted R-squared:  0.5099 
## F-statistic: 308.5 on 9 and 2651 DF,  p-value: < 2.2e-16
#VIF (variance of inflation) test
as.matrix(faraway::vif(ols8))
##                 [,1]
## sexI        1.991842
## sexM        1.407564
## length     39.153577
## diameter   39.936756
## height      6.613850
## whole_wt   95.417651
## shucked_wt 26.854677
## viscera_wt 15.718910
## shell_wt   20.879258
#check for variablity in highly correlated variable .
ols_correlations(ols8)
Zero-orderPartialPart
-0.472-0.159  -0.113 
0.2060.0172 0.012 
0.5860.003720.0026
0.6020.0805 0.0564
0.6180.148  0.105 
0.56 0.0982 0.0689
0.467-0.281  -0.204 
0.546-0.0461 -0.0323
0.6230.115  0.0809
##model without whole_wt variable
ols8_model1 =lm(rings ~ sex + length + diameter + height + shucked_wt +viscera_wt + shell_wt, data = train)
summary(ols8_model1)
## 
## Call:
## lm(formula = rings ~ sex + length + diameter + height + shucked_wt + 
##     viscera_wt + shell_wt, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.73098 -0.47329 -0.08827  0.36729  3.00320 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.10440    0.02597   4.020 5.98e-05 ***
## sexI        -0.35081    0.04077  -8.604  < 2e-16 ***
## sexM         0.02578    0.03371   0.765    0.445    
## length       0.02751    0.08619   0.319    0.750    
## diameter     0.36192    0.08705   4.158 3.32e-05 ***
## height       0.27593    0.03555   7.763 1.18e-14 ***
## shucked_wt  -0.78225    0.04194 -18.652  < 2e-16 ***
## viscera_wt   0.03362    0.04422   0.760    0.447    
## shell_wt     0.59630    0.04518  13.198  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7033 on 2652 degrees of freedom
## Multiple R-squared:  0.5068, Adjusted R-squared:  0.5053 
## F-statistic: 340.7 on 8 and 2652 DF,  p-value: < 2.2e-16
####Variance inflation factor of the additive model without the Whole_weight
as.matrix(faraway::vif(ols8_model1))
##                 [,1]
## sexI        1.983840
## sexM        1.406829
## length     39.128250
## diameter   39.936157
## height      6.611648
## shucked_wt  9.145049
## viscera_wt 10.253948
## shell_wt   10.725643
ols_correlations(ols8_model1)
Zero-orderPartialPart
-0.472-0.165 -0.117  
0.2060.01480.0104 
0.5860.00620.00435
0.6020.08050.0567 
0.6180.149 0.106  
0.467-0.341 -0.254  
0.5460.01480.0104 
0.6230.248 0.18   
##model without whoe_wt and diameter variable
ols8_model2 =lm(rings ~ sex + length +height + shucked_wt +viscera_wt + shell_wt, data = train)

####Variance inflation factor of the additive model without the Whole_weight and diameter
as.matrix(faraway::vif(ols8_model2))
##                 [,1]
## sexI        1.961260
## sexM        1.405048
## length     10.215753
## height      6.530751
## shucked_wt  9.137632
## viscera_wt 10.212159
## shell_wt   10.252933
summary(ols8_model2)
## 
## Call:
## lm(formula = rings ~ sex + length + height + shucked_wt + viscera_wt + 
##     shell_wt, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7335 -0.4753 -0.1042  0.3650  3.0309 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.11230    0.02598   4.323 1.60e-05 ***
## sexI        -0.36889    0.04066  -9.072  < 2e-16 ***
## sexM         0.02079    0.03380   0.615    0.538    
## length       0.33555    0.04417   7.596 4.21e-14 ***
## height       0.29228    0.03544   8.248 2.51e-16 ***
## shucked_wt  -0.77729    0.04205 -18.485  < 2e-16 ***
## viscera_wt   0.02188    0.04427   0.494    0.621    
## shell_wt     0.63574    0.04431  14.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7055 on 2653 degrees of freedom
## Multiple R-squared:  0.5036, Adjusted R-squared:  0.5023 
## F-statistic: 384.5 on 7 and 2653 DF,  p-value: < 2.2e-16
anova(ols8_model2 , ols8)
Res.DfRSSDfSum of SqFPr(>F)
2.65e+031.32e+03           
2.65e+031.3e+03 221.221.64.86e-10
summary (ols8_model2)
## 
## Call:
## lm(formula = rings ~ sex + length + height + shucked_wt + viscera_wt + 
##     shell_wt, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7335 -0.4753 -0.1042  0.3650  3.0309 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.11230    0.02598   4.323 1.60e-05 ***
## sexI        -0.36889    0.04066  -9.072  < 2e-16 ***
## sexM         0.02079    0.03380   0.615    0.538    
## length       0.33555    0.04417   7.596 4.21e-14 ***
## height       0.29228    0.03544   8.248 2.51e-16 ***
## shucked_wt  -0.77729    0.04205 -18.485  < 2e-16 ***
## viscera_wt   0.02188    0.04427   0.494    0.621    
## shell_wt     0.63574    0.04431  14.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7055 on 2653 degrees of freedom
## Multiple R-squared:  0.5036, Adjusted R-squared:  0.5023 
## F-statistic: 384.5 on 7 and 2653 DF,  p-value: < 2.2e-16
##After removing significant multicoliniearity we can vicera_wt becomes insignificant and hence can be removed

ols8_model3 =lm(rings ~ sex + length +height + shucked_wt + shell_wt, data = train)
summary(ols8_model3)
## 
## Call:
## lm(formula = rings ~ sex + length + height + shucked_wt + shell_wt, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7481 -0.4729 -0.1040  0.3689  3.0339 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.11314    0.02592   4.365 1.32e-05 ***
## sexI        -0.37097    0.04044  -9.173  < 2e-16 ***
## sexM         0.02038    0.03378   0.603    0.546    
## length       0.33836    0.04380   7.725 1.58e-14 ***
## height       0.29458    0.03512   8.387  < 2e-16 ***
## shucked_wt  -0.76771    0.03731 -20.576  < 2e-16 ***
## shell_wt     0.64207    0.04241  15.139  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7054 on 2654 degrees of freedom
## Multiple R-squared:  0.5036, Adjusted R-squared:  0.5024 
## F-statistic: 448.7 on 6 and 2654 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(ols8_model3)

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
lmtest::dwtest(ols8_model3)
## 
##  Durbin-Watson test
## 
## data:  ols8_model3
## DW = 1.4215, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
## Detecting leverages and influencers
autoplot(ols8_model3, which =4:6,  nrow =3,  ncol =1
)

names(summary(ols8_model3))
##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled"
ols8_model3 %>%
   augment() %>% 
   dplyr::select(rings , .hat , .cooksd) %>% 
   arrange(desc(.cooksd))  %>% 
   head()
rings.hat.cooksd
-1.47 0.006920.0152 
-0.6140.008910.0148 
-1.9  0.0215 0.00987
1.53 0.008460.00889
1.1  0.0182 0.00837
2.39 0.004090.00674
## removing the most influencial data

train_1= train  %>% 
filter( rings != c(-0.6139 ,-1.4721 , 1.1025 )) 


ols8_model4 =lm(rings ~ sex + length +height + shucked_wt + shell_wt, data = train_1)
summary(ols8_model4)
## 
## Call:
## lm(formula = rings ~ sex + length + height + shucked_wt + shell_wt, 
##     data = train_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7481 -0.4729 -0.1040  0.3689  3.0339 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.11314    0.02592   4.365 1.32e-05 ***
## sexI        -0.37097    0.04044  -9.173  < 2e-16 ***
## sexM         0.02038    0.03378   0.603    0.546    
## length       0.33836    0.04380   7.725 1.58e-14 ***
## height       0.29458    0.03512   8.387  < 2e-16 ***
## shucked_wt  -0.76771    0.03731 -20.576  < 2e-16 ***
## shell_wt     0.64207    0.04241  15.139  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7054 on 2654 degrees of freedom
## Multiple R-squared:  0.5036, Adjusted R-squared:  0.5024 
## F-statistic: 448.7 on 6 and 2654 DF,  p-value: < 2.2e-16
## Drop the variables from the test dataset as per the train model.
test_1 = test %>% dplyr::select(sex ,length ,height , shucked_wt , shell_wt, rings)
train_1 = train %>%  dplyr:: select(sex ,length ,height , shucked_wt , shell_wt, rings)
set.seed(900)
## Take random observation fro test data
test_5 =sample_n(test_1 , 10)


## predicting avalone rings
no.rings_pred. =  predict(ols8_model4 , test_5 , interval = "prediction")
no.rings_obs. = abl.cs$rings

## Destandarization of ring variable
targetmean = mean(abl_num_noout$rings)
targetsd = sd(abl_num_noout$rings)
unscaledtest.obs = round(no.rings_obs. *targetsd + targetmean , 0)
unscaledtest.pred = round (no.rings_pred. *targetsd + targetmean , 0)
obs.age = unscaledtest.obs * 1.5
pred.age = unscaledtest.pred * 1.5


cbind(observed_no.rings = unscaledtest.obs ,predicted =unscaledtest.pred ,observed_age = obs.age , pred.age ) %>%  head(10)
## Warning in cbind(observed_no.rings = unscaledtest.obs, predicted =
## unscaledtest.pred, : number of rows of result is not a multiple of vector length
## (arg 1)
##    observed_no.rings fit lwr upr observed_age  fit  lwr  upr
## 1                 15  10   7  13         22.5 15.0 10.5 19.5
## 2                  7  10   6  13         10.5 15.0  9.0 19.5
## 3                  9  11   8  14         13.5 16.5 12.0 21.0
## 4                 10   9   5  12         15.0 13.5  7.5 18.0
## 5                  7  12   8  15         10.5 18.0 12.0 22.5
## 6                  8   8   4  11         12.0 12.0  6.0 16.5
## 7                  9   9   6  13         13.5 13.5  9.0 19.5
## 8                 14  12   8  15         21.0 18.0 12.0 22.5
## 9                 10  10   7  13         15.0 15.0 10.5 19.5
## 10                11   8   4  11         16.5 12.0  6.0 16.5
#exp(predict(ols8_model3, newdata=new_data, interval="confidence"))

rmse_trainols8_model=sqrt(mean((train$rings  - predict(ols8 , train))^2))
rmse_testols8_model = sqrt(mean((test$rings  - predict(ols8 , test))^2))


rmse_trainols8_model4=sqrt(mean((train_1$rings  - predict(ols8_model4 , train_1))^2))
rmse_testols8_model4 = sqrt(mean((test_1$rings  - predict(ols8_model4 , test_1))^2))


data.frame(model =c("ols8","ols8_model4") ,rmse_train = c(rmse_trainols8_model ,rmse_trainols8_model4) , rmse_test =c(rmse_testols8_model ,rmse_trainols8_model4 ))
modelrmse_trainrmse_test
ols80.6990.707
ols8_model40.7040.704