suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(ggplot2))
#install.packages('corrplot')
suppressPackageStartupMessages(library(corrplot))
#install.packages('moments')
suppressPackageStartupMessages(library(moments))
#install.packages('GGally')
suppressPackageStartupMessages(library(GGally))
#install.packages('faraway')
suppressPackageStartupMessages(library(faraway))
#install.packages('olsrr')
suppressPackageStartupMessages(library(olsrr))
#install.packages('lmtest')
suppressPackageStartupMessages(library(MASS))
#install.packages('MASS')
#install.packages('ggfortify')
suppressPackageStartupMessages(library(ggfortify))
#install.packages('broom')
suppressPackageStartupMessages(library(broom))
#install.packages('jtools')
#install.packages('huxtable')
suppressPackageStartupMessages(library(huxtable))
suppressPackageStartupMessages(library(jtools))
#install.packages('cowplot')
suppressPackageStartupMessages(library(cowplot))
abalone <- readr::read_csv('C:/personal files/data analytics/docs/git proj/abalone age prediction/Data-Science-Project/abalone.csv', show_col_types = FALSE)
head(abalone)
sex | length | diameter | height | whole_wt | shucked_wt | viscera_wt | shell_wt | rings | age |
M | 0.455 | 0.365 | 0.095 | 0.514 | 0.224 | 0.101 | 0.15 | 15 | 16.5 |
M | 0.35 | 0.265 | 0.09 | 0.226 | 0.0995 | 0.0485 | 0.07 | 7 | 8.5 |
F | 0.53 | 0.42 | 0.135 | 0.677 | 0.257 | 0.141 | 0.21 | 9 | 10.5 |
M | 0.44 | 0.365 | 0.125 | 0.516 | 0.215 | 0.114 | 0.155 | 10 | 11.5 |
I | 0.33 | 0.255 | 0.08 | 0.205 | 0.0895 | 0.0395 | 0.055 | 7 | 8.5 |
I | 0.425 | 0.3 | 0.095 | 0.351 | 0.141 | 0.0775 | 0.12 | 8 | 9.5 |
str(abalone)
## spec_tbl_df [4,177 x 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ sex : chr [1:4177] "M" "M" "F" "M" ...
## $ length : num [1:4177] 0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
## $ diameter : num [1:4177] 0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
## $ height : num [1:4177] 0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
## $ whole_wt : num [1:4177] 0.514 0.226 0.677 0.516 0.205 ...
## $ shucked_wt: num [1:4177] 0.2245 0.0995 0.2565 0.2155 0.0895 ...
## $ viscera_wt: num [1:4177] 0.101 0.0485 0.1415 0.114 0.0395 ...
## $ shell_wt : num [1:4177] 0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
## $ rings : num [1:4177] 15 7 9 10 7 8 20 16 9 19 ...
## $ age : num [1:4177] 16.5 8.5 10.5 11.5 8.5 9.5 21.5 17.5 10.5 20.5 ...
## - attr(*, "spec")=
## .. cols(
## .. sex = col_character(),
## .. length = col_double(),
## .. diameter = col_double(),
## .. height = col_double(),
## .. whole_wt = col_double(),
## .. shucked_wt = col_double(),
## .. viscera_wt = col_double(),
## .. shell_wt = col_double(),
## .. rings = col_double(),
## .. age = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
dim(abalone)
## [1] 4177 10
##Convering sex variable to factor type
abalone$sex = as.factor(abalone$sex)
glimpse(abalone)
## Rows: 4,177
## Columns: 10
## $ sex <fct> M, M, F, M, I, I, F, F, M, F, F, M, M, F, F, M, I, F, M, M,~
## $ length <dbl> 0.455, 0.350, 0.530, 0.440, 0.330, 0.425, 0.530, 0.545, 0.4~
## $ diameter <dbl> 0.365, 0.265, 0.420, 0.365, 0.255, 0.300, 0.415, 0.425, 0.3~
## $ height <dbl> 0.095, 0.090, 0.135, 0.125, 0.080, 0.095, 0.150, 0.125, 0.1~
## $ whole_wt <dbl> 0.5140, 0.2255, 0.6770, 0.5160, 0.2050, 0.3515, 0.7775, 0.7~
## $ shucked_wt <dbl> 0.2245, 0.0995, 0.2565, 0.2155, 0.0895, 0.1410, 0.2370, 0.2~
## $ viscera_wt <dbl> 0.1010, 0.0485, 0.1415, 0.1140, 0.0395, 0.0775, 0.1415, 0.1~
## $ shell_wt <dbl> 0.150, 0.070, 0.210, 0.155, 0.055, 0.120, 0.330, 0.260, 0.1~
## $ rings <dbl> 15, 7, 9, 10, 7, 8, 20, 16, 9, 19, 14, 10, 11, 10, 10, 12, ~
## $ age <dbl> 16.5, 8.5, 10.5, 11.5, 8.5, 9.5, 21.5, 17.5, 10.5, 20.5, 15~
summary(abalone)
## sex length diameter height whole_wt
## F:1307 Min. :0.075 Min. :0.0550 Min. :0.0000 Min. :0.0020
## I:1342 1st Qu.:0.450 1st Qu.:0.3500 1st Qu.:0.1150 1st Qu.:0.4415
## M:1528 Median :0.545 Median :0.4250 Median :0.1400 Median :0.7995
## Mean :0.524 Mean :0.4079 Mean :0.1395 Mean :0.8287
## 3rd Qu.:0.615 3rd Qu.:0.4800 3rd Qu.:0.1650 3rd Qu.:1.1530
## Max. :0.815 Max. :0.6500 Max. :1.1300 Max. :2.8255
## shucked_wt viscera_wt shell_wt rings
## Min. :0.0010 Min. :0.0005 Min. :0.0015 Min. : 1.000
## 1st Qu.:0.1860 1st Qu.:0.0935 1st Qu.:0.1300 1st Qu.: 8.000
## Median :0.3360 Median :0.1710 Median :0.2340 Median : 9.000
## Mean :0.3594 Mean :0.1806 Mean :0.2388 Mean : 9.934
## 3rd Qu.:0.5020 3rd Qu.:0.2530 3rd Qu.:0.3290 3rd Qu.:11.000
## Max. :1.4880 Max. :0.7600 Max. :1.0050 Max. :29.000
## age
## Min. : 2.50
## 1st Qu.: 9.50
## Median :10.50
## Mean :11.43
## 3rd Qu.:12.50
## Max. :30.50
# seperating categorical variable
abl_cat =abalone %>%
dplyr::select(sex) %>%
group_by(sex)
#analysing proportion of data on the basis of sex
sex_prop =abl_cat %>%
summarise(count_n=n())%>%
mutate(prop.= paste0(round(count_n/sum(count_n)*100 , 2) , "%" ))
print(sex_prop)
## # A tibble: 3 x 3
## sex count_n prop.
## <fct> <int> <chr>
## 1 F 1307 31.29%
## 2 I 1342 32.13%
## 3 M 1528 36.58%
# This is a chart, switch to the DataCamp editor to view and configure it.
missing_data= function(x) {
name_var= c()
missing_values = c()
for (i in 1: x ){
name_var[i]=(names(abalone[i]))
missing_values[i] =(sum(is.na(abalone[i])))
}
return(data.frame(name_var, missing_values ))
}
as.data.frame(lapply((ncol(abalone)) , missing_data))
name_var | missing_values |
sex | 0 |
length | 0 |
diameter | 0 |
height | 0 |
whole_wt | 0 |
shucked_wt | 0 |
viscera_wt | 0 |
shell_wt | 0 |
rings | 0 |
age | 0 |
dup_data= sum(duplicated(abalone))
paste0("There are " ,dup_data , " " , "full duplicates in the dataset")
## [1] "There are 0 full duplicates in the dataset"
ggplot(stack(abalone), aes(x = ind, y = values , color= ind) )+
geom_boxplot()+
labs(title ="Boxplot")
## Warning in stack.data.frame(abalone): non-vector columns will be ignored

##standarization of data.
std_num_abl= function(y) {
standarization =(y - mean(y)) / sd(y)
return(standarization)
}
abl_num=as.data.frame(lapply(abalone[, -1],std_num_abl ))
ggplot(stack(abl_num), aes(x = ind, y = values , color= ind) )+
geom_boxplot()+
labs(title ="Boxplot")

abl_sum=as.data.frame(sapply(abalone[,-1], summary))
abl_sum
length | diameter | height | whole_wt | shucked_wt | viscera_wt | shell_wt | rings | age |
0.075 | 0.055 | 0 | 0.002 | 0.001 | 0.0005 | 0.0015 | 1 | 2.5 |
0.45 | 0.35 | 0.115 | 0.442 | 0.186 | 0.0935 | 0.13 | 8 | 9.5 |
0.545 | 0.425 | 0.14 | 0.799 | 0.336 | 0.171 | 0.234 | 9 | 10.5 |
0.524 | 0.408 | 0.14 | 0.829 | 0.359 | 0.181 | 0.239 | 9.93 | 11.4 |
0.615 | 0.48 | 0.165 | 1.15 | 0.502 | 0.253 | 0.329 | 11 | 12.5 |
0.815 | 0.65 | 1.13 | 2.83 | 1.49 | 0.76 | 1 | 29 | 30.5 |
abl_skew = as.data.frame(sapply (abalone[,-1], skewness))
abl_kur = as.data.frame (sapply(abalone[,-1], kurtosis))
abl_var = as.data.frame(sapply(abalone[,-1] , var))
as.data.frame(cbind(var = abl_var, skew= abl_skew , abl_kur))
sapply(abalone[, -1], var) | sapply(abalone[, -1], skewness) | sapply(abalone[, -1], kurtosis) |
0.0144 | -0.64 | 3.06 |
0.00985 | -0.609 | 2.95 |
0.00175 | 3.13 | 78.9 |
0.24 | 0.531 | 2.97 |
0.0493 | 0.719 | 3.59 |
0.012 | 0.592 | 3.08 |
0.0194 | 0.621 | 3.53 |
10.4 | 1.11 | 5.33 |
10.4 | 1.11 | 5.33 |
#REMOVING OUTLIER
abl_numerical = as.data.frame(abalone[,-1])
v= list()
abl_outliers=for (i in 1:ncol(abl_num)){
name_out= (names(abl_numerical))[i]
abliout = boxplot.stats(abl_numerical[,i])$out
v[[(names(abl_numerical))[i]]]= boxplot.stats(abl_numerical[,i])$out
len_out = length(abliout)
print(c(name_out ,paste("number of outliers" , len_out)))
print(sort(abliout))
}
## [1] "length" "number of outliers 49"
## [1] 0.075 0.110 0.130 0.130 0.135 0.140 0.140 0.150 0.155 0.155 0.155 0.160
## [13] 0.160 0.160 0.160 0.165 0.165 0.165 0.165 0.165 0.170 0.170 0.170 0.175
## [25] 0.175 0.175 0.175 0.175 0.180 0.180 0.180 0.180 0.185 0.185 0.185 0.185
## [37] 0.190 0.190 0.190 0.190 0.195 0.195 0.195 0.200 0.200 0.200 0.200 0.200
## [49] 0.200
## [1] "diameter" "number of outliers 59"
## [1] 0.055 0.090 0.095 0.100 0.100 0.105 0.105 0.105 0.105 0.110 0.110 0.110
## [13] 0.110 0.115 0.115 0.120 0.120 0.120 0.120 0.120 0.125 0.125 0.125 0.125
## [25] 0.125 0.125 0.125 0.130 0.130 0.130 0.130 0.130 0.130 0.130 0.130 0.135
## [37] 0.135 0.135 0.135 0.135 0.140 0.140 0.140 0.145 0.145 0.145 0.145 0.145
## [49] 0.150 0.150 0.150 0.150 0.150 0.150 0.150 0.150 0.150 0.150 0.150
## [1] "height" "number of outliers 29"
## [1] 0.000 0.000 0.010 0.015 0.015 0.020 0.020 0.025 0.025 0.025 0.025 0.025
## [13] 0.030 0.030 0.030 0.030 0.030 0.030 0.035 0.035 0.035 0.035 0.035 0.035
## [25] 0.250 0.250 0.250 0.515 1.130
## [1] "whole_wt" "number of outliers 30"
## [1] 2.2205 2.2260 2.2305 2.2355 2.2360 2.2385 2.2500 2.2550 2.2635 2.2695
## [11] 2.2730 2.3020 2.3235 2.3305 2.3330 2.3810 2.3810 2.3980 2.4925 2.4990
## [21] 2.5050 2.5085 2.5155 2.5260 2.5480 2.5500 2.5550 2.6570 2.7795 2.8255
## [1] "shucked_wt" "number of outliers 48"
## [1] 0.9815 0.9840 0.9895 0.9915 0.9925 0.9955 1.0050 1.0070 1.0120 1.0135
## [11] 1.0170 1.0260 1.0265 1.0300 1.0465 1.0465 1.0515 1.0615 1.0685 1.0705
## [21] 1.0715 1.0745 1.0815 1.0830 1.0950 1.1055 1.1075 1.1090 1.1155 1.1280
## [31] 1.1330 1.1335 1.1455 1.1455 1.1465 1.1495 1.1565 1.1705 1.1945 1.1965
## [41] 1.2320 1.2395 1.2395 1.2455 1.2530 1.3485 1.3510 1.4880
## [1] "viscera_wt" "number of outliers 26"
## [1] 0.4925 0.4985 0.5005 0.5090 0.5115 0.5120 0.5130 0.5145 0.5185 0.5190
## [11] 0.5195 0.5195 0.5225 0.5235 0.5250 0.5260 0.5265 0.5410 0.5410 0.5500
## [21] 0.5640 0.5745 0.5750 0.5900 0.6415 0.7600
## [1] "shell_wt" "number of outliers 35"
## [1] 0.6300 0.6350 0.6380 0.6420 0.6430 0.6460 0.6500 0.6550 0.6570 0.6585
## [11] 0.6600 0.6650 0.6650 0.6745 0.6750 0.6785 0.6850 0.6855 0.6900 0.7000
## [21] 0.7100 0.7100 0.7250 0.7250 0.7250 0.7260 0.7600 0.7800 0.7975 0.8150
## [31] 0.8500 0.8850 0.8850 0.8970 1.0050
## [1] "rings" "number of outliers 278"
## [1] 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 16 16 16 16 16 16 16 16
## [26] 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
## [51] 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
## [76] 16 16 16 16 16 16 16 16 16 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17
## [101] 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17
## [126] 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 18 18 18 18 18 18 18 18
## [151] 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18
## [176] 18 18 18 18 18 18 18 18 18 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19
## [201] 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 20 20 20 20 20 20 20 20 20
## [226] 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 21 21 21 21 21 21 21 21
## [251] 21 21 21 21 21 21 22 22 22 22 22 22 23 23 23 23 23 23 23 23 23 24 24 25 26
## [276] 27 27 29
## [1] "age" "number of outliers 278"
## [1] 2.5 3.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5
## [16] 4.5 4.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5
## [31] 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5
## [46] 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5
## [61] 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5
## [76] 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5 18.5 18.5 18.5 18.5 18.5 18.5
## [91] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5
## [106] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5
## [121] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5
## [136] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5
## [151] 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5
## [166] 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5
## [181] 19.5 19.5 19.5 19.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5
## [196] 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5
## [211] 20.5 20.5 20.5 20.5 20.5 20.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5
## [226] 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5
## [241] 21.5 21.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5
## [256] 22.5 23.5 23.5 23.5 23.5 23.5 23.5 24.5 24.5 24.5 24.5 24.5 24.5 24.5 24.5
## [271] 24.5 25.5 25.5 26.5 27.5 28.5 28.5 30.5
abl_num_noout= abalone %>% filter( !length %in% v$length , !diameter %in% v$diameter ,!height %in% v$height ,
!whole_wt %in% v$whole_wt , !shell_wt %in% v$shell_wt ,!shucked_wt %in% v$shucked_wt ,
!viscera_wt %in% v$viscera_wt, !rings %in% v$rings)
head(abl_num_noout)
sex | length | diameter | height | whole_wt | shucked_wt | viscera_wt | shell_wt | rings | age |
M | 0.455 | 0.365 | 0.095 | 0.514 | 0.224 | 0.101 | 0.15 | 15 | 16.5 |
M | 0.35 | 0.265 | 0.09 | 0.226 | 0.0995 | 0.0485 | 0.07 | 7 | 8.5 |
F | 0.53 | 0.42 | 0.135 | 0.677 | 0.257 | 0.141 | 0.21 | 9 | 10.5 |
M | 0.44 | 0.365 | 0.125 | 0.516 | 0.215 | 0.114 | 0.155 | 10 | 11.5 |
I | 0.33 | 0.255 | 0.08 | 0.205 | 0.0895 | 0.0395 | 0.055 | 7 | 8.5 |
I | 0.425 | 0.3 | 0.095 | 0.351 | 0.141 | 0.0775 | 0.12 | 8 | 9.5 |
paste("Total no of rows with outliers removed" ,dim(abl_numerical)[1]-dim(abl_num_noout)[1] , "hence dimension of cleaned dataset ", dim(abl_num_noout)[1] ,"x", dim(abl_num_noout)[2])
## [1] "Total no of rows with outliers removed 396 hence dimension of cleaned dataset 3781 x 10"
std_num_abl= function(y) {
standarization =(y - mean(y)) / sd(y)
return(standarization)
}
x= as.data.frame(lapply(abl_num_noout[,-1] ,std_num_abl ))
abl.cs = cbind(abl_num_noout[,1],x )
summary(abl.cs)
## sex length diameter height
## F:1160 Min. :-2.8340 Min. :-2.7093 Min. :-2.75671
## I:1261 1st Qu.:-0.6370 1st Qu.:-0.6501 1st Qu.:-0.77397
## M:1360 Median : 0.1252 Median : 0.1627 Median : 0.07578
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.7977 3rd Qu.: 0.7588 3rd Qu.: 0.78390
## Max. : 2.1428 Max. : 2.1135 Max. : 2.90826
## whole_wt shucked_wt viscera_wt shell_wt
## Min. :-1.68568 Min. :-1.6240 Min. :-1.71107 Min. :-1.73484
## 1st Qu.:-0.80756 1st Qu.:-0.8181 1st Qu.:-0.82282 1st Qu.:-0.82296
## Median :-0.05762 Median :-0.1007 Median :-0.09743 Median :-0.04949
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.73167 3rd Qu.: 0.7101 3rd Qu.: 0.69212 3rd Qu.: 0.71991
## Max. : 3.00287 Max. : 3.0098 Max. : 3.13973 Max. : 3.24792
## rings age
## Min. :-2.3303 Min. :-2.3303
## 1st Qu.:-0.6139 1st Qu.:-0.6139
## Median :-0.1848 Median :-0.1848
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6734 3rd Qu.: 0.6734
## Max. : 2.3899 Max. : 2.3899
GGally::ggpairs(abl.cs, aes( color = sex ,alpha = 0.8 ), title = "Pairs plot for abalone dataset" )+ ggplot2::theme_grey(base_size = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

abalone_corr = abl.cs[,c(-1, -9)]
corrplot(cor(abalone_corr) ,type = "lower" ,main="\nCorrelation matrix" , addCoef.col = 'red', number.cex=1.0 ,
, tl.srt = 45)

par(mfrow = c(2, 2 ))
age_wholewt = ggplot (abl.cs , aes(whole_wt , age , colour = sex))+
geom_point()+geom_smooth (method = "lm")+
labs(title = "Relation between age vs whole_wt")
age_shuckedwt = ggplot (abl.cs , aes(shucked_wt , age , colour = sex))+
geom_point()+geom_smooth (method = "lm")+
labs(title = "Relation between age vs shucked_wt")
age_shellwt = ggplot (abl.cs , aes(shell_wt , age , colour = sex))+
geom_point()+geom_smooth (method = "lm")+
labs(title = "Relation between age vs shell_wt")
plot_grid(age_wholewt , age_shellwt , age_shuckedwt , labels ="AUTO" )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

set.seed(90)
#Splitting data
abl.cs = abl.cs %>%
mutate (split = sample(c(0,1) , size = nrow(abl.cs) , replace =TRUE, prob = c(0.30 , 0.70 )))
str(abl.cs)
## 'data.frame': 3781 obs. of 11 variables:
## $ sex : Factor w/ 3 levels "F","I","M": 3 3 1 3 2 2 3 1 3 3 ...
## $ length : num -0.5922 -1.5338 0.0803 -0.7267 -1.7131 ...
## $ diameter : num -0.433 -1.517 0.163 -0.433 -1.626 ...
## $ height : num -1.1988 -1.3405 -0.0658 -0.3491 -1.6237 ...
## $ whole_wt : num -0.625 -1.274 -0.259 -0.621 -1.32 ...
## $ shucked_wt: num -0.604 -1.219 -0.447 -0.649 -1.268 ...
## $ viscera_wt: num -0.719 -1.237 -0.319 -0.591 -1.326 ...
## $ shell_wt : num -0.619 -1.271 -0.131 -0.579 -1.393 ...
## $ rings : num 2.39 -1.043 -0.185 0.244 -1.043 ...
## $ age : num 2.39 -1.043 -0.185 0.244 -1.043 ...
## $ split : num 1 0 1 0 1 1 1 1 0 1 ...
train = subset(abl.cs , split== 1)
test = subset (abl.cs , split == 0)
ggplot(abl.cs , aes(as.factor(split) , fill= sex ))+
geom_bar(start = "identity" ,position = "dodge" )+
theme_minimal()+
ggtitle("Splitting data into test and train")+
xlab("Split dataset")
## Warning: Ignoring unknown parameters: start

# The OLS we are aiming for is :age of abalone which is determined by the count of rings hence we can say that age is dependent on the no of rings which is 1.5 * rings . Hence we can excluse the age variable for now in the model and consider as ring as explanatory variable i.e Regressing Rings~ sex + length + diameter + height +whole_wt+ vicera_wt + shucked_wt + shell_wt
ols1 = lm(rings ~ sex + length , data = train)
ols2 = lm(rings ~ sex +length + diameter, data = train)
ols3 = lm(rings ~ sex + length + diameter + height, data = train)
ols4 = lm(rings ~ sex + length + diameter +height + whole_wt, data = train)
ols5 = lm(rings ~ sex + length + diameter + height +whole_wt + shucked_wt, data = train)
ols6 = lm(rings ~ sex + length + diameter + height +whole_wt + shucked_wt, data = train)
ols7 = lm(rings ~ sex + length + diameter + height +whole_wt + shucked_wt + viscera_wt , data = train)
ols8 = lm(rings ~ sex + length + diameter + height +whole_wt + shucked_wt +viscera_wt + shell_wt, data = train)
export_summs(ols1 , ols2 , ols3 , ols4 , ols5 , ols6 , ols7 , ols8)
| Model 1 | Model 2 | Model 3 | Model 4 | Model 5 | Model 6 | Model 7 | Model 8 |
(Intercept) | 0.18 *** | 0.16 *** | 0.13 *** | 0.14 *** | 0.09 *** | 0.09 *** | 0.10 *** | 0.10 *** |
| (0.03) | (0.03) | (0.03) | (0.03) | (0.03) | (0.03) | (0.03) | (0.03) |
sexI | -0.50 *** | -0.45 *** | -0.37 *** | -0.40 *** | -0.32 *** | -0.32 *** | -0.33 *** | -0.34 *** |
| (0.04) | (0.04) | (0.04) | (0.04) | (0.04) | (0.04) | (0.04) | (0.04) |
sexM | -0.05 | -0.04 | -0.02 | -0.01 | 0.03 | 0.03 | 0.03 | 0.03 |
| (0.04) | (0.04) | (0.04) | (0.04) | (0.03) | (0.03) | (0.03) | (0.03) |
length | 0.47 *** | -0.21 * | -0.33 *** | -0.20 * | -0.04 | -0.04 | -0.01 | 0.02 |
| (0.02) | (0.09) | (0.09) | (0.09) | (0.09) | (0.09) | (0.09) | (0.09) |
diameter | | 0.70 *** | 0.50 *** | 0.57 *** | 0.47 *** | 0.47 *** | 0.44 *** | 0.36 *** |
| | (0.09) | (0.09) | (0.09) | (0.09) | (0.09) | (0.09) | (0.09) |
height | | | 0.37 *** | 0.44 *** | 0.31 *** | 0.31 *** | 0.31 *** | 0.27 *** |
| | | (0.04) | (0.04) | (0.03) | (0.03) | (0.03) | (0.04) |
whole_wt | | | | -0.28 *** | 1.01 *** | 1.01 *** | 1.24 *** | 0.68 *** |
| | | | (0.05) | (0.08) | (0.08) | (0.10) | (0.13) |
shucked_wt | | | | | -1.25 *** | -1.25 *** | -1.29 *** | -1.08 *** |
| | | | | (0.06) | (0.06) | (0.06) | (0.07) |
viscera_wt | | | | | | | -0.21 *** | -0.13 * |
| | | | | | | (0.05) | (0.05) |
shell_wt | | | | | | | | 0.37 *** |
| | | | | | | | (0.06) |
N | 2661 | 2661 | 2661 | 2661 | 2661 | 2661 | 2661 | 2661 |
R2 | 0.38 | 0.39 | 0.42 | 0.42 | 0.50 | 0.50 | 0.51 | 0.51 |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
sum_model =export_summs(ols1 , ols2 , ols3 , ols4 , ols5 , ols6 , ols7 , ols8)
R =sum_model %>% filter(names ==c("R2" ) )
Rsq=as.numeric(R)
## Warning: NAs introduced by coercion
Rsq=na.omit(as.numeric(R))
## Warning in na.omit(as.numeric(R)): NAs introduced by coercion
sum_model_col=colnames(sum_model)
r2_vs_model=data.frame(x1=sum_model_col[-1] , y1 =Rsq)
r2_vs_model
x1 | y1 |
Model 1 | 0.379 |
Model 2 | 0.392 |
Model 3 | 0.416 |
Model 4 | 0.424 |
Model 5 | 0.502 |
Model 6 | 0.502 |
Model 7 | 0.505 |
Model 8 | 0.512 |
#viz
ggplot(r2_vs_model , aes (x1 , y1, colour= "red" , group = 1))+geom_point() +geom_line() +labs(title =" Rsq vs ADDITIVE MODELS" ,x = "Models" , y= "R^2")

## lets see a summary of selected model
summary(ols8)
##
## Call:
## lm(formula = rings ~ sex + length + diameter + height + whole_wt +
## shucked_wt + viscera_wt + shell_wt, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.78920 -0.47274 -0.09323 0.36280 2.99538
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.09893 0.02587 3.824 0.000134 ***
## sexI -0.33772 0.04067 -8.305 < 2e-16 ***
## sexM 0.02968 0.03357 0.884 0.376739
## length 0.01643 0.08582 0.191 0.848190
## diameter 0.36022 0.08665 4.157 3.32e-05 ***
## height 0.27265 0.03539 7.705 1.84e-14 ***
## whole_wt 0.68289 0.13447 5.079 4.07e-07 ***
## shucked_wt -1.07726 0.07153 -15.060 < 2e-16 ***
## viscera_wt -0.12958 0.05450 -2.378 0.017494 *
## shell_wt 0.37409 0.06275 5.962 2.82e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7001 on 2651 degrees of freedom
## Multiple R-squared: 0.5116, Adjusted R-squared: 0.5099
## F-statistic: 308.5 on 9 and 2651 DF, p-value: < 2.2e-16
#VIF (variance of inflation) test
as.matrix(faraway::vif(ols8))
## [,1]
## sexI 1.991842
## sexM 1.407564
## length 39.153577
## diameter 39.936756
## height 6.613850
## whole_wt 95.417651
## shucked_wt 26.854677
## viscera_wt 15.718910
## shell_wt 20.879258
#check for variablity in highly correlated variable .
ols_correlations(ols8)
Zero-order | Partial | Part |
-0.472 | -0.159 | -0.113 |
0.206 | 0.0172 | 0.012 |
0.586 | 0.00372 | 0.0026 |
0.602 | 0.0805 | 0.0564 |
0.618 | 0.148 | 0.105 |
0.56 | 0.0982 | 0.0689 |
0.467 | -0.281 | -0.204 |
0.546 | -0.0461 | -0.0323 |
0.623 | 0.115 | 0.0809 |
##model without whole_wt variable
ols8_model1 =lm(rings ~ sex + length + diameter + height + shucked_wt +viscera_wt + shell_wt, data = train)
summary(ols8_model1)
##
## Call:
## lm(formula = rings ~ sex + length + diameter + height + shucked_wt +
## viscera_wt + shell_wt, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.73098 -0.47329 -0.08827 0.36729 3.00320
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.10440 0.02597 4.020 5.98e-05 ***
## sexI -0.35081 0.04077 -8.604 < 2e-16 ***
## sexM 0.02578 0.03371 0.765 0.445
## length 0.02751 0.08619 0.319 0.750
## diameter 0.36192 0.08705 4.158 3.32e-05 ***
## height 0.27593 0.03555 7.763 1.18e-14 ***
## shucked_wt -0.78225 0.04194 -18.652 < 2e-16 ***
## viscera_wt 0.03362 0.04422 0.760 0.447
## shell_wt 0.59630 0.04518 13.198 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7033 on 2652 degrees of freedom
## Multiple R-squared: 0.5068, Adjusted R-squared: 0.5053
## F-statistic: 340.7 on 8 and 2652 DF, p-value: < 2.2e-16
####Variance inflation factor of the additive model without the Whole_weight
as.matrix(faraway::vif(ols8_model1))
## [,1]
## sexI 1.983840
## sexM 1.406829
## length 39.128250
## diameter 39.936157
## height 6.611648
## shucked_wt 9.145049
## viscera_wt 10.253948
## shell_wt 10.725643
ols_correlations(ols8_model1)
Zero-order | Partial | Part |
-0.472 | -0.165 | -0.117 |
0.206 | 0.0148 | 0.0104 |
0.586 | 0.0062 | 0.00435 |
0.602 | 0.0805 | 0.0567 |
0.618 | 0.149 | 0.106 |
0.467 | -0.341 | -0.254 |
0.546 | 0.0148 | 0.0104 |
0.623 | 0.248 | 0.18 |
##model without whoe_wt and diameter variable
ols8_model2 =lm(rings ~ sex + length +height + shucked_wt +viscera_wt + shell_wt, data = train)
####Variance inflation factor of the additive model without the Whole_weight and diameter
as.matrix(faraway::vif(ols8_model2))
## [,1]
## sexI 1.961260
## sexM 1.405048
## length 10.215753
## height 6.530751
## shucked_wt 9.137632
## viscera_wt 10.212159
## shell_wt 10.252933
summary(ols8_model2)
##
## Call:
## lm(formula = rings ~ sex + length + height + shucked_wt + viscera_wt +
## shell_wt, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7335 -0.4753 -0.1042 0.3650 3.0309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.11230 0.02598 4.323 1.60e-05 ***
## sexI -0.36889 0.04066 -9.072 < 2e-16 ***
## sexM 0.02079 0.03380 0.615 0.538
## length 0.33555 0.04417 7.596 4.21e-14 ***
## height 0.29228 0.03544 8.248 2.51e-16 ***
## shucked_wt -0.77729 0.04205 -18.485 < 2e-16 ***
## viscera_wt 0.02188 0.04427 0.494 0.621
## shell_wt 0.63574 0.04431 14.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7055 on 2653 degrees of freedom
## Multiple R-squared: 0.5036, Adjusted R-squared: 0.5023
## F-statistic: 384.5 on 7 and 2653 DF, p-value: < 2.2e-16
anova(ols8_model2 , ols8)
Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
2.65e+03 | 1.32e+03 | | | | |
2.65e+03 | 1.3e+03 | 2 | 21.2 | 21.6 | 4.86e-10 |
summary (ols8_model2)
##
## Call:
## lm(formula = rings ~ sex + length + height + shucked_wt + viscera_wt +
## shell_wt, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7335 -0.4753 -0.1042 0.3650 3.0309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.11230 0.02598 4.323 1.60e-05 ***
## sexI -0.36889 0.04066 -9.072 < 2e-16 ***
## sexM 0.02079 0.03380 0.615 0.538
## length 0.33555 0.04417 7.596 4.21e-14 ***
## height 0.29228 0.03544 8.248 2.51e-16 ***
## shucked_wt -0.77729 0.04205 -18.485 < 2e-16 ***
## viscera_wt 0.02188 0.04427 0.494 0.621
## shell_wt 0.63574 0.04431 14.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7055 on 2653 degrees of freedom
## Multiple R-squared: 0.5036, Adjusted R-squared: 0.5023
## F-statistic: 384.5 on 7 and 2653 DF, p-value: < 2.2e-16
##After removing significant multicoliniearity we can vicera_wt becomes insignificant and hence can be removed
ols8_model3 =lm(rings ~ sex + length +height + shucked_wt + shell_wt, data = train)
summary(ols8_model3)
##
## Call:
## lm(formula = rings ~ sex + length + height + shucked_wt + shell_wt,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7481 -0.4729 -0.1040 0.3689 3.0339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.11314 0.02592 4.365 1.32e-05 ***
## sexI -0.37097 0.04044 -9.173 < 2e-16 ***
## sexM 0.02038 0.03378 0.603 0.546
## length 0.33836 0.04380 7.725 1.58e-14 ***
## height 0.29458 0.03512 8.387 < 2e-16 ***
## shucked_wt -0.76771 0.03731 -20.576 < 2e-16 ***
## shell_wt 0.64207 0.04241 15.139 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7054 on 2654 degrees of freedom
## Multiple R-squared: 0.5036, Adjusted R-squared: 0.5024
## F-statistic: 448.7 on 6 and 2654 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(ols8_model3)

library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
lmtest::dwtest(ols8_model3)
##
## Durbin-Watson test
##
## data: ols8_model3
## DW = 1.4215, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
## Detecting leverages and influencers
autoplot(ols8_model3, which =4:6, nrow =3, ncol =1
)

names(summary(ols8_model3))
## [1] "call" "terms" "residuals" "coefficients"
## [5] "aliased" "sigma" "df" "r.squared"
## [9] "adj.r.squared" "fstatistic" "cov.unscaled"
ols8_model3 %>%
augment() %>%
dplyr::select(rings , .hat , .cooksd) %>%
arrange(desc(.cooksd)) %>%
head()
rings | .hat | .cooksd |
-1.47 | 0.00692 | 0.0152 |
-0.614 | 0.00891 | 0.0148 |
-1.9 | 0.0215 | 0.00987 |
1.53 | 0.00846 | 0.00889 |
1.1 | 0.0182 | 0.00837 |
2.39 | 0.00409 | 0.00674 |
## removing the most influencial data
train_1= train %>%
filter( rings != c(-0.6139 ,-1.4721 , 1.1025 ))
ols8_model4 =lm(rings ~ sex + length +height + shucked_wt + shell_wt, data = train_1)
summary(ols8_model4)
##
## Call:
## lm(formula = rings ~ sex + length + height + shucked_wt + shell_wt,
## data = train_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7481 -0.4729 -0.1040 0.3689 3.0339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.11314 0.02592 4.365 1.32e-05 ***
## sexI -0.37097 0.04044 -9.173 < 2e-16 ***
## sexM 0.02038 0.03378 0.603 0.546
## length 0.33836 0.04380 7.725 1.58e-14 ***
## height 0.29458 0.03512 8.387 < 2e-16 ***
## shucked_wt -0.76771 0.03731 -20.576 < 2e-16 ***
## shell_wt 0.64207 0.04241 15.139 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7054 on 2654 degrees of freedom
## Multiple R-squared: 0.5036, Adjusted R-squared: 0.5024
## F-statistic: 448.7 on 6 and 2654 DF, p-value: < 2.2e-16
## Drop the variables from the test dataset as per the train model.
test_1 = test %>% dplyr::select(sex ,length ,height , shucked_wt , shell_wt, rings)
train_1 = train %>% dplyr:: select(sex ,length ,height , shucked_wt , shell_wt, rings)
set.seed(900)
## Take random observation fro test data
test_5 =sample_n(test_1 , 10)
## predicting avalone rings
no.rings_pred. = predict(ols8_model4 , test_5 , interval = "prediction")
no.rings_obs. = abl.cs$rings
## Destandarization of ring variable
targetmean = mean(abl_num_noout$rings)
targetsd = sd(abl_num_noout$rings)
unscaledtest.obs = round(no.rings_obs. *targetsd + targetmean , 0)
unscaledtest.pred = round (no.rings_pred. *targetsd + targetmean , 0)
obs.age = unscaledtest.obs * 1.5
pred.age = unscaledtest.pred * 1.5
cbind(observed_no.rings = unscaledtest.obs ,predicted =unscaledtest.pred ,observed_age = obs.age , pred.age ) %>% head(10)
## Warning in cbind(observed_no.rings = unscaledtest.obs, predicted =
## unscaledtest.pred, : number of rows of result is not a multiple of vector length
## (arg 1)
## observed_no.rings fit lwr upr observed_age fit lwr upr
## 1 15 10 7 13 22.5 15.0 10.5 19.5
## 2 7 10 6 13 10.5 15.0 9.0 19.5
## 3 9 11 8 14 13.5 16.5 12.0 21.0
## 4 10 9 5 12 15.0 13.5 7.5 18.0
## 5 7 12 8 15 10.5 18.0 12.0 22.5
## 6 8 8 4 11 12.0 12.0 6.0 16.5
## 7 9 9 6 13 13.5 13.5 9.0 19.5
## 8 14 12 8 15 21.0 18.0 12.0 22.5
## 9 10 10 7 13 15.0 15.0 10.5 19.5
## 10 11 8 4 11 16.5 12.0 6.0 16.5
#exp(predict(ols8_model3, newdata=new_data, interval="confidence"))
rmse_trainols8_model=sqrt(mean((train$rings - predict(ols8 , train))^2))
rmse_testols8_model = sqrt(mean((test$rings - predict(ols8 , test))^2))
rmse_trainols8_model4=sqrt(mean((train_1$rings - predict(ols8_model4 , train_1))^2))
rmse_testols8_model4 = sqrt(mean((test_1$rings - predict(ols8_model4 , test_1))^2))
data.frame(model =c("ols8","ols8_model4") ,rmse_train = c(rmse_trainols8_model ,rmse_trainols8_model4) , rmse_test =c(rmse_testols8_model ,rmse_trainols8_model4 ))
model | rmse_train | rmse_test |
ols8 | 0.699 | 0.707 |
ols8_model4 | 0.704 | 0.704 |