Breast Cancer Prediction
Sachin Sharma
10/17/2021
Installing required package
Importing Libraries
library(devtools)
## Loading required package: usethis
library(readr)
library(knitr)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(naniar)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.4 v stringr 1.4.0
## v tidyr 1.1.3 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggcorrplot) # finding the correlation with variables
library(caTools)# splitting data into training set test set
Importing Data
# Load the breast cancer data set from the working directory and preview its first rows.
data_cancer <- read.csv(file = "breastcancer.csv")
head(data_cancer, n = 6L)
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1 842302 M 17.99 10.38 122.80 1001.0
## 2 842517 M 20.57 17.77 132.90 1326.0
## 3 84300903 M 19.69 21.25 130.00 1203.0
## 4 84348301 M 11.42 20.38 77.58 386.1
## 5 84358402 M 20.29 14.34 135.10 1297.0
## 6 843786 M 12.45 15.70 82.57 477.1
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1 0.11840 0.27760 0.3001 0.14710
## 2 0.08474 0.07864 0.0869 0.07017
## 3 0.10960 0.15990 0.1974 0.12790
## 4 0.14250 0.28390 0.2414 0.10520
## 5 0.10030 0.13280 0.1980 0.10430
## 6 0.12780 0.17000 0.1578 0.08089
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1 0.2419 0.07871 1.0950 0.9053 8.589
## 2 0.1812 0.05667 0.5435 0.7339 3.398
## 3 0.2069 0.05999 0.7456 0.7869 4.585
## 4 0.2597 0.09744 0.4956 1.1560 3.445
## 5 0.1809 0.05883 0.7572 0.7813 5.438
## 6 0.2087 0.07613 0.3345 0.8902 2.217
## area_se smoothness_se compactness_se concavity_se concave.points_se
## 1 153.40 0.006399 0.04904 0.05373 0.01587
## 2 74.08 0.005225 0.01308 0.01860 0.01340
## 3 94.03 0.006150 0.04006 0.03832 0.02058
## 4 27.23 0.009110 0.07458 0.05661 0.01867
## 5 94.44 0.011490 0.02461 0.05688 0.01885
## 6 27.19 0.007510 0.03345 0.03672 0.01137
## symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1 0.03003 0.006193 25.38 17.33 184.60
## 2 0.01389 0.003532 24.99 23.41 158.80
## 3 0.02250 0.004571 23.57 25.53 152.50
## 4 0.05963 0.009208 14.91 26.50 98.87
## 5 0.01756 0.005115 22.54 16.67 152.20
## 6 0.02165 0.005082 15.47 23.75 103.40
## area_worst smoothness_worst compactness_worst concavity_worst
## 1 2019.0 0.1622 0.6656 0.7119
## 2 1956.0 0.1238 0.1866 0.2416
## 3 1709.0 0.1444 0.4245 0.4504
## 4 567.7 0.2098 0.8663 0.6869
## 5 1575.0 0.1374 0.2050 0.4000
## 6 741.6 0.1791 0.5249 0.5355
## concave.points_worst symmetry_worst fractal_dimension_worst
## 1 0.2654 0.4601 0.11890
## 2 0.1860 0.2750 0.08902
## 3 0.2430 0.3613 0.08758
## 4 0.2575 0.6638 0.17300
## 5 0.1625 0.2364 0.07678
## 6 0.1741 0.3985 0.12440
# Show the structure of the data frame: dimensions plus each column's type and first values.
str(data_cancer)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ diagnosis : chr "M" "M" "M" "M" ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
To visualize the distribution of every numeric variable in the data frame
# Reshape every numeric column into long format (one row per variable/value pair)
# so that each variable can be drawn in its own density panel.
# pivot_longer() and select(where()) replace the superseded gather() and select_if().
data_1 <- data_cancer %>%
  select(where(is.numeric)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  )
ggplot(data_1, aes(value)) +
  geom_density() +
  facet_wrap(~variable)
# This visualization shows which variables require feature scaling:
# concave points, fractal dimension, smoothness se, ...
All variables are already in numeric form, except diagnosis, which takes the values M and B.
Let's convert it to numeric as well.
# Recode diagnosis as a factor: level "M" is labelled "0" and level "B" is labelled "1".
data_cancer$diagnosis <- factor(
  data_cancer$diagnosis,
  levels = c("M", "B"),
  labels = c("0", "1")
)
Now convert the factor to character and then the character to numeric. If we convert the factor directly to numeric, we
get the internal level codes (1 and 2) instead of the labels (0 and 1).
# Go through character first so the numeric values are the labels, not the factor's level codes.
data_cancer$diagnosis <- as.numeric(as.character(data_cancer$diagnosis))
# Re-inspect the structure to confirm that diagnosis is now numeric.
str(data_cancer)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ diagnosis : num 0 0 0 0 0 0 0 0 0 0 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
# Open the data frame in the data viewer (lower-case view() is presumably tibble's, attached via tidyverse).
view(data_cancer)
Changing the position of the dependent variable, i.e. diagnosis, to the extreme right of the data to avoid confusion.
We will do this using the tidyverse (dplyr) function relocate() with its .after and .before arguments, which are very handy for changing
the position of columns. Here we need to move the diagnosis column after fractal_dimension_worst.
# Move the response column (diagnosis) behind fractal_dimension_worst, i.e. to the far right.
data_cancer <- relocate(
  data_cancer,
  diagnosis,
  .after = fractal_dimension_worst
)
str(object = data_cancer)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
## $ diagnosis : num 0 0 0 0 0 0 0 0 0 0 ...
# Confirm the type and length of the recoded diagnosis column.
str(data_cancer$diagnosis)
## num [1:569] 0 0 0 0 0 0 0 0 0 0 ...
# Print every value of the diagnosis column.
data_cancer$diagnosis
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
## [75] 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
## [112] 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
## [149] 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
## [186] 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
## [223] 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
## [334] 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
## [371] 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
## [408] 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1
## [445] 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
## [482] 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0
## [519] 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [556] 1 1 1 1 1 1 1 0 0 0 0 0 0 1
Visualising the correlation between the variables of the data set
# Pairwise correlation matrix of all columns, computed on rows without missing values,
# then displayed rounded to two decimals.
r <- cor(x = data_cancer, use = "complete.obs")
round(r, digits = 2)
## id radius_mean texture_mean perimeter_mean area_mean
## id 1.00 0.07 0.10 0.07 0.10
## radius_mean 0.07 1.00 0.32 1.00 0.99
## texture_mean 0.10 0.32 1.00 0.33 0.32
## perimeter_mean 0.07 1.00 0.33 1.00 0.99
## area_mean 0.10 0.99 0.32 0.99 1.00
## smoothness_mean -0.01 0.17 -0.02 0.21 0.18
## compactness_mean 0.00 0.51 0.24 0.56 0.50
## concavity_mean 0.05 0.68 0.30 0.72 0.69
## concave.points_mean 0.04 0.82 0.29 0.85 0.82
## symmetry_mean -0.02 0.15 0.07 0.18 0.15
## fractal_dimension_mean -0.05 -0.31 -0.08 -0.26 -0.28
## radius_se 0.14 0.68 0.28 0.69 0.73
## texture_se -0.01 -0.10 0.39 -0.09 -0.07
## perimeter_se 0.14 0.67 0.28 0.69 0.73
## area_se 0.18 0.74 0.26 0.74 0.80
## smoothness_se 0.10 -0.22 0.01 -0.20 -0.17
## compactness_se 0.03 0.21 0.19 0.25 0.21
## concavity_se 0.06 0.19 0.14 0.23 0.21
## concave.points_se 0.08 0.38 0.16 0.41 0.37
## symmetry_se -0.02 -0.10 0.01 -0.08 -0.07
## fractal_dimension_se 0.03 -0.04 0.05 -0.01 -0.02
## radius_worst 0.08 0.97 0.35 0.97 0.96
## texture_worst 0.06 0.30 0.91 0.30 0.29
## perimeter_worst 0.08 0.97 0.36 0.97 0.96
## area_worst 0.11 0.94 0.34 0.94 0.96
## smoothness_worst 0.01 0.12 0.08 0.15 0.12
## compactness_worst 0.00 0.41 0.28 0.46 0.39
## concavity_worst 0.02 0.53 0.30 0.56 0.51
## concave.points_worst 0.04 0.74 0.30 0.77 0.72
## symmetry_worst -0.04 0.16 0.11 0.19 0.14
## fractal_dimension_worst -0.03 0.01 0.12 0.05 0.00
## diagnosis -0.04 -0.73 -0.42 -0.74 -0.71
## smoothness_mean compactness_mean concavity_mean
## id -0.01 0.00 0.05
## radius_mean 0.17 0.51 0.68
## texture_mean -0.02 0.24 0.30
## perimeter_mean 0.21 0.56 0.72
## area_mean 0.18 0.50 0.69
## smoothness_mean 1.00 0.66 0.52
## compactness_mean 0.66 1.00 0.88
## concavity_mean 0.52 0.88 1.00
## concave.points_mean 0.55 0.83 0.92
## symmetry_mean 0.56 0.60 0.50
## fractal_dimension_mean 0.58 0.57 0.34
## radius_se 0.30 0.50 0.63
## texture_se 0.07 0.05 0.08
## perimeter_se 0.30 0.55 0.66
## area_se 0.25 0.46 0.62
## smoothness_se 0.33 0.14 0.10
## compactness_se 0.32 0.74 0.67
## concavity_se 0.25 0.57 0.69
## concave.points_se 0.38 0.64 0.68
## symmetry_se 0.20 0.23 0.18
## fractal_dimension_se 0.28 0.51 0.45
## radius_worst 0.21 0.54 0.69
## texture_worst 0.04 0.25 0.30
## perimeter_worst 0.24 0.59 0.73
## area_worst 0.21 0.51 0.68
## smoothness_worst 0.81 0.57 0.45
## compactness_worst 0.47 0.87 0.75
## concavity_worst 0.43 0.82 0.88
## concave.points_worst 0.50 0.82 0.86
## symmetry_worst 0.39 0.51 0.41
## fractal_dimension_worst 0.50 0.69 0.51
## diagnosis -0.36 -0.60 -0.70
## concave.points_mean symmetry_mean
## id 0.04 -0.02
## radius_mean 0.82 0.15
## texture_mean 0.29 0.07
## perimeter_mean 0.85 0.18
## area_mean 0.82 0.15
## smoothness_mean 0.55 0.56
## compactness_mean 0.83 0.60
## concavity_mean 0.92 0.50
## concave.points_mean 1.00 0.46
## symmetry_mean 0.46 1.00
## fractal_dimension_mean 0.17 0.48
## radius_se 0.70 0.30
## texture_se 0.02 0.13
## perimeter_se 0.71 0.31
## area_se 0.69 0.22
## smoothness_se 0.03 0.19
## compactness_se 0.49 0.42
## concavity_se 0.44 0.34
## concave.points_se 0.62 0.39
## symmetry_se 0.10 0.45
## fractal_dimension_se 0.26 0.33
## radius_worst 0.83 0.19
## texture_worst 0.29 0.09
## perimeter_worst 0.86 0.22
## area_worst 0.81 0.18
## smoothness_worst 0.45 0.43
## compactness_worst 0.67 0.47
## concavity_worst 0.75 0.43
## concave.points_worst 0.91 0.43
## symmetry_worst 0.38 0.70
## fractal_dimension_worst 0.37 0.44
## diagnosis -0.78 -0.33
## fractal_dimension_mean radius_se texture_se
## id -0.05 0.14 -0.01
## radius_mean -0.31 0.68 -0.10
## texture_mean -0.08 0.28 0.39
## perimeter_mean -0.26 0.69 -0.09
## area_mean -0.28 0.73 -0.07
## smoothness_mean 0.58 0.30 0.07
## compactness_mean 0.57 0.50 0.05
## concavity_mean 0.34 0.63 0.08
## concave.points_mean 0.17 0.70 0.02
## symmetry_mean 0.48 0.30 0.13
## fractal_dimension_mean 1.00 0.00 0.16
## radius_se 0.00 1.00 0.21
## texture_se 0.16 0.21 1.00
## perimeter_se 0.04 0.97 0.22
## area_se -0.09 0.95 0.11
## smoothness_se 0.40 0.16 0.40
## compactness_se 0.56 0.36 0.23
## concavity_se 0.45 0.33 0.19
## concave.points_se 0.34 0.51 0.23
## symmetry_se 0.35 0.24 0.41
## fractal_dimension_se 0.69 0.23 0.28
## radius_worst -0.25 0.72 -0.11
## texture_worst -0.05 0.19 0.41
## perimeter_worst -0.21 0.72 -0.10
## area_worst -0.23 0.75 -0.08
## smoothness_worst 0.50 0.14 -0.07
## compactness_worst 0.46 0.29 -0.09
## concavity_worst 0.35 0.38 -0.07
## concave.points_worst 0.18 0.53 -0.12
## symmetry_worst 0.33 0.09 -0.13
## fractal_dimension_worst 0.77 0.05 -0.05
## diagnosis 0.01 -0.57 0.01
## perimeter_se area_se smoothness_se compactness_se
## id 0.14 0.18 0.10 0.03
## radius_mean 0.67 0.74 -0.22 0.21
## texture_mean 0.28 0.26 0.01 0.19
## perimeter_mean 0.69 0.74 -0.20 0.25
## area_mean 0.73 0.80 -0.17 0.21
## smoothness_mean 0.30 0.25 0.33 0.32
## compactness_mean 0.55 0.46 0.14 0.74
## concavity_mean 0.66 0.62 0.10 0.67
## concave.points_mean 0.71 0.69 0.03 0.49
## symmetry_mean 0.31 0.22 0.19 0.42
## fractal_dimension_mean 0.04 -0.09 0.40 0.56
## radius_se 0.97 0.95 0.16 0.36
## texture_se 0.22 0.11 0.40 0.23
## perimeter_se 1.00 0.94 0.15 0.42
## area_se 0.94 1.00 0.08 0.28
## smoothness_se 0.15 0.08 1.00 0.34
## compactness_se 0.42 0.28 0.34 1.00
## concavity_se 0.36 0.27 0.27 0.80
## concave.points_se 0.56 0.42 0.33 0.74
## symmetry_se 0.27 0.13 0.41 0.39
## fractal_dimension_se 0.24 0.13 0.43 0.80
## radius_worst 0.70 0.76 -0.23 0.20
## texture_worst 0.20 0.20 -0.07 0.14
## perimeter_worst 0.72 0.76 -0.22 0.26
## area_worst 0.73 0.81 -0.18 0.20
## smoothness_worst 0.13 0.13 0.31 0.23
## compactness_worst 0.34 0.28 -0.06 0.68
## concavity_worst 0.42 0.39 -0.06 0.64
## concave.points_worst 0.55 0.54 -0.10 0.48
## symmetry_worst 0.11 0.07 -0.11 0.28
## fractal_dimension_worst 0.09 0.02 0.10 0.59
## diagnosis -0.56 -0.55 0.07 -0.29
## concavity_se concave.points_se symmetry_se
## id 0.06 0.08 -0.02
## radius_mean 0.19 0.38 -0.10
## texture_mean 0.14 0.16 0.01
## perimeter_mean 0.23 0.41 -0.08
## area_mean 0.21 0.37 -0.07
## smoothness_mean 0.25 0.38 0.20
## compactness_mean 0.57 0.64 0.23
## concavity_mean 0.69 0.68 0.18
## concave.points_mean 0.44 0.62 0.10
## symmetry_mean 0.34 0.39 0.45
## fractal_dimension_mean 0.45 0.34 0.35
## radius_se 0.33 0.51 0.24
## texture_se 0.19 0.23 0.41
## perimeter_se 0.36 0.56 0.27
## area_se 0.27 0.42 0.13
## smoothness_se 0.27 0.33 0.41
## compactness_se 0.80 0.74 0.39
## concavity_se 1.00 0.77 0.31
## concave.points_se 0.77 1.00 0.31
## symmetry_se 0.31 0.31 1.00
## fractal_dimension_se 0.73 0.61 0.37
## radius_worst 0.19 0.36 -0.13
## texture_worst 0.10 0.09 -0.08
## perimeter_worst 0.23 0.39 -0.10
## area_worst 0.19 0.34 -0.11
## smoothness_worst 0.17 0.22 -0.01
## compactness_worst 0.48 0.45 0.06
## concavity_worst 0.66 0.55 0.04
## concave.points_worst 0.44 0.60 -0.03
## symmetry_worst 0.20 0.14 0.39
## fractal_dimension_worst 0.44 0.31 0.08
## diagnosis -0.25 -0.41 0.01
## fractal_dimension_se radius_worst texture_worst
## id 0.03 0.08 0.06
## radius_mean -0.04 0.97 0.30
## texture_mean 0.05 0.35 0.91
## perimeter_mean -0.01 0.97 0.30
## area_mean -0.02 0.96 0.29
## smoothness_mean 0.28 0.21 0.04
## compactness_mean 0.51 0.54 0.25
## concavity_mean 0.45 0.69 0.30
## concave.points_mean 0.26 0.83 0.29
## symmetry_mean 0.33 0.19 0.09
## fractal_dimension_mean 0.69 -0.25 -0.05
## radius_se 0.23 0.72 0.19
## texture_se 0.28 -0.11 0.41
## perimeter_se 0.24 0.70 0.20
## area_se 0.13 0.76 0.20
## smoothness_se 0.43 -0.23 -0.07
## compactness_se 0.80 0.20 0.14
## concavity_se 0.73 0.19 0.10
## concave.points_se 0.61 0.36 0.09
## symmetry_se 0.37 -0.13 -0.08
## fractal_dimension_se 1.00 -0.04 0.00
## radius_worst -0.04 1.00 0.36
## texture_worst 0.00 0.36 1.00
## perimeter_worst 0.00 0.99 0.37
## area_worst -0.02 0.98 0.35
## smoothness_worst 0.17 0.22 0.23
## compactness_worst 0.39 0.48 0.36
## concavity_worst 0.38 0.57 0.37
## concave.points_worst 0.22 0.79 0.36
## symmetry_worst 0.11 0.24 0.23
## fractal_dimension_worst 0.59 0.09 0.22
## diagnosis -0.08 -0.78 -0.46
## perimeter_worst area_worst smoothness_worst
## id 0.08 0.11 0.01
## radius_mean 0.97 0.94 0.12
## texture_mean 0.36 0.34 0.08
## perimeter_mean 0.97 0.94 0.15
## area_mean 0.96 0.96 0.12
## smoothness_mean 0.24 0.21 0.81
## compactness_mean 0.59 0.51 0.57
## concavity_mean 0.73 0.68 0.45
## concave.points_mean 0.86 0.81 0.45
## symmetry_mean 0.22 0.18 0.43
## fractal_dimension_mean -0.21 -0.23 0.50
## radius_se 0.72 0.75 0.14
## texture_se -0.10 -0.08 -0.07
## perimeter_se 0.72 0.73 0.13
## area_se 0.76 0.81 0.13
## smoothness_se -0.22 -0.18 0.31
## compactness_se 0.26 0.20 0.23
## concavity_se 0.23 0.19 0.17
## concave.points_se 0.39 0.34 0.22
## symmetry_se -0.10 -0.11 -0.01
## fractal_dimension_se 0.00 -0.02 0.17
## radius_worst 0.99 0.98 0.22
## texture_worst 0.37 0.35 0.23
## perimeter_worst 1.00 0.98 0.24
## area_worst 0.98 1.00 0.21
## smoothness_worst 0.24 0.21 1.00
## compactness_worst 0.53 0.44 0.57
## concavity_worst 0.62 0.54 0.52
## concave.points_worst 0.82 0.75 0.55
## symmetry_worst 0.27 0.21 0.49
## fractal_dimension_worst 0.14 0.08 0.62
## diagnosis -0.78 -0.73 -0.42
## compactness_worst concavity_worst concave.points_worst
## id 0.00 0.02 0.04
## radius_mean 0.41 0.53 0.74
## texture_mean 0.28 0.30 0.30
## perimeter_mean 0.46 0.56 0.77
## area_mean 0.39 0.51 0.72
## smoothness_mean 0.47 0.43 0.50
## compactness_mean 0.87 0.82 0.82
## concavity_mean 0.75 0.88 0.86
## concave.points_mean 0.67 0.75 0.91
## symmetry_mean 0.47 0.43 0.43
## fractal_dimension_mean 0.46 0.35 0.18
## radius_se 0.29 0.38 0.53
## texture_se -0.09 -0.07 -0.12
## perimeter_se 0.34 0.42 0.55
## area_se 0.28 0.39 0.54
## smoothness_se -0.06 -0.06 -0.10
## compactness_se 0.68 0.64 0.48
## concavity_se 0.48 0.66 0.44
## concave.points_se 0.45 0.55 0.60
## symmetry_se 0.06 0.04 -0.03
## fractal_dimension_se 0.39 0.38 0.22
## radius_worst 0.48 0.57 0.79
## texture_worst 0.36 0.37 0.36
## perimeter_worst 0.53 0.62 0.82
## area_worst 0.44 0.54 0.75
## smoothness_worst 0.57 0.52 0.55
## compactness_worst 1.00 0.89 0.80
## concavity_worst 0.89 1.00 0.86
## concave.points_worst 0.80 0.86 1.00
## symmetry_worst 0.61 0.53 0.50
## fractal_dimension_worst 0.81 0.69 0.51
## diagnosis -0.59 -0.66 -0.79
## symmetry_worst fractal_dimension_worst diagnosis
## id -0.04 -0.03 -0.04
## radius_mean 0.16 0.01 -0.73
## texture_mean 0.11 0.12 -0.42
## perimeter_mean 0.19 0.05 -0.74
## area_mean 0.14 0.00 -0.71
## smoothness_mean 0.39 0.50 -0.36
## compactness_mean 0.51 0.69 -0.60
## concavity_mean 0.41 0.51 -0.70
## concave.points_mean 0.38 0.37 -0.78
## symmetry_mean 0.70 0.44 -0.33
## fractal_dimension_mean 0.33 0.77 0.01
## radius_se 0.09 0.05 -0.57
## texture_se -0.13 -0.05 0.01
## perimeter_se 0.11 0.09 -0.56
## area_se 0.07 0.02 -0.55
## smoothness_se -0.11 0.10 0.07
## compactness_se 0.28 0.59 -0.29
## concavity_se 0.20 0.44 -0.25
## concave.points_se 0.14 0.31 -0.41
## symmetry_se 0.39 0.08 0.01
## fractal_dimension_se 0.11 0.59 -0.08
## radius_worst 0.24 0.09 -0.78
## texture_worst 0.23 0.22 -0.46
## perimeter_worst 0.27 0.14 -0.78
## area_worst 0.21 0.08 -0.73
## smoothness_worst 0.49 0.62 -0.42
## compactness_worst 0.61 0.81 -0.59
## concavity_worst 0.53 0.69 -0.66
## concave.points_worst 0.50 0.51 -0.79
## symmetry_worst 1.00 0.54 -0.42
## fractal_dimension_worst 0.54 1.00 -0.32
## diagnosis -0.42 -0.32 1.00
The ggcorrplot package provides a solution for reordering the correlation matrix and displays the significance level on the correlogram.
# It also includes a function for computing a matrix of correlation p-values.
# First plot: full correlation matrix with the default settings.
ggcorrplot(r)
# Second plot: lower triangle only, variables reordered by hierarchical clustering,
# with a custom blue/white/orange colour scale.
ggcorrplot(
  r,
  hc.order = TRUE,
  type = "lower",
  outline.col = "white",
  ggtheme = ggplot2::theme_gray,
  colors = c("#6D9EC1", "white", "#E46726")
)
Visualising the missing values in the data using naniar
# Visualise missingness across the whole data frame with naniar's vis_miss().
vis_miss(data_cancer)
# As per the plot above there are no missing values; let's confirm this another way
# by counting the NA cells in the whole data frame.
sum(is.na(data_cancer))
## [1] 0
Let's check whether any individual column has missing values
# Count the missing values in each column. vapply() guarantees a named integer
# vector, whereas sapply() changes its return type on empty input.
vapply(data_cancer, function(x) sum(is.na(x)), integer(1))
By using the above three methods it is confirmed that above data has no missing values
Splitting data into training set and test set
# Fix the random seed so the train/test split, and every result derived from it,
# is reproducible between runs.
set.seed(123)
# sample.split() returns a logical vector: TRUE marks the rows assigned to the
# training set (75% here), FALSE the rows assigned to the test set.
split <- sample.split(data_cancer$diagnosis, SplitRatio = 0.75)
train_set <- subset(data_cancer, split == TRUE)
test_set <- subset(data_cancer, split == FALSE)
# Inspect the training set. Uses lower-case view() like the rest of this file;
# utils::View() may fail when the document is rendered non-interactively.
view(train_set)
Feature scaling on a few columns: column 2 to column 5
# Standardise columns 2-5 (radius_mean, texture_mean, perimeter_mean, area_mean).
# The test set must be transformed with the TRAINING set's mean and sd: scaling it
# with its own statistics puts the two sets on different scales and leaks
# test-set information into preprocessing.
train_scaled_2_5 <- scale(train_set[, 2:5])
train_set[, 2:5] <- train_scaled_2_5
test_set[, 2:5] <- scale(
  test_set[, 2:5],
  center = attr(train_scaled_2_5, "scaled:center"),
  scale = attr(train_scaled_2_5, "scaled:scale")
)
view(train_set)
data.frame(colnames(data_cancer)) # list each column name next to its index number
## colnames.data_cancer.
## 1 id
## 2 radius_mean
## 3 texture_mean
## 4 perimeter_mean
## 5 area_mean
## 6 smoothness_mean
## 7 compactness_mean
## 8 concavity_mean
## 9 concave.points_mean
## 10 symmetry_mean
## 11 fractal_dimension_mean
## 12 radius_se
## 13 texture_se
## 14 perimeter_se
## 15 area_se
## 16 smoothness_se
## 17 compactness_se
## 18 concavity_se
## 19 concave.points_se
## 20 symmetry_se
## 21 fractal_dimension_se
## 22 radius_worst
## 23 texture_worst
## 24 perimeter_worst
## 25 area_worst
## 26 smoothness_worst
## 27 compactness_worst
## 28 concavity_worst
## 29 concave.points_worst
## 30 symmetry_worst
## 31 fractal_dimension_worst
## 32 diagnosis
Feature scaling on a few columns: column 14 to column 15
# Standardise columns 14-15 (perimeter_se, area_se).
# The three statements were fused onto a single line, which is not valid R; they
# are separated here. The test set is scaled with the training set's mean and sd
# so that both sets share one scale.
train_scaled_14_15 <- scale(train_set[, 14:15])
train_set[, 14:15] <- train_scaled_14_15
test_set[, 14:15] <- scale(
  test_set[, 14:15],
  center = attr(train_scaled_14_15, "scaled:center"),
  scale = attr(train_scaled_14_15, "scaled:scale")
)
view(train_set)
Feature scaling on a few columns: column 22 to column 25
# Standardise columns 22-25 (radius_worst, texture_worst, perimeter_worst, area_worst).
# As with the other scaled columns, the test set reuses the training set's mean
# and sd instead of its own.
train_scaled_22_25 <- scale(train_set[, 22:25])
train_set[, 22:25] <- train_scaled_22_25
test_set[, 22:25] <- scale(
  test_set[, 22:25],
  center = attr(train_scaled_22_25, "scaled:center"),
  scale = attr(train_scaled_22_25, "scaled:scale")
)
view(train_set)
view(test_set)
Multiple regression model:
# Fit a linear model of diagnosis on every other column of the training set.
regressor <- lm(formula = diagnosis ~ ., data = train_set)
# The visreg package provides tools for visualizing these conditional relationships.
# The visreg function takes (1) the model and (2) the variable of interest and plots
# the conditional relationship, controlling for the other variables. The option
# gg = TRUE is used to produce a ggplot2 graph.
Conditional plot of diagnosis vs. texture mean; we can compare diagnosis with the other variables in the data to check their relationships
Logistic Regression Model
# Logistic regression of diagnosis on all other columns.
# Fit on train_set, not data_cancer: the full data frame contains the test rows
# and is unscaled, while predictions are made on the scaled test_set. Training
# on data_cancer therefore both leaks the test data and mismatches the feature scales.
# NOTE(review): `.` also uses the `id` column as a predictor — confirm whether
# that is intended.
regressor_lr <- glm(
  formula = diagnosis ~ .,
  family = binomial,
  data = train_set
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Predicting the test set results
# Predicted probabilities for the test rows (column 32, diagnosis, is dropped from
# newdata), then thresholded at 0.5 into class labels 1 / 0.
prob_pred <- predict(
  regressor_lr,
  newdata = test_set[-32],
  type = "response"
)
y_pred <- ifelse(prob_pred > 0.5, 1, 0)
Making confusion matrix
# Confusion matrix: rows are the actual diagnosis (column 32), columns the predicted class.
cm <- table(test_set[, 32], y_pred)
cm
## y_pred
## 0
## 0 53
## 1 89
SVM Model
library(e1071)
# Linear-kernel support vector classifier of diagnosis on all other training-set columns.
regressor_svm <- svm(
  formula = diagnosis ~ .,
  data = train_set,
  kernel = "linear",
  type = "C-classification"
)
Predicting the test set results
# Class predictions of the SVM for the test rows (diagnosis, column 32, removed from newdata).
y_pred1 <- predict(regressor_svm, newdata = test_set[-32])
Making confusion matrix
# Confusion matrix for the SVM: actual diagnosis (rows) against predicted class (columns).
cm <- table(test_set[, 32], y_pred1)
cm
## y_pred1
## 0 1
## 0 51 2
## 1 2 87
// add bootstrap table styles to pandoc tables function bootstrapStylePandocTables() { $('tr.odd').parent('tbody').parent('table').addClass('table table-condensed'); } $(document).ready(function () { bootstrapStylePandocTables(); });