Practical 10
Principal component analysis using R
data("iris")
> str(iris)
'[Link]': 150 obs. of 5 variables:
$ [Link]: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ [Link] : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ [Link]: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ [Link] : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1
1 1 1 1 1 1 ...
> #partition of data
> [Link](111)
> ind <- sample(2, nrow(iris),
+ replace = TRUE,
+ prob = c(0.8, 0.2))
> training <- iris[ind==1,];
> testing <- iris[ind==2,];
> #Scatter Plot & Correlations
> [Link]("psych")
> library(psych)
> [Link](training[,-5],
+ gap = 0,
+ bg = c("red", "yellow", "blue")[training$Species],
+ pch=21)
> #Principal Component Analysis
> pc <- prcomp(training[,-5],
+ center = TRUE,
+ scale. = TRUE)
> attributes(pc)
$names
[1] "sdev" "rotation" "center" "scale"
[5] "x"
$class
[1] "prcomp"
> print(pc)
Standard deviations (1, .., p=4):
[1] 1.7173318 0.9403519 0.3843232 0.1371332
Rotation (n x k) = (4 x 4):
PC1 PC2 PC3
[Link] 0.5147163 -0.39817685 0.7242679
[Link] -0.2926048 -0.91328503 -0.2557463
[Link] 0.5772530 -0.02932037 -0.1755427
[Link] 0.5623421 -0.08065952 -0.6158040
PC4
[Link] 0.2279438
[Link] -0.1220110
[Link] -0.7969342
[Link] 0.5459403
Practical 11
Factor analysis using R
# Load the dataset
> data(iris)
> # View the first few rows of the dataset
> head(iris)
[Link] [Link] [Link]
1 5.1 3.5 1.4
2 4.9 3.0 1.4
3 4.7 3.2 1.3
4 4.6 3.1 1.5
5 5.0 3.6 1.4
6 5.4 3.9 1.7
[Link] Species
1 0.2 setosa
2 0.2 setosa
3 0.2 setosa
4 0.2 setosa
5 0.2 setosa
6 0.4 setosa
> # Scale the data
> iris_scaled <- scale(iris[,1:4])
> # Perform factor analysis
> library(psych)
> fa <- fa(r = iris_scaled,
+ nfactors = 4,
+ rotate = "varimax")
> summary(fa)
Factor analysis with Call: fa(r = iris_scaled, nfactors = 4, rotate
= "varimax")
Test of the hypothesis that 4 factors are sufficient.
The degrees of freedom for the model is -4 and the objective
function was 0
The number of observations was 150 with Chi Square = 0
with prob < NA
The root mean square of the residuals (RMSA) is 0
The df corrected root mean square of the residuals is NA
Tucker Lewis Index of factoring reliability = 1.009
> # View the factor loadings
> fa$loadings
Loadings:
MR1 MR2 MR3 MR4
[Link] 0.997
[Link] -0.108 0.757
[Link] 0.861 -0.413 0.288
[Link] 0.801 -0.317 0.492
MR1 MR2 MR3 MR4
SS loadings 2.389 0.844 0.332 0.000
Proportion Var 0.597 0.211 0.083 0.000
Cumulative Var 0.597 0.808 0.891 0.891
>
LINEAR DISCREMINANT ANALYSIS
library(MASS)
> library(tidyverse)
> library(caret)
> theme_set(theme_classic())
> # Load the data
> data("iris")
> # Split the data into training (80%) and test set (20%)
> [Link](123)
> [Link] <- iris$Species %>%
+ createDataPartition(p = 0.8, list = FALSE)
> [Link] <- iris[[Link], ]
> [Link] <- iris[-[Link], ]
> # Estimate preprocessing parameters
> [Link] <- [Link] %>%
+ preProcess(method = c("center", "scale"))
> # Transform the data using the estimated parameters
> [Link] <- [Link] %>% predict([Link])
> [Link] <- [Link] %>% predict([Link])
> # Fit the model
> model <- lda(Species~., data = [Link])
> # Make predictions
> predictions <- model %>% predict([Link])
> # Model accuracy
> mean(predictions$class==[Link]$Species)
[1] 0.9666667
> model <- lda(Species~., data = [Link])
> model
Call:
lda(Species ~ ., data = [Link])
Prior probabilities of groups:
setosa versicolor virginica
0.3333333 0.3333333 0.3333333
Group means:
[Link] [Link] [Link]
setosa -1.0112835 0.78048647 -1.2900001
versicolor 0.1014181 -0.68674658 0.2566029
virginica 0.9098654 -0.09373989 1.0333972
[Link]
setosa -1.2453195
versicolor 0.1472614
virginica 1.0980581
Coefficients of linear discriminants:
LD1 LD2
[Link] 0.6794973 0.04463786
[Link] 0.6565085 -1.00330120
[Link] -3.8365047 1.44176147
[Link] -2.2722313 -1.96516251
Proportion of trace:
LD1 LD2
0.9902 0.0098
> cluster analysis
library(factoextra)
> # Loading dataset
> df <- mtcars
> # Omitting any NA values
> df <- [Link](df)
> # Scaling dataset
> df <- scale(df)
> # output to be present as PNG file
> png(file = "[Link]")
> km <- kmeans(df, centers = 4, nstart = 25)
> # Visualize the clusters
> fviz_cluster(km, data = df)
> # saving the file
> [Link]()
null device
1
> # output to be present as PNG file
> png(file = "[Link]")
> km <- kmeans(df, centers = 5, nstart = 25)
> # Visualize the clusters
> fviz_cluster(km, data = df)
> # saving the file
> [Link]()
null device
1
>
# Installing the package
[Link]("dplyr")
# Loading package
> library(dplyr)
> # Summary of dataset in package
> summary(mtcars)
mpg cyl disp
Min. :10.40 Min. :4.000 Min. : 71.1
1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8
Median :19.20 Median :6.000 Median :196.3
Mean :20.09 Mean :6.188 Mean :230.7
3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0
Max. :33.90 Max. :8.000 Max. :472.0
hp drat wt
Min. : 52.0 Min. :2.760 Min. :1.513
1st Qu.: 96.5 1st Qu.:3.080 1st Qu.:2.581
Median :123.0 Median :3.695 Median :3.325
Mean :146.7 Mean :3.597 Mean :3.217
3rd Qu.:180.0 3rd Qu.:3.920 3rd Qu.:3.610
Max. :335.0 Max. :4.930 Max. :5.424
qsec vs
Min. :14.50 Min. :0.0000
1st Qu.:16.89 1st Qu.:0.0000
Median :17.71 Median :0.0000
Mean :17.85 Mean :0.4375
3rd Qu.:18.90 3rd Qu.:1.0000
Max. :22.90 Max. :1.0000
am gear carb
Min. :0.0000 Min. :3.000 Min. :1.000
1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
Median :0.0000 Median :4.000 Median :2.000
Mean :0.4062 Mean :3.688 Mean :2.812
3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
Max. :1.0000 Max. :5.000 Max. :8.000
> Installing the package
> # For Logistic regression
> [Link]("caTools")
Error in [Link] : Updating loaded packages
> [Link]("caTools")
> # For ROC curve to evaluate model
> [Link]("ROCR")
Error in [Link] : Updating loaded packages
> [Link]("ROCR")
> # Loading package
> library(caTools)
> library(ROCR)
> # Splitting dataset
> split <- [Link](mtcars, SplitRatio = 0.8)
> split
[1] TRUE TRUE TRUE TRUE FALSE TRUE FALSE
[8] TRUE TRUE TRUE FALSE
> train_reg <- subset(mtcars, split == "TRUE")
> test_reg <- subset(mtcars, split == "FALSE")
> # Training model
> logistic_model <- glm(vs ~ wt + disp,
+ data = train_reg,
+ family = "binomial")
> logistic_model
Call: glm(formula = vs ~ wt + disp, family = "binomial", data =
train_reg)
Coefficients:
(Intercept) wt disp
3.65725 0.50755 -0.02638
Degrees of Freedom: 23 Total (i.e. Null); 21 Residual
Null Deviance: 33.27
Residual Deviance: 17.64 AIC: 23.64
> # Summary
> summary(logistic_model)
Call:
glm(formula = vs ~ wt + disp, family = "binomial", data =
train_reg)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.65725 3.20933 1.140 0.254
wt 0.50755 1.74052 0.292 0.771
disp -0.02638 0.01611 -1.638 0.102
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 33.271 on 23 degrees of freedom
Residual deviance: 17.638 on 21 degrees of freedom
AIC: 23.638
Number of Fisher Scoring iterations: 6
> predict_reg <- predict(logistic_model,
+ test_reg, type = "response")
> predict_reg
Hornet Sportabout Duster 360
0.01642104 0.01752145
Merc 280C Lincoln Continental
0.72757938 0.00325796
Fiat 128 Dodge Challenger
0.93690626 0.05001231
Porsche 914-2 Ford Pantera L
0.82781339 0.01812313
>
# Create the data frame
> data <- [Link](
+ Years_Exp = c(1.1, 1.3, 1.5, 2.0, 2.2, 2.9, 3.0, 3.2, 3.2, 3.7),
+ Salary = c(39343.00, 46205.00, 37731.00, 43525.00,
+ 39891.00, 56642.00, 60150.00, 54445.00, 64445.00,
57189.00)
+)
> # Create the scatter plot
> plot(data$Years_Exp, data$Salary,
+ xlab = "Years Experienced",
+ ylab = "Salary",
+ main = "Scatter Plot of Years Experienced vs Salary")
> [Link]('caTools')
Error in [Link] : Updating loaded packages
> [Link]("caTools")
WARNING: Rtools is required to build R packages but is not
currently installed. Please download and install the appropriate
version of Rtools before proceeding:
[Link]
Warning in [Link] :
package ‘caTools’ is in use and will not be installed
> library(caTools)
> split = [Link](data$Salary, SplitRatio = 0.7)
> trainingset = subset(data, split == TRUE)
> testset = subset(data, split == FALSE)
> # Fitting Simple Linear Regression to the Training set
> lm.r= lm(formula = Salary ~ Years_Exp,
+ data = trainingset)
> #Summary of the model
> summary(lm.r)
Call:
lm(formula = Salary ~ Years_Exp, data = trainingset)
Residuals:
1 2 3 5 6 7
479.5 5713.8 -4387.8 -7924.6 3129.5 5823.7
10
-2834.1
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 29911 5770 5.184 0.00351
Years_Exp 8138 2382 3.417 0.01890
(Intercept) **
Years_Exp *
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 5774 on 5 degrees of freedom
Multiple R-squared: 0.7002, Adjusted R-squared:
0.6402
F-statistic: 11.68 on 1 and 5 DF, p-value: 0.0189
> # Create a data frame with new input values
> new_data <- [Link](Years_Exp = c(4.0, 4.5, 5.0))
> # Predict using the linear regression model
> predicted_salaries <- predict(lm.r, newdata = new_data)
> # Display the predicted salaries
> print(predicted_salaries)
1 2 3
62464.62 66533.78 70602.94
>
> # given below is the data pertaining to income, years of service
and tax paid by
#by 10 individuals working in multinomial company
# given below is the data pertaining to income ,years of service and
tax paid by
#by 10 individuals working in multinomial company
Taxdata<-[Link](
taxpaid=c(7,10,8,6,5,5,9,6,9,7),
Income=c(2.5,3,2,2.2,1.8,1.5,1.5,1.9,2.3,1.6),
years=c(9,13,10,10,8,8,12,10,9,5))
# fitting of multiple regression
mulreg1<-lm(Taxdata$taxpaid~Taxdata$Income+Taxdata$years)
mulreg1
summary(mulreg)