4. Program to measure central tendency and measures of
dispersion: mean, median, mode, standard deviation,
variance, mean deviation, and quartile deviation for a
frequency distribution/data.
import numpy as np
from scipy import stats
# Sample frequency distribution as (value, frequency) pairs
data = [(1, 5), (2, 10), (3, 15), (4, 20), (5, 10)]

# Expand the distribution into raw observations: every value is repeated
# `frequency` times so the array routines below can be applied directly.
expanded_data = []
for value, frequency in data:
    expanded_data.extend([value] * frequency)

# Convert to a NumPy array so element-wise arithmetic (data - mean) works
expanded_data = np.array(expanded_data)

# Central tendency
mean = np.mean(expanded_data)
median = np.median(expanded_data)
# Older SciPy releases return the mode as a length-1 array, newer ones return
# a scalar for 1-D input; np.atleast_1d makes the indexing valid for both.
mode = np.atleast_1d(stats.mode(expanded_data).mode)[0]

# Measures of dispersion (np.var / np.std are called with their defaults)
variance = np.var(expanded_data)
std_deviation = np.std(expanded_data)
mean_deviation = np.mean(np.abs(expanded_data - mean))

# Quartiles: quartile deviation is half of the inter-quartile range
Q1 = np.percentile(expanded_data, 25)
Q3 = np.percentile(expanded_data, 75)
quartile_deviation = (Q3 - Q1) / 2

# Displaying the results
print(f"Mean: {mean}")
print(f"Median: {median}")
print(f"Mode: {mode}")
print(f"Variance: {variance}")
print(f"Standard Deviation: {std_deviation}")
print(f"Mean Deviation: {mean_deviation}")
print(f"Quartile Deviation: {quartile_deviation}")
5. Program to perform cross-validation for a given dataset to
measure Root Mean Squared Error (RMSE), Mean Absolute Error
(MAE) and R² score using the validation-set, leave-one-out
cross-validation (LOOCV) and k-fold cross-validation approaches.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, LeaveOneOut, train_test_split

# Generate synthetic data
X, y = make_regression(n_samples=100, n_features=1, noise=10)

# Initialize model
model = LinearRegression()

# Validation-set approach: a single hold-out split (20% of the samples)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print("Validation Set Metrics:")
print("RMSE:", np.sqrt(mean_squared_error(y_test, predictions)))
print("MAE:", mean_absolute_error(y_test, predictions))
print("R-squared:", r2_score(y_test, predictions))

# K-Fold Cross Validation: score every fold, then report the fold average
kf = KFold(n_splits=5)
fold_rmse, fold_mae, fold_r2 = [], [], []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    fold_rmse.append(np.sqrt(mean_squared_error(y_test, predictions)))
    fold_mae.append(mean_absolute_error(y_test, predictions))
    fold_r2.append(r2_score(y_test, predictions))
print("K-Fold Metrics:")
print("RMSE:", np.mean(fold_rmse))
print("MAE:", np.mean(fold_mae))
print("R-squared:", np.mean(fold_r2))

# Leave-One-Out Cross Validation
# Each test fold holds exactly one sample, for which R-squared is undefined,
# so the held-out predictions are pooled and scored once at the end.
loo = LeaveOneOut()
loo_predictions = np.empty_like(y, dtype=float)
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    model.fit(X_train, y_train)
    loo_predictions[test_index] = model.predict(X_test)
print("LOOCV Metrics:")
print("RMSE:", np.sqrt(mean_squared_error(y, loo_predictions)))
print("MAE:", mean_absolute_error(y, loo_predictions))
print("R-squared:", r2_score(y, loo_predictions))
6. Program to display Normal, Binomial, Poisson, and Bernoulli
distributions for a given frequency distribution and analyze
the results.
import matplotlib.pyplot as plt
import numpy as np
# Function to calculate normal distribution
def normal_distribution(x, mu, sigma):
    """Return the normal probability density evaluated at ``x``.

    Args:
        x: A scalar or a sequence/array of points at which to evaluate.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution (used as a divisor).

    Returns:
        The density at each point of ``x`` (a NumPy scalar or array).
    """
    # Accept plain lists as well as arrays/scalars for the arithmetic below.
    x = np.asarray(x, dtype=float)
    normaliser = 1 / (sigma * np.sqrt(2 * np.pi))
    return normaliser * np.exp(-0.5 * ((x - mu) / sigma) ** 2)
# Function to calculate binomial distribution
def binomial_distribution(n, p, k):
    """Return the binomial probability of ``k`` successes in ``n`` trials.

    Args:
        n: Number of trials.
        p: Probability of success on a single trial.
        k: Number of successes.

    Returns:
        ``comb(n, k) * p**k * (1 - p)**(n - k)``.
    """
    from math import comb

    return comb(n, k) * (p ** k) * ((1 - p) ** (n - k))
# Function to calculate Poisson distribution
def poisson_distribution(lmbda, k):
    """Return the Poisson probability of observing ``k`` events.

    Args:
        lmbda: Rate parameter of the distribution.
        k: Number of events (passed to ``math.factorial``).

    Returns:
        ``lmbda**k * exp(-lmbda) / k!``.
    """
    from math import exp, factorial

    return (lmbda ** k * exp(-lmbda)) / factorial(k)
# Function to calculate Bernoulli distribution
def bernoulli_distribution(p, k):
    """Return the Bernoulli probability of outcome ``k``.

    Args:
        p: Probability of the outcome ``1``.
        k: Outcome to evaluate; the distribution is supported on 0 and 1.

    Returns:
        ``p`` for ``k == 1``, ``1 - p`` for ``k == 0``, and ``0.0`` for any
        other ``k`` (outside the support the closed-form expression would
        otherwise produce a meaningless non-zero number).
    """
    if k not in (0, 1):
        return 0.0
    return p ** k * (1 - p) ** (1 - k)
# Parameters
mu = 0
sigma = 1
n = 10
p = 0.5
lmbda = 3

# X values for normal distribution
x = np.linspace(-5, 5, 100)
normal_y = normal_distribution(x, mu, sigma)

# X values for binomial distribution
k_values = np.arange(0, n + 1)
# int(k) hands the math-based helpers plain Python integers rather than
# NumPy integer scalars.
binomial_y = [binomial_distribution(n, p, int(k)) for k in k_values]

# X values for Poisson distribution
poisson_k_values = np.arange(0, 15)
poisson_y = [poisson_distribution(lmbda, int(k)) for k in poisson_k_values]

# X values for Bernoulli distribution
bernoulli_k_values = [0, 1]
bernoulli_y = [bernoulli_distribution(p, k) for k in bernoulli_k_values]

# Plotting: one figure holding a 2 x 2 grid, one panel per distribution
plt.figure(figsize=(12, 8))

# Normal Distribution (continuous, so drawn as a curve)
plt.subplot(2, 2, 1)
plt.plot(x, normal_y, label='Normal Distribution', color='blue')
plt.title('Normal Distribution')
plt.xlabel('X')
plt.ylabel('Probability Density')
plt.legend()

# Binomial Distribution (discrete, so drawn as bars)
plt.subplot(2, 2, 2)
plt.bar(k_values, binomial_y, label='Binomial Distribution', color='orange')
plt.title('Binomial Distribution')
plt.xlabel('Number of Successes')
plt.ylabel('Probability')
plt.legend()

# Poisson Distribution
plt.subplot(2, 2, 3)
plt.bar(poisson_k_values, poisson_y, label='Poisson Distribution', color='green')
plt.title('Poisson Distribution')
plt.xlabel('Number of Events')
plt.ylabel('Probability')
plt.legend()

# Bernoulli Distribution
plt.subplot(2, 2, 4)
plt.bar(bernoulli_k_values, bernoulli_y, label='Bernoulli Distribution', color='red')
plt.title('Bernoulli Distribution')
plt.xlabel('Outcome')
plt.ylabel('Probability')
plt.xticks(bernoulli_k_values)
plt.legend()

plt.tight_layout()
plt.show()
7. Program to implement one-sample, two-sample and paired-
sample t-tests for simple data and analyze the results.
import math
def one_sample_t_test(sample, population_mean):
    """Compute the one-sample t-statistic of ``sample`` against a mean.

    Args:
        sample: Sequence of numeric observations. At least two are needed,
            because the variance divides by ``len(sample) - 1``.
        population_mean: Hypothesised mean to compare the sample against.

    Returns:
        A ``(t_statistic, sample_mean)`` tuple.

    Raises:
        ZeroDivisionError: If ``sample`` has fewer than two observations or
            all observations are identical (zero standard deviation).
    """
    n = len(sample)
    sample_mean = sum(sample) / n
    # Sample standard deviation with the n - 1 denominator
    sample_std = math.sqrt(sum((x - sample_mean) ** 2 for x in sample) / (n - 1))
    standard_error = sample_std / math.sqrt(n)
    t_statistic = (sample_mean - population_mean) / standard_error
    return t_statistic, sample_mean
# Sample data
sample_data = [2.3, 2.5, 2.8, 3.0, 2.7]
population_mean = 2.5
t_statistic, sample_mean = one_sample_t_test(sample_data, population_mean)
# The message is split into adjacent f-strings so the literal never has to
# span a physical line break.
print(
    f"One-Sample T-Test: t-statistic = {t_statistic}, "
    f"Sample Mean = {sample_mean}"
)
def two_sample_t_test(sample1, sample2):
    """Compute the pooled-variance two-sample t-statistic.

    Args:
        sample1: First sequence of numeric observations (two or more).
        sample2: Second sequence of numeric observations (two or more).

    Returns:
        A ``(t_statistic, mean1, mean2)`` tuple.

    Raises:
        ZeroDivisionError: If either sample has fewer than two observations
            or the pooled standard deviation is zero.
    """
    n1, n2 = len(sample1), len(sample2)
    mean1 = sum(sample1) / n1
    mean2 = sum(sample2) / n2
    std1 = math.sqrt(sum((x - mean1) ** 2 for x in sample1) / (n1 - 1))
    std2 = math.sqrt(sum((x - mean2) ** 2 for x in sample2) / (n2 - 1))
    # Pool the two sample variances, weighting each by its degrees of freedom
    pooled_std = math.sqrt(
        ((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2)
    )
    t_statistic = (mean1 - mean2) / (pooled_std * math.sqrt(1 / n1 + 1 / n2))
    return t_statistic, mean1, mean2
# Sample data
sample_data1 = [2.3, 2.5, 2.8, 3.0, 2.7]
sample_data2 = [3.1, 3.3, 3.5, 3.7, 3.6]
t_statistic, mean1, mean2 = two_sample_t_test(sample_data1, sample_data2)
# Adjacent f-strings keep the message on one logical line of output.
print(
    f"Two-Sample T-Test: t-statistic = {t_statistic}, "
    f"Sample Mean 1 = {mean1}, Sample Mean 2 = {mean2}"
)
def paired_sample_t_test(sample1, sample2):
    """Compute the paired-sample t-statistic from two matched samples.

    Args:
        sample1: First sequence of numeric observations.
        sample2: Second sequence; element ``i`` is paired with
            ``sample1[i]``, so both must have the same length.

    Returns:
        A ``(t_statistic, mean_diff)`` tuple, where ``mean_diff`` is the mean
        of the pairwise differences ``sample1[i] - sample2[i]``.

    Raises:
        ValueError: If the two samples differ in length.
        ZeroDivisionError: If there are fewer than two pairs or all
            differences are identical (zero standard deviation).
    """
    # zip() would silently drop the unmatched tail of the longer sample.
    if len(sample1) != len(sample2):
        raise ValueError("paired samples must have the same length")
    differences = [x - y for x, y in zip(sample1, sample2)]
    n = len(differences)
    mean_diff = sum(differences) / n
    std_diff = math.sqrt(sum((d - mean_diff) ** 2 for d in differences) / (n - 1))
    t_statistic = mean_diff / (std_diff / math.sqrt(n))
    return t_statistic, mean_diff
# Sample data
sample_data1 = [2.3, 2.5, 2.8, 3.0, 2.7]
sample_data2 = [2.1, 2.4, 2.6, 2.9, 2.5]
t_statistic, mean_diff = paired_sample_t_test(sample_data1, sample_data2)
# Adjacent f-strings keep the message on one logical line of output.
print(
    f"Paired Sample T-Test: t-statistic = {t_statistic}, "
    f"Mean Difference = {mean_diff}"
)
8. Program to implement One-Way and Two-Way ANOVA tests and
analyze the results.
One-Way ANOVA: One-way ANOVA is used when comparing the means of
three or more independent groups. The null hypothesis states that all
group means are equal.
Steps to implement One-Way ANOVA:
1. Calculate the group means: find the mean of each group.
2. Calculate the overall mean: find the mean of all data points.
3. Calculate the between-group variance: this measures how much the
group means deviate from the overall mean.
4. Calculate the within-group variance: this measures how much the
individual data points deviate from their respective group means.
5. Calculate the F-statistic: this is the ratio of the between-group variance to
the within-group variance.
import numpy as np
# Sample data for three groups
group1 = [23, 20, 22, 25, 30]
group2 = [30, 32, 29, 35, 31]
group3 = [25, 27, 24, 22, 26]

# Combine groups into a list
data = [group1, group2, group3]

# Calculate means
group_means = [np.mean(group) for group in data]
overall_mean = np.mean([item for group in data for item in group])

# Between-group sum of squares: squared distance of every group mean from
# the overall mean, weighted by the group size
SSB = sum(
    len(group) * (mean - overall_mean) ** 2
    for group, mean in zip(data, group_means)
)

# Within-group sum of squares: squared distance of every observation from
# its own group mean
SSW = sum(
    sum((x - mean) ** 2 for x in group)
    for group, mean in zip(data, group_means)
)

# Degrees of freedom
df_between = len(data) - 1
df_within = sum(len(group) for group in data) - len(data)

# Mean Squares
MSB = SSB / df_between
MSW = SSW / df_within

# F-statistic
F_statistic = MSB / MSW
print(f"F-statistic for One-Way ANOVA: {F_statistic}")
# Sample data for two factors (A and B)
factor_A = [[23, 20, 22], [30, 32, 29], [25, 27, 24]]
factor_B = [[25, 30, 28], [22, 20, 21], [27, 29, 26]]

# Calculate means: one mean per column position of each table, and the mean
# of every value in both tables
means_A = list(np.mean(factor_A, axis=0))
means_B = list(np.mean(factor_B, axis=0))
overall_mean = np.mean(
    [item for sublist in factor_A for item in sublist]
    + [item for sublist in factor_B for item in sublist]
)

# Calculate Sum of Squares
SS_A = sum(len(factor_B[0]) * (mean - overall_mean) ** 2 for mean in means_A)
SS_B = sum(len(factor_A[0]) * (mean - overall_mean) ** 2 for mean in means_B)

# Interaction Sum of Squares
# NOTE(review): this term is built from the row means of factor_A only; a
# conventional interaction sum of squares is derived from per-cell means.
# Confirm the intended experimental layout before relying on it.
SS_AB = sum((np.mean(row) - overall_mean) ** 2 for row in factor_A)

# Total Sum of Squares
# NOTE(review): computed as the sum of the three terms above rather than
# from the raw observations, so no residual (error) term exists here.
SST = SS_A + SS_B + SS_AB

# Degrees of freedom
df_A = len(factor_A) - 1
df_B = len(factor_B) - 1
df_AB = df_A * df_B

# Mean Squares
MS_A = SS_A / df_A
MS_B = SS_B / df_B
MS_AB = SS_AB / df_AB

# F-statistics
# NOTE(review): the denominator is SST / (rows_A * rows_B - 1), not an error
# mean square, so these are not conventional ANOVA F-ratios. The arithmetic
# is kept exactly as written pending confirmation of the intended design.
total_mean_square = SST / (len(factor_A) * len(factor_B) - 1)
F_A = MS_A / total_mean_square
F_B = MS_B / total_mean_square
print(f"F-statistic for Factor A: {F_A}")
print(f"F-statistic for Factor B: {F_B}")
9. Program to implement correlation, rank correlation and
regression, with an x-y plot and heat maps of correlation matrices.
import matplotlib.pyplot as plt
import numpy as np

# Generating sample data
np.random.seed(0)  # fixed seed so every run draws the same sample
x = np.random.rand(100)
y = 2 * x + np.random.normal(0, 0.1, 100)  # Linear relationship with noise
def pearson_correlation(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Args:
        x: Sequence or array of numeric observations.
        y: Sequence or array of numeric observations, same length as ``x``.

    Returns:
        The correlation coefficient computed from the raw sums
        (n*Sxy - Sx*Sy) / sqrt((n*Sxx - Sx^2) * (n*Syy - Sy^2)).
    """
    # Convert up front so plain lists support the element-wise x**2 and x * y.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    n = len(x)
    sum_x = np.sum(x)
    sum_y = np.sum(y)
    sum_x2 = np.sum(x**2)
    sum_y2 = np.sum(y**2)
    sum_xy = np.sum(x * y)
    numerator = n * sum_xy - sum_x * sum_y
    denominator = np.sqrt((n * sum_x2 - sum_x**2) * (n * sum_y2 - sum_y**2))
    return numerator / denominator
# Pearson correlation of the module-level x and y samples.
correlation = pearson_correlation(x, y)
print(f"Pearson Correlation Coefficient: {correlation}")
def _average_ranks(values):
    """Return 1-based ranks of ``values``; tied values share their mean rank."""
    values = np.asarray(values).ravel()
    order = np.argsort(values, kind="stable")
    ordinal = np.empty(len(values), dtype=float)
    ordinal[order] = np.arange(1, len(values) + 1)
    # Group equal values and replace each ordinal rank by its group's average.
    _, inverse, counts = np.unique(values, return_inverse=True, return_counts=True)
    rank_sums = np.bincount(inverse, weights=ordinal)
    return rank_sums[inverse] / counts[inverse]


def spearman_rank_correlation(x, y):
    """Return the Spearman rank correlation between ``x`` and ``y``.

    Both inputs are converted to ranks and the Pearson correlation of the
    ranks is returned. Tied values receive the average of the ranks they
    span; a plain argsort-of-argsort would hand ties arbitrary distinct
    ranks.

    Args:
        x: Sequence or array of numeric observations.
        y: Sequence or array of numeric observations, same length as ``x``.

    Returns:
        The value of ``pearson_correlation`` applied to the two rank arrays.
    """
    rank_x = _average_ranks(x)
    rank_y = _average_ranks(y)
    return pearson_correlation(rank_x, rank_y)
# Spearman rank correlation of the module-level x and y samples.
rank_correlation = spearman_rank_correlation(x, y)
print(f"Spearman Rank Correlation Coefficient: {rank_correlation}")
def linear_regression(x, y):
    """Fit ``y = m * x + b`` by ordinary least squares.

    Args:
        x: Sequence or array of predictor values.
        y: Sequence or array of response values, same length as ``x``.

    Returns:
        A ``(m, b)`` tuple holding the slope and the intercept.
    """
    # Convert up front so plain lists support the element-wise arithmetic.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    n = len(x)
    sum_x = np.sum(x)
    sum_y = np.sum(y)
    m = (n * np.sum(x * y) - sum_x * sum_y) / (n * np.sum(x**2) - sum_x**2)
    b = (sum_y - m * sum_x) / n
    return m, b
slope, intercept = linear_regression(x, y)
print(f"Linear Regression: Slope = {slope}, Intercept = {intercept}")

# x-y scatter of the data with the fitted line drawn over it
plt.scatter(x, y, label='Data Points')
plt.plot(x, slope * x + intercept, color='red', label='Regression Line')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Scatter Plot with Regression Line')
plt.legend()
plt.show()
def plot_correlation_matrix(x, y):
    """Display the correlation matrix of ``x`` and ``y`` as a heat map.

    Args:
        x: Sequence or array of numeric observations.
        y: Sequence or array of numeric observations, same length as ``x``.
    """
    correlation_matrix = np.corrcoef(x, y)
    plt.imshow(correlation_matrix, cmap='hot', interpolation='nearest')
    plt.colorbar()
    plt.title('Correlation Matrix Heat Map')
    # Row/column 0 corresponds to x and row/column 1 to y
    plt.xticks([0, 1], ['X', 'Y'])
    plt.yticks([0, 1], ['X', 'Y'])
    plt.show()


# The listing defined the helper but never invoked it, so no heat map was
# ever produced; call it on the generated sample.
plot_correlation_matrix(x, y)
10. Program to implement PCA for Wisconsin dataset,
visualize and analyze the results.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the dataset. The listing used df, X and y without ever defining them,
# so the loading step is restored here.
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
df = pd.DataFrame(X, columns=cancer.feature_names)
df['target'] = y
print(df)

# Standardize the data
X_mean = np.mean(X, axis=0)
X_std = np.std(X, axis=0)
X_standardized = (X - X_mean) / X_std

# Compute the covariance matrix (rowvar=False: columns are the variables)
cov_matrix = np.cov(X_standardized, rowvar=False)
print(cov_matrix)

# Compute eigenvalues and eigenvectors. A covariance matrix is symmetric, so
# eigh is used: it is the symmetric-matrix solver and returns real values.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Sort the eigenvalues and eigenvectors in descending eigenvalue order
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues_sorted = eigenvalues[sorted_indices]
eigenvectors_sorted = eigenvectors[:, sorted_indices]

# Select the top 2 principal components
k = 2
eigenvectors_subset = eigenvectors_sorted[:, :k]

# Transform the data
X_pca = X_standardized.dot(eigenvectors_subset)

# Visualize the PCA results
plt.figure(figsize=(10, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
plt.title('PCA of Wisconsin Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(label='Class Label')
plt.grid()
plt.show()
11. Program to implement the working of linear
discriminant analysis using IRIS dataset and visualize the
result.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Load the iris dataset
iris = datasets.load_iris()
X = iris.data  # Features
y = iris.target  # Target classes
print(y)

# Create an instance of LDA
lda = LDA(n_components=2)

# Fit and transform the data
X_lda = lda.fit_transform(X, y)

# Create a DataFrame for visualization
lda_df = pd.DataFrame(data=X_lda, columns=['LD1', 'LD2'])
lda_df['target'] = y

# Map target values to class names
lda_df['target'] = lda_df['target'].map(
    {0: 'Setosa', 1: 'Versicolor', 2: 'Virginica'}
)

# Plotting
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=lda_df, x='LD1', y='LD2', hue='target', palette='viridis', s=100
)
plt.title('LDA of Iris Dataset')
plt.xlabel('Linear Discriminant 1')
plt.ylabel('Linear Discriminant 2')
plt.legend(title='Species')
plt.grid()
plt.show()
12. Program to implement multiple linear regression using
IRIS dataset, visualize and analyze the results.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load the iris dataset
iris = sns.load_dataset('iris')
print(iris.head())

# Define independent variables (features) and dependent variable (target)
X = iris[['sepal_length', 'sepal_width', 'petal_width']]
y = iris['petal_length']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Visualize the results
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, color='blue')
# Identity line (predicted == actual) spanning the plotted range; points
# close to it are well predicted.
plt.plot(
    [y_test.min(), y_test.max()],
    [y_test.min(), y_test.max()],
    color='red',
    linewidth=2,
)
plt.title('Actual vs Predicted Petal Length')
plt.xlabel('Actual Petal Length')
plt.ylabel('Predicted Petal Length')
plt.grid()
plt.show()