In [1]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Create a synthetic dataset of ten students.
# One entry in each of Age, Hours_Studied and Attendance_Rate is np.nan,
# matching the NaN cells shown in the printed frame below.
data = {
    'StudentID': range(1, 11),
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Hannah', 'Ian', 'Jane'],
    'Gender': ['F', 'M', 'M', 'M', 'F', 'M', 'F', 'F', 'M', 'F'],
    'Age': [17, 16, np.nan, 17, 16, 15, 16, 15, 14, 17],
    'Hours_Studied': [15, 10, 5, np.nan, 12, 9, 18, 20, 1, 16],
    'Attendance_Rate': [95, 88, 92, 85, np.nan, 90, 97, 99, 45, 94],
    'Exam_Score': [88, 72, 65, 55, 78, 69, 91, 95, 30, 89]
}
# Build the DataFrame that every later cell reads and updates.
df = pd.DataFrame(data)
print(df)
StudentID Name Gender Age Hours_Studied Attendance_Rate Exam_Score
0 1 Alice F 17.0 15.0 95.0 88
1 2 Bob M 16.0 10.0 88.0 72
2 3 Charlie M NaN 5.0 92.0 65
3 4 David M 17.0 NaN 85.0 55
4 5 Eva F 16.0 12.0 NaN 78
5 6 Frank M 15.0 9.0 90.0 69
6 7 Grace F 16.0 18.0 97.0 91
7 8 Hannah F 15.0 20.0 99.0 95
8 9 Ian M 14.0 1.0 45.0 30
9 10 Jane F 17.0 16.0 94.0 89
In [2]: # Show missing values
# Per-column count of null entries, printed before any imputation.
print("\nMissing Values:\n", df.isnull().sum())
# Fill missing Age with median
df['Age'] = df['Age'].fillna(df['Age'].median())
# Fill missing Hours_Studied and Attendance_Rate with mean
# NOTE(review): these means are computed before the outlier-capping step
# that follows in the next cell, so any extreme values still influence the
# fill value - consider the median, or capping first. Confirm intent.
df['Hours_Studied'] = df['Hours_Studied'].fillna(df['Hours_Studied'].mean())
df['Attendance_Rate'] = df['Attendance_Rate'].fillna(df['Attendance_Rate'].mean())
Missing Values:
StudentID 0
Name 0
Gender 0
Age 1
Hours_Studied 1
Attendance_Rate 1
Exam_Score 0
dtype: int64
In [3]: # Visualize outliers
# One box per academic variable so extreme values show up as fliers.
sns.boxplot(data=df[['Hours_Studied', 'Attendance_Rate', 'Exam_Score']])
plt.title("Boxplot - Academic Variables")
plt.show()
# Detect and treat outliers using IQR method
def cap_outliers(series, factor=1.5):
    """Cap values that fall outside the IQR fences of ``series``.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``. Values
    beyond a fence are replaced by that fence; values inside are unchanged.

    Parameters
    ----------
    series : pandas.Series
        Numeric series to cap. It is not modified.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.Series
        A new series with values limited to the ``[lower, upper]`` fences.
    """
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - factor * IQR
    upper = Q3 + factor * IQR
    # clip returns a new Series; the caller assigns it back to the column.
    return series.clip(lower, upper)
# Cap each academic variable to its IQR fences, one column at a time,
# in the same order as before.
for column in ('Hours_Studied', 'Exam_Score', 'Attendance_Rate'):
    df[column] = cap_outliers(df[column])
In [4]: # Check skewness before
print("Skewness before:", df['Hours_Studied'].skew())
# Apply log transformation (+1 to avoid log(0))
# NOTE(review): the recorded output below shows skewness moving from about
# -0.48 to about -1.84, i.e. the transform makes this column MORE skewed.
# A log transform compresses the right tail, so it suits positively skewed
# data; for a negatively skewed column consider leaving it untransformed or
# reflecting it first. Confirm whether this step is wanted.
df['Log_Hours_Studied'] = np.log1p(df['Hours_Studied'])
# Check skewness after
print("Skewness after:", df['Log_Hours_Studied'].skew())
# Plot before vs after: two histograms side by side in one figure
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
sns.histplot(df['Hours_Studied'], kde=True)
plt.title("Original: Hours_Studied")
plt.subplot(1, 2, 2)
sns.histplot(df['Log_Hours_Studied'], kde=True)
plt.title("Log Transformed: Hours_Studied")
plt.show()
Skewness before: -0.47615721184649995
Skewness after: -1.8365988265148336
In [5]: print(df.head())
StudentID Name Gender Age Hours_Studied Attendance_Rate \
0 1 Alice F 17.0 15.000000 95.000000
1 2 Bob M 16.0 10.000000 88.000000
2 3 Charlie M 16.0 5.000000 92.000000
3 4 David M 17.0 11.777778 85.000000
4 5 Eva F 16.0 12.000000 87.222222
Exam_Score Log_Hours_Studied
0 88.0 2.772589
1 72.0 2.397895
2 65.0 1.791759
3 55.0 2.547708
4 78.0 2.564949