# Data Visualization with R
# First look at the built-in iris data frame ----
head(iris, 2)   # first 2 rows
head(iris[4])   # first rows of the 4th column only (a 1-column data frame)
iris[4, ]       # the 4th row, all columns
iris[5, 1:3]    # the 5th row of the first three columns
summary(iris)   # summary statistics for every column
# NOTE(review): the column name on the next line was unreadable in the
# script; Sepal.Length is used as a representative numeric column.
summary(iris$Sepal.Length)
summary(iris$Species)
plot(iris)      # scatter plot matrix: plots the entire data frame
# Clean up the 'datasets' package, the plots and the environment ----
# detach() is the opposite of library(): it disassociates the package from
# the current session.
detach("package:datasets", unload = TRUE)
# Clear the plots by closing the current graphics device. dev.off() raises
# an error when only the null device (number 1) is left, hence the guard.
if (dev.cur() > 1L) {
  dev.off()
}
# Ctrl+L clears the console.
rm(list = ls())  # clear the environment
library(datasets)
?plot  # help for plot()

# plot() chooses the chart from the kind of data it is given ----
# NOTE(review): the column names in the four single-line calls below were
# unreadable in the script; they are filled in to match the labelled
# petal length / petal width plot further down.
plot(iris$Species)                          # categorical variable
plot(iris$Petal.Length)                     # quantitative variable
plot(iris$Species, iris$Petal.Width)        # categorical vs quantitative
plot(iris$Petal.Length, iris$Petal.Width)   # quantitative vs quantitative
plot(iris)                                  # entire data frame

# The same scatter plot with options
plot(
  iris$Petal.Length, iris$Petal.Width,
  col = "blue",  # colour of the points
  pch = 1,       # plot character: 1 is an open circle (19 is a solid one)
  main = "Iris- petal length vs. petal width",  # main title of the graph
  xlab = "petal length",  # label of the x axis
  ylab = "petal width"    # label of the y axis
)
# Plot functions: plot(f, from, to) draws f between from and to ----
plot(cos, 0, 2 * pi)
plot(exp, 1, 5)
plot(dnorm, -5, 5)  # density of the normal distribution

plot(
  dnorm, -5, 5,
  col = "red",
  lwd = 7,  # line width; a larger value makes the line thicker
  main = "standard normal distribution",  # main title of the graph
  xlab = "z-scores",  # label of the x axis
  ylab = "density"    # label of the y axis
)
detach("package:datasets", unload = TRUE)
library(datasets)
library(help = "datasets")  # overview of the package contents
?mtcars
head(mtcars)

# Bar charts ----
barplot(mtcars$cyl)  # one bar per row: doesn't give proper information
# A bar plot needs the frequency of each category, so create a summary
# table with table() and store it in an object named cylinders.
cylinders <- table(mtcars$cyl)
cylinders
barplot(cylinders)  # bar chart of the frequencies
library(datasets)
?iris
head(iris)

# Histograms ----
# NOTE(review): the four column names were unreadable in the script; the
# four numeric columns of iris are used, one per call.
hist(iris$Sepal.Length)
hist(iris$Sepal.Width)
hist(iris$Petal.Length)
hist(iris$Petal.Width)

# Histograms by group ----
# Put the graphs in 3 rows and 1 column. par() sets graphical parameters;
# c() combines the two numbers (3, 1) into a single vector.
par(mfrow = c(3, 1))

# One histogram per species, all on the same x scale so they can be compared
hist(
  iris$Petal.Width[iris$Species == "setosa"],
  xlim = c(0, 3),  # x axis runs from 0 to 3
  breaks = 9,      # suggestion for the number of bars
  main = "Petal width for setosa",
  xlab = "",
  col = "red"
)
hist(
  iris$Petal.Width[iris$Species == "versicolor"],
  xlim = c(0, 3),
  breaks = 9,
  main = "Petal width for versicolor",
  xlab = "",
  col = "green"
)
hist(
  iris$Petal.Width[iris$Species == "virginica"],
  xlim = c(0, 3),
  breaks = 9,
  main = "Petal width for virginica",
  xlab = "",
  col = "blue"
)

par(mfrow = c(1, 1))  # restore one graph per page
detach("package:datasets", unload = TRUE)
rm(list = ls())  # clear the environment
# Scatter plots: car weight against miles per gallon ----
library(datasets)
head(mtcars)

# Default appearance first
plot(x = mtcars$wt, y = mtcars$mpg)

# The same data with a title, axis labels and larger filled points
plot(
  x = mtcars$wt,
  y = mtcars$mpg,
  main = "MPG as a function of weight of car",
  xlab = "weight",
  ylab = "MPG",
  col = "blue",  # colour of the points
  pch = 19,      # solid circle
  cex = 1.5      # draw the points at 150% size
)

detach("package:datasets", unload = TRUE)
# Overlaying plots ----
library(datasets)
?lynx
head(lynx)
plot(lynx)
summary(lynx)
hist(lynx)

hist(
  lynx,
  breaks = 15,
  freq = FALSE,  # the y axis shows density, not frequency
  col = "green",
  main = "lynx data",
  xlab = "Number of lynx trapped"
)

# Add a normal distribution with the same mean and standard deviation as
# the lynx data.
curve(
  dnorm(x, mean = mean(lynx), sd = sd(lynx)),
  col = "red",
  lwd = 2,
  add = TRUE  # superimpose on the previous graph
)

# Add two kernel density estimators. They are not parametric; instead they
# follow the distribution of the data, so they may have a lot more curves.
# adjust multiplies the default bandwidth: 1 (the default) keeps it as is,
# 3 uses three times the bandwidth and therefore gives a smoother line.
lines(density(lynx), col = "blue", lwd = 2)
lines(density(lynx, adjust = 3), col = "yellow", lwd = 2)

# Add a rug plot to show a vertical line for each individual data point
rug(lynx, lwd = 2, col = "black")
# Describing data: n, mean, SD, median, MAD, min/max, skewness, kurtosis,
# range, 10% trimmed mean, etc.
# Use the psych package. It is not a base package, so it is installed when
# missing and then attached; describe() is not available without it.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(dplyr)
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)
describe(iris)

# Histograms for selected cases ----
# NOTE(review): the column names were unreadable in the script; Petal.Length
# is used because every title below refers to petal length.
hist(
  iris$Petal.Length[iris$Species == "setosa"],
  main = "petal length of setosa"
)
hist(
  iris$Petal.Length[iris$Petal.Length < 2],
  main = "petal length of < 2"
)
# Two conditions combined with the elementwise & operator
hist(
  iris$Petal.Length[iris$Species == "virginica" & iris$Petal.Length < 5.5],
  main = "petal length of < 5.5 for virginica"
)
# Create a subsample data frame ----
# Format: data[rows, columns]
# Leave rows or columns blank to select all
# New data frame holding only the setosa rows; <- is the assignment operator.
# NOTE(review): the object's name was unreadable in the script;
# iris_setosa is a replacement name used only within this section.
iris_setosa <- iris[iris$Species == "setosa", ]
head(iris_setosa)
summary(iris_setosa)
hist(iris_setosa$Petal.Length)
rm(list = ls())
# Import data files: Excel, text, CSV, etc. ----
library(readxl)
# The path is written on a single line: a line break inside the quotes
# would become part of the file name and the file would not be found.
mydataset <- read_excel(
  "/Users/abhishek/Documents/IIT Kharagpur /CEP/List_of_speakers.xlsx"
)
View(mydataset)
# Use of the lattice package to make plots ----
library(lattice)
mtcars
str(mtcars)

hist(mtcars$mpg)
hist(mtcars$mpg, freq = FALSE)  # density scale instead of counts
# lines() adds to the current base-graphics plot, so it has to follow the
# density-scaled histogram directly (not a lattice plot).
lines(density(mtcars$mpg))
histogram(~mpg, data = mtcars)    # using lattice
densityplot(~mpg, data = mtcars)  # using lattice
boxplot(mtcars$mpg)
bwplot(~mpg, data = mtcars)       # using lattice
plot(mtcars$mpg, mtcars$wt)
xyplot(wt ~ mpg, data = mtcars)   # using lattice

# Put the next graphs in 3 rows and 1 column. par() sets graphical
# parameters; c() combines the two numbers (3, 1) into a single vector.
par(mfrow = c(3, 1))
# One scatter plot per number of gears
plot(mtcars$wt[mtcars$gear == 3], mtcars$mpg[mtcars$gear == 3])
plot(mtcars$wt[mtcars$gear == 4], mtcars$mpg[mtcars$gear == 4])
plot(mtcars$wt[mtcars$gear == 5], mtcars$mpg[mtcars$gear == 5])
# Close the graphics device; guarded because dev.off() raises an error
# when no device is open.
if (dev.cur() > 1L) {
  dev.off()
}

# Make a factor of gear with the three levels 3, 4 and 5
fname <- 3:5
gearfact <- factor(mtcars$gear)
str(gearfact)
xyplot(wt ~ mpg | gearfact, data = mtcars)  # using lattice
bwplot(~mpg | gearfact, data = mtcars)      # using lattice
boxplot(mtcars$mpg[mtcars$gear == 3])
mygears <- table(gearfact)
barplot(mtcars$mpg)
barplot(mygears)  # counts per gear level from the table built above
# Plots using ggplot2 ----
# Install ggplot2 and pacman only when they are not available yet.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(datasets)
library(ggplot2)
library(modeldata)  # presumably the source of the crickets data used below
View(crickets)

# Scatter plot coloured by species, with labels and a ColorBrewer palette
ggplot(crickets, aes(x = temp, y = rate, color = species)) +
  geom_point() +
  labs(
    x = "Temperature", y = "Rate", title = "Cricket data",
    caption = "Source: McDonald (2009)", color = "Species"
  ) +
  scale_color_brewer(palette = "Dark2")

# Fixed point appearance: values set in geom_point() apply to every point
ggplot(crickets, aes(x = temp, y = rate, color = species)) +
  geom_point(color = "red", size = 3, alpha = 0.6, shape = "square") +
  labs(
    x = "Temperature", y = "Rate", title = "Cricket data",
    caption = "Source: McDonald (2009)", color = "Species"
  )

# Adding a regression line (one per species because colour is mapped)
ggplot(crickets, aes(x = temp, y = rate, color = species)) +
  geom_point() +
  labs(
    x = "Temperature", y = "Rate", title = "Cricket data",
    caption = "Source: McDonald (2009)", color = "Species"
  ) +
  scale_color_brewer(palette = "Dark2") +
  geom_smooth(method = "lm", se = FALSE)

# No colour mapping: one linear fit and one default smoother for all data
ggplot(crickets, aes(x = temp, y = rate)) +
  geom_point() +
  labs(
    x = "Temperature", y = "Rate", title = "Cricket data",
    caption = "Source: McDonald (2009)", color = "Species"
  ) +
  scale_color_brewer(palette = "Dark2") +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(se = FALSE)

cor(crickets$temp, crickets$rate)
Corr <- cor(crickets$temp, crickets$rate)
Corr

# Show the correlation on the plot. annotate() draws the label once;
# geom_text() with constant aesthetics draws one copy per row of the data.
ggplot(crickets, aes(x = temp, y = rate)) +
  geom_point() +
  labs(
    x = "Temperature", y = "Rate", title = "Cricket data",
    caption = "Source: McDonald (2009)"
  ) +
  geom_smooth(method = "lm", se = FALSE) +
  annotate(
    "text",
    x = 20, y = 100,
    label = paste("Correlation:", round(Corr, 2))
  )
# Other plots: a single quantitative or qualitative variable ----
ggplot(crickets, aes(x = rate)) + geom_histogram(bins = 15)
ggplot(crickets, aes(x = rate)) + geom_histogram(binwidth = 5)
ggplot(crickets, aes(x = rate)) + geom_freqpoly(bins = 15)
ggplot(crickets, aes(x = species)) + geom_bar()
ggplot(crickets, aes(x = species)) +
  geom_bar(color = "Black", fill = "green")
ggplot(crickets, aes(x = species, fill = species)) + geom_bar()
ggplot(crickets, aes(x = species, fill = species)) +
  geom_bar() +
  scale_fill_brewer(palette = "Dark2")
# show.legend = FALSE hides the legend for this layer
ggplot(crickets, aes(x = species, fill = species)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_brewer(palette = "Dark2")

# Box plot: one qualitative and one quantitative variable ----
ggplot(crickets, aes(x = species, y = rate)) + geom_boxplot()
ggplot(crickets, aes(x = species, y = rate, color = species)) +
  geom_boxplot()
ggplot(crickets, aes(x = species, y = rate, color = species)) +
  geom_violin()
ggplot(crickets, aes(x = species, y = rate, color = species)) +
  geom_boxplot() +
  scale_color_brewer(palette = "Dark2")
# The + must end the previous line: a line that starts with + is parsed as
# a separate expression and the theme would not be added to the plot.
ggplot(crickets, aes(x = species, y = rate, color = species)) +
  geom_boxplot() +
  scale_color_brewer(palette = "Dark2") +
  theme_minimal()  # minimal theme removes the grey background

# Faceting: splitting the data into subsets ----
ggplot(crickets, aes(x = rate, fill = species)) +
  geom_histogram(bins = 15)
ggplot(crickets, aes(x = rate, fill = species)) +
  geom_histogram(bins = 15) +
  facet_wrap(~species)
ggplot(crickets, aes(x = rate, fill = species)) +
  geom_histogram(bins = 15) +
  facet_wrap(~species, ncol = 1)
# BOD data ----
library(dplyr)  # makes the %>% pipe operator available
View(BOD)
ggplot(BOD, aes(Time, demand)) +
  geom_point() +
  geom_smooth(method = "lm")

# %>% is called the pipe operator. It takes the output of the expression on
# its left and passes it as the first argument to the function on its right.
BOD %>%
  ggplot(aes(Time, demand)) +
  geom_point() +
  geom_smooth(method = "lm")

# Points joined by a line, first solid and then with linetype 2
ggplot(BOD, aes(Time, demand)) +
  geom_point() +
  geom_line(color = "red")
ggplot(BOD, aes(Time, demand)) +
  geom_point() +
  geom_line(color = "red", linetype = 2)
# ggplot for the mtcars dataset ----
View(mtcars)
ggplot(mtcars, aes(x = hp, y = mpg, color = disp)) +
  geom_point() +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horsepower",
    y = "Miles per Gallon"
  )

# Add size. A constant size is set in the geom, not inside aes(): inside
# aes() it is treated as a data mapping and a legend is drawn for it.
ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point(size = 3) +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horsepower",
    y = "Miles per Gallon"
  )

# Add color
ggplot(mtcars, aes(x = hp, y = mpg, color = factor(cyl))) +
  geom_point() +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horsepower",
    y = "Miles per Gallon"
  )

# Add shape
ggplot(mtcars, aes(x = hp, y = mpg, shape = factor(cyl))) +
  geom_point() +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horsepower",
    y = "Miles per Gallon"
  )

# Add facet
# Facet layer: use facet_grid when there is more than one discrete
# variable, otherwise use facet_wrap.
p <- ggplot(mtcars, aes(x = hp, y = mpg, shape = factor(cyl))) +
  geom_point()

# Separate rows according to transmission type (am ~ . puts am on the rows)
p +
  facet_grid(am ~ .) +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horsepower",
    y = "Miles per Gallon"
  )

# Separate columns according to cylinders
p +
  facet_grid(~cyl) +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horsepower",
    y = "Miles per Gallon"
  )
# Coordinate layers (how data is mapped on the plotting plane, including
# axis scales, aspect ratios, transformations) ----
ggplot(data = mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  scale_y_continuous("Miles per Gallon", limits = c(2, 35), expand = c(0, 0)) +
  scale_x_continuous("Weight", limits = c(0, 25), expand = c(0, 0)) +
  coord_equal() +
  labs(
    title = "Miles per Gallon vs Weight",
    x = "Weight",
    y = "Miles per Gallon"
  )

# Add coord_cartesian() to zoom in properly
ggplot(mtcars, aes(x = wt, y = hp, color = am)) +
  geom_point() +
  geom_smooth() +
  coord_cartesian(xlim = c(3, 6))

# Theme layer ----
# NOTE(review): the theme element name was unreadable in the script;
# plot.background is assumed because it takes an element_rect().
ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point() +
  facet_grid(~cyl) +
  theme(plot.background = element_rect(fill = "blue", colour = "gray")) +
  labs(title = "Miles per Gallon vs Horsepower")

# Facet example with more than one categorical variable
ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point() +
  facet_grid(am ~ cyl) +
  theme_gray() +
  labs(title = "Miles per Gallon vs Horsepower")

# Contour plots ----
# Create a 2D density contour plot for the mtcars dataset.
# after_stat(level) refers to the level variable computed by the stat; it
# replaces the deprecated ..level.. notation.
ggplot(mtcars, aes(x = wt, y = mpg)) +
  stat_density_2d(
    aes(fill = after_stat(level)),
    geom = "polygon",
    color = "white"
  ) +
  scale_fill_viridis_c() +
  labs(
    title = "2D Density Contour Plot of mtcars Dataset",
    x = "Weight (wt)",
    y = "Miles per Gallon (mpg)",
    fill = "Density"
  ) +
  theme_minimal()
# Creating a panel of plots ----
library(ggplot2)
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra")
}
library(gridExtra)

# Select specific columns from the mtcars dataset. selected_data is the
# data frame every histogram below is built from.
selected_cols <- c("mpg", "disp", "hp", "drat")
selected_data <- mtcars[, selected_cols]

# Create histograms for the individual variables
hist_plot_mpg <- ggplot(selected_data, aes(x = mpg)) +
  geom_histogram(binwidth = 2, fill = "blue", color = "white") +
  labs(
    title = "Histogram: Miles per Gallon",
    x = "Miles per Gallon",
    y = "Frequency"
  )
hist_plot_disp <- ggplot(selected_data, aes(x = disp)) +
  geom_histogram(binwidth = 50, fill = "red", color = "white") +
  labs(
    title = "Histogram: Displacement",
    x = "Displacement",
    y = "Frequency"
  )
hist_plot_hp <- ggplot(selected_data, aes(x = hp)) +
  geom_histogram(binwidth = 20, fill = "green", color = "white") +
  labs(
    title = "Histogram: Horsepower",
    x = "Horsepower",
    y = "Frequency"
  )
hist_plot_drat <- ggplot(selected_data, aes(x = drat)) +
  geom_histogram(binwidth = 0.5, fill = "orange", color = "white") +
  labs(title = "Histogram: Drat", x = "Drat", y = "Frequency")

# Arrange the plots in a grid with two columns
grid.arrange(
  hist_plot_mpg, hist_plot_disp, hist_plot_hp, hist_plot_drat,
  ncol = 2
)
# Save and extract R plots ----
# Create a plot. The three layers form one unbroken + chain so that the
# title is part of the object stored in plot.
plot <- ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point() +
  labs(title = "Miles per Gallon vs Horsepower")

# NOTE(review): the two file names were unreadable in the script; the
# extensions follow the comments below, the base name "plot" is assumed.
# Save the plot as an image file (e.g., PNG)
ggsave("plot.png", plot)
# Save the plot as a PDF file
ggsave("plot.pdf", plot)

# Extract the plot as a variable for further use
extracted_plot <- plot
plot
# Making maps ----
# We need shape files to plot maps.
# Useful websites to download shape files of different countries:
#   [Link]
#   [Link]
# NOTE(review): both URLs above are unreadable placeholders; restore them.

# Install the packages used in this section only when they are missing,
# then attach them. Each package is installed before it is loaded.
for (pkg in c("ggmap", "sf", "ggplot2", "dplyr", "readxl")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(ggmap)
library(sf)
library(ggplot2)
library(dplyr)
library(readxl)

# Replace with the path to your shapefile or GeoJSON file
india_map <- st_read("path_to_india_shapefile_or_geojson")
ggplot(data = india_map) +
  geom_sf() +
  theme_minimal() +
  labs(title = "Political Map of India")

# Load your data (e.g., literacy rates)
literacy_data <- read_excel("path_to_literacy_data.xlsx")

# Merge the spatial data with your dataset; adjust the column names in
# `by` as needed.
india_data <- india_map %>%
  left_join(literacy_data, by = c("ST_NM" = "State_Name"))

# Plot the map with the additional data
ggplot(data = india_data) +
  geom_sf(aes(fill = Literacy_Rate)) +
  scale_fill_viridis_c() +
  theme_minimal() +
  labs(title = "Literacy Rate by State in India", fill = "Literacy Rate")
# Earlier draft, kept for reference (parts of it are unreadable):
# [Link].2012 <- read_csv('Users/abhishek/Downloads/yellow_tripdata_2015-[Link]')
# 'data/san_diego_crime_2012.rds')

# Yellow taxi trip data ----
# NOTE(review): the file names in the two read_excel() calls were partly
# unreadable; "01.xlsx" and "01 copy.xlsx" are reconstructed from the
# object names and must be confirmed against the real files.
library(readxl)
yellow_tripdata_2015_01 <- read_excel("Downloads/yellow_tripdata_2015-01.xlsx")
View(yellow_tripdata_2015_01)
yellow_tripdata_2015_01_copy <- read_excel(
  "Downloads/yellow_tripdata_2015-01 copy.xlsx"
)
View(yellow_tripdata_2015_01_copy)

# Extract the rows whose coordinates fall on the map. filter() needs the
# data frame plus a row condition; a bare column is not a valid input.
# NOTE(review): the longitude bounds come from the commented-out draft of
# this line, which also mentioned a latitude range (39 <= latt <= 41);
# confirm the intended limits. The object name is a replacement for an
# unreadable one.
trips_on_map <- dplyr::filter(
  yellow_tripdata_2015_01_copy,
  -70 <= lon & lon <= 75
)
View(trips_on_map)

# Quick plot
qmplot(x = lon, y = latt, data = yellow_tripdata_2015_01_copy)
# Child mortality rate data ----
library(dplyr)
library(tidyr)
library(readxl)

# NOTE(review): the workbook's file name is unreadable ("[Link]"); replace
# it with the real file name before running. The path is kept on one line
# because a line break inside the quotes would become part of the path.
mort <- read_excel(
  "Documents/IIT Kharagpur /CEP/Data Visualization with R/[Link]",
  sheet = "Worksheet"
)
View(mort)

# The first column is read in as "...1"; call it country
mort <- mort %>% dplyr::rename(country = "...1")

# One row per country and year instead of one column per year.
# pivot_longer: reorganized (1760, 1761, 1762, 1763, 1764, ...) into
# (year, morts) [was 197x255, now 50038x3]
long <- mort %>%
  pivot_longer(cols = -country, names_to = "year", values_to = "morts")
View(long)

# mutate: converted 'year' from character to double (0 new NA)
long <- long %>% mutate(year = as.numeric(year))

# A single country
sweden_long <- long %>% filter(country == "Sweden")
ggplot(sweden_long, aes(x = year, y = morts)) + geom_point()
ggplot(sweden_long, aes(x = year, y = morts)) +
  geom_point(shape = 2, color = "red")
ggplot(sweden_long, aes(x = year, y = morts)) + geom_line()
ggplot(sweden_long, aes(x = year, y = morts)) + geom_line(linetype = 2)
ggplot(sweden_long, aes(x = year, y = morts)) +
  geom_line() +
  geom_smooth()
# Keep five countries from the long table
sub <- long %>%
  filter(
    country %in% c(
      "United States", "United Kingdom", "Sweden", "Afghanistan", "Rwanda"
    )
  )

# One line per country
sub %>%
  ggplot(aes(x = year, y = morts, group = country)) +
  geom_line()

# One box per country
sub %>%
  ggplot(aes(x = year, y = morts, group = country)) +
  geom_boxplot()

# Lines with axis labels, title and subtitle
sub %>%
  ggplot(aes(x = year, y = morts, group = country)) +
  geom_line() +
  labs(
    x = "Year",
    y = "Mortality Rate",
    title = "Child Mortality Rates",
    subtitle = "Stratified by Country"
  )

# Same plot restricted to the given x and y limits
sub %>%
  ggplot(aes(x = year, y = morts, group = country)) +
  geom_line() +
  labs(
    x = "Year",
    y = "Mortality Rate",
    title = "Child Mortality Rates",
    subtitle = "Stratified by Country"
  ) +
  xlim(c(1900, 2000)) +
  ylim(c(0, 1.5))

# Same plot with explicit axis breaks
sub %>%
  ggplot(aes(x = year, y = morts, group = country)) +
  geom_line() +
  labs(
    x = "Year",
    y = "Mortality Rate",
    title = "Child Mortality Rates",
    subtitle = "Stratified by Country"
  ) +
  scale_x_continuous(breaks = seq(1750, 2100, by = 50)) +
  scale_y_continuous(breaks = seq(0, 5, by = 0.5))
# Distinguish countries by point shape
ggplot(sub, aes(x = year, y = morts)) +
  geom_point(aes(shape = country))
ggplot(sub, aes(x = year, y = morts)) +
  geom_point(aes(shape = country)) +
  scale_shape_manual(
    values = c(
      "United States" = 1, "United Kingdom" = 2,
      "Sweden" = 3, "Afghanistan" = 4,
      "Rwanda" = 5
    )
  )

# Distinguish countries by line type
ggplot(sub, aes(x = year, y = morts)) +
  geom_line(aes(linetype = country))
ggplot(sub, aes(x = year, y = morts)) +
  geom_line(aes(linetype = country)) +
  scale_linetype_manual(
    values = c(
      "United States" = 1, "United Kingdom" = 2,
      "Sweden" = 3, "Afghanistan" = 4,
      "Rwanda" = 5
    )
  )

# Distinguish countries by colour
ggplot(sub, aes(x = year, y = morts)) +
  geom_point(aes(color = country))
ggplot(sub, aes(x = year, y = morts)) +
  geom_point(aes(color = country)) +
  scale_color_manual(
    values = c(
      "United States" = 1, "United Kingdom" = 2,
      "Sweden" = 3, "Afghanistan" = 4,
      "Rwanda" = 5
    )
  )

# Move the legend: above the plot, then to a position inside the panel.
# NOTE(review): the theme element name was unreadable in the script;
# legend.position is assumed because it accepts both "top" and a pair of
# coordinates.
ggplot(sub, aes(x = year, y = morts)) +
  geom_point(aes(color = country)) +
  theme(legend.position = "top")
ggplot(sub, aes(x = year, y = morts)) +
  geom_point(aes(color = country)) +
  theme(legend.position = c(0.9, 0.9))
# Facet ----
# One panel per country. guide = "none" drops the colour legend, which is
# redundant because every panel is already labelled with its country.
ggplot(sub, aes(x = year, y = morts)) +
  geom_point() +
  geom_line(aes(color = country)) +
  facet_wrap(~country) +
  scale_color_discrete(guide = "none")

# The same panels laid out in a single row
ggplot(sub, aes(x = year, y = morts)) +
  geom_point() +
  geom_line(aes(color = country)) +
  facet_wrap(~country, nrow = 1) +
  scale_color_discrete(guide = "none")