0% found this document useful (0 votes)
43 views14 pages

R Data Visualization Techniques

The document provides a comprehensive guide on data visualization using R, covering various plotting techniques with datasets such as iris and mtcars. It includes instructions for creating scatter plots, histograms, bar plots, and box plots, as well as using libraries like ggplot2 and lattice for enhanced visualizations. Additionally, it demonstrates data manipulation, statistical summaries, and the use of different color palettes and themes in plots.

Uploaded by

surya Harsha
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
43 views14 pages

R Data Visualization Techniques

The document provides a comprehensive guide on data visualization using R, covering various plotting techniques with datasets such as iris and mtcars. It includes instructions for creating scatter plots, histograms, bar plots, and box plots, as well as using libraries like ggplot2 and lattice for enhanced visualizations. Additionally, it demonstrates data manipulation, statistical summaries, and the use of different color palettes and themes in plots.

Uploaded by

surya Harsha
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd

Data Visualization with R

# Quick look at the built-in iris data frame.
head(iris, 2) # first 2 rows

head(iris[4]) # first rows of the 4th column only
head(iris[4, ]) # 4th row only
head(iris[5, 1:3]) # 5th row, first three columns
summary(iris) # summary statistics for every column
# NOTE(review): the column name on the next line was lost in extraction;
# Sepal.Length is an assumption -- confirm against the original.
summary(iris$Sepal.Length)
summary(iris$Species)
plot(iris) # scatter plot matrix of the entire data frame

# Clean up the 'datasets' package, plots and console.

# detach() is the opposite of library(): it disassociates the package from
# the current session.
detach("package:datasets", unload = TRUE)

# Clear the plots. dev.off() fails when no graphics device is open, so only
# call it when one exists.
if (!is.null(dev.list())) {
  dev.off()
}

# Ctrl+L clears the console.

rm(list = ls()) # clear the environment

library(datasets)

?plot # help for plot()

# NOTE(review): the column names in the next three calls were lost in
# extraction; Petal.Length / Petal.Width are assumed -- confirm.
plot(iris$Species) # categorical variable
plot(iris$Petal.Length) # quantitative variable
plot(iris$Species, iris$Petal.Width) # categorical vs quantitative
plot(iris$Petal.Length, iris$Petal.Width) # quantitative vs quantitative

plot(iris) # entire data frame

# Same scatter plot with options; the columns follow the axis labels below.
plot(iris$Petal.Length, iris$Petal.Width,
  col = "blue", # colour
  pch = 1, # plot character: 1 is an open circle (19 would be solid)
  main = "Iris- petal length vs. petal width", # main title of graph
  xlab = "petal length", # label of x axis
  ylab = "petal width" # label of y axis
)

# Plot formulas: plot() can draw a function over a range.

plot(cos, 0, 2 * pi)
plot(exp, 1, 5)
plot(dnorm, -5, 5) # density of the normal distribution

# Same density curve with options
plot(dnorm, -5, 5,
  col = "red",
  lwd = 7, # line width, to make it thicker
  main = "standard normal distribution", # main title of graph
  xlab = "z-scores", # label of x axis
  ylab = "density" # label of y axis
)

detach("package:datasets", unload = TRUE)
library(datasets)
library(help = "datasets")

?mtcars
head(mtcars)
barplot(mtcars$cyl) # one bar per car: doesn't give proper information

# For a bar plot we need frequencies for each category, so create a summary
# table with table() and store it in an object named cylinders.
cylinders <- table(mtcars$cyl)
cylinders
barplot(cylinders) # bar chart of the counts

library(datasets)
?iris
head(iris)

# Histograms of the numeric columns.
# NOTE(review): the four column names were lost in extraction; the four
# numeric iris columns in their data-frame order are assumed -- confirm.
hist(iris$Sepal.Length)
hist(iris$Sepal.Width)
hist(iris$Petal.Length)
hist(iris$Petal.Width)

# Histogram by groups

# Put graphs in 3 rows and 1 column. par() sets graphical parameters; c()
# combines the two numbers (3, 1) into one vector.
par(mfrow = c(3, 1))

# Histogram of petal width for each species (column taken from the titles)
hist(iris$Petal.Width[iris$Species == "setosa"],
  xlim = c(0, 3), # x axis should have a scale from 0 to 3
  breaks = 9, # suggestion on number of bars
  main = "Petal width for setosa",
  xlab = "",
  col = "red"
)

hist(iris$Petal.Width[iris$Species == "versicolor"],
  xlim = c(0, 3),
  breaks = 9,
  main = "Petal width for versicolor",
  xlab = "",
  col = "green"
)

hist(iris$Petal.Width[iris$Species == "virginica"],
  xlim = c(0, 3),
  breaks = 9,
  main = "Petal width for virginica",
  xlab = "",
  col = "blue"
)

par(mfrow = c(1, 1)) # restore back to one graph in one column

detach("package:datasets", unload = TRUE)

rm(list = ls()) # clear the environment

# Scatter plots: fuel economy against car weight (mtcars)

library(datasets)
head(mtcars)

# Bare scatter plot with default labels and symbols
plot(mtcars$wt, mtcars$mpg)

# Same data with a title, axis labels and larger filled points
plot(
  mtcars$wt, mtcars$mpg,
  main = "MPG as a function of weight of car",
  xlab = "weight",
  ylab = "MPG",
  pch = 19, # solid circle
  cex = 1.5, # 150% of the default size
  col = "blue"
)

detach("package:datasets", unload = TRUE)

# Overlaying plots

library(datasets)
?lynx
head(lynx)
plot(lynx)
summary(lynx)
hist(lynx)

hist(lynx,
  breaks = 15,
  freq = FALSE, # axis will show density, not frequency
  col = "green",
  main = "lynx data",
  xlab = "Number of lynx trapped"
)

# Add a normal distribution with the same mean and standard deviation as the
# lynx data.
curve(dnorm(x, mean = mean(lynx), sd = sd(lynx)),
  col = "red",
  lwd = 2,
  add = TRUE # superimpose on previous graph
)

# Add two kernel density estimators. They are not parametric; instead they
# follow the distribution of the data, so they may have a lot more curves.
lines(density(lynx), col = "blue", lwd = 2) # default bandwidth (adjust = 1)
# adjust multiplies the bandwidth, so 3 gives a smoother curve
lines(density(lynx, adjust = 3), col = "yellow", lwd = 2)

# Add a rug plot to show vertical lines for each individual data point
rug(lynx, lwd = 2, col = "black")

# Describing data: n, mean, SD, median, MAD, min/max, skewness, kurtosis,
# range, 10% trimmed mean, etc.
# Use the psych package. It's not a base package, so install it (and pacman)
# only when missing.
# NOTE(review): the first install call read `[Link]("pac")` after extraction;
# install.packages("pacman") is assumed from the pacman::p_load() call below.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(dplyr)

if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
# psych is never attached in this script, so call describe() via its namespace.
psych::describe(iris)

# Histograms of petal length for subsets of the rows.
# The column is taken from the plot titles.

# Select by category
hist(iris$Petal.Length[iris$Species == "setosa"],
  main = "petal length of setosa"
)

# Select by value
hist(iris$Petal.Length[iris$Petal.Length < 2],
  main = "petal length of < 2"
)

# Select by category and value together
hist(
  iris$Petal.Length[iris$Species == "virginica" & iris$Petal.Length < 5.5],
  main = "petal length of < 5.5 for virginica"
)

# Create a subsample data frame

# Format: data[rows, columns]
# Leave rows or columns blank to select all

# NOTE(review): the object name was lost in extraction ("[Link]"); i.setosa is
# assumed, and Petal.Length is assumed for the histogram column -- confirm.
# `<-` is the assignment operator.
i.setosa <- iris[iris$Species == "setosa", ]

head(i.setosa)
summary(i.setosa)
hist(i.setosa$Petal.Length)

rm(list = ls())

# Import data files: Excel, text, csv, etc.

library(readxl)
# The path was split across two lines in the source, which put a newline
# inside the string. It is rejoined here with the space that also appears
# before "/CEP" in this script's other path.
# NOTE(review): confirm the exact directory name on disk.
mydataset <- read_excel(
  "/Users/abhishek/Documents/IIT Kharagpur /CEP/List_of_speakers.xlsx"
)
View(mydataset)

# Use of the lattice package to make plots, next to their base equivalents.

library(lattice)
mtcars
str(mtcars)

hist(mtcars$mpg)
hist(mtcars$mpg, freq = FALSE) # density scale instead of counts
# lines() adds to the current base plot, so it must directly follow the
# density-scaled base histogram rather than a lattice plot.
lines(density(mtcars$mpg))
histogram(~mpg, data = mtcars) # using lattice
densityplot(~mpg, data = mtcars) # using lattice

boxplot(mtcars$mpg)
bwplot(~mpg, data = mtcars) # using lattice

plot(mtcars$mpg, mtcars$wt)
xyplot(wt ~ mpg, data = mtcars) # using lattice

# Put graphs in 3 rows and 1 column. par() sets graphical parameters; c()
# combines the two numbers (3, 1) into one vector.
par(mfrow = c(3, 1))

# Scatter plot for each number of gears
plot(mtcars$wt[mtcars$gear == "3"], mtcars$mpg[mtcars$gear == "3"])
plot(mtcars$wt[mtcars$gear == "4"], mtcars$mpg[mtcars$gear == "4"])
plot(mtcars$wt[mtcars$gear == "5"], mtcars$mpg[mtcars$gear == "5"])
dev.off() # close the device, which also discards the 3 x 1 layout

# Turn gear into a factor with levels 3, 4, 5 for conditioning
gearfact <- factor(mtcars$gear)
str(gearfact)
xyplot(wt ~ mpg | gearfact, data = mtcars) # using lattice
bwplot(~mpg | gearfact, data = mtcars) # using lattice
boxplot(mtcars$mpg[mtcars$gear == "3"])

mygears <- table(gearfact)

barplot(mtcars$mpg)
barplot(mygears) # counts per gear; the source called an undefined `gears`

# Plots using ggplot2

# Install the packages this section needs, but only the ones that are missing.
# modeldata is included because it is attached below.
needed_pkgs <- c("ggplot2", "pacman", "modeldata")
is_installed <- vapply(
  needed_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(needed_pkgs[!is_installed])
}

library(datasets)
library(ggplot2)
library(modeldata)

View(crickets)

# Scatter plot coloured by species, with a ColorBrewer palette
ggplot(crickets, aes(x = temp, y = rate, color = species)) +
  geom_point() +
  labs(
    x = "Temperature", y = "Rate", title = "Cricket data",
    caption = "Source: McDonald (2009)", color = "Species"
  ) +
  scale_color_brewer(palette = "Dark2")

# Fixed point appearance: constants set in geom_point() override the mapped
# colour.
ggplot(crickets, aes(x = temp, y = rate, color = species)) +
  geom_point(color = "red", size = 3, alpha = 0.6, shape = "square") +
  labs(
    x = "Temperature", y = "Rate", title = "Cricket data",
    caption = "Source: McDonald (2009)", color = "Species"
  )

# Adding a regression line per species
ggplot(crickets, aes(x = temp, y = rate, color = species)) +
  geom_point() +
  labs(
    x = "Temperature", y = "Rate", title = "Cricket data",
    caption = "Source: McDonald (2009)", color = "Species"
  ) +
  scale_color_brewer(palette = "Dark2") +
  geom_smooth(method = "lm", se = FALSE)

# One regression line for all data plus the default smoother
ggplot(crickets, aes(x = temp, y = rate)) +
  geom_point() +
  labs(
    x = "Temperature", y = "Rate", title = "Cricket data",
    caption = "Source: McDonald (2009)", color = "Species"
  ) +
  scale_color_brewer(palette = "Dark2") +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(se = FALSE)

cor(crickets$temp, crickets$rate)
Corr <- cor(crickets$temp, crickets$rate)
Corr

# Show the correlation on the plot. annotate() draws the label once;
# geom_text(aes(...)) would draw it once per data row, on top of itself.
ggplot(crickets, aes(x = temp, y = rate)) +
  geom_point() +
  labs(
    x = "Temperature", y = "Rate", title = "Cricket data",
    caption = "Source: McDonald (2009)"
  ) +
  geom_smooth(method = "lm", se = FALSE) +
  annotate(
    "text",
    x = 20, y = 100,
    label = paste("Correlation:", round(Corr, 2))
  )

# Other plots: a single quantitative or qualitative variable

ggplot(crickets, aes(x = rate)) + geom_histogram(bins = 15)
ggplot(crickets, aes(x = rate)) + geom_histogram(binwidth = 5)
ggplot(crickets, aes(x = rate)) + geom_freqpoly(bins = 15)

ggplot(crickets, aes(x = species)) + geom_bar()
ggplot(crickets, aes(x = species)) + geom_bar(color = "Black", fill = "green")
ggplot(crickets, aes(x = species, fill = species)) + geom_bar()
ggplot(crickets, aes(x = species, fill = species)) +
  geom_bar() +
  scale_fill_brewer(palette = "Dark2")
# NOTE(review): the argument name was lost in extraction ("[Link] = F");
# show.legend is assumed -- confirm.
ggplot(crickets, aes(x = species, fill = species)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_brewer(palette = "Dark2")

# Boxplot: for one qualitative and one quantitative variable

ggplot(crickets, aes(x = species, y = rate)) + geom_boxplot()
ggplot(crickets, aes(x = species, y = rate, color = species)) + geom_boxplot()
ggplot(crickets, aes(x = species, y = rate, color = species)) + geom_violin()
ggplot(crickets, aes(x = species, y = rate, color = species)) +
  geom_boxplot() +
  scale_color_brewer(palette = "Dark2")

# theme_minimal() removes the grey background. The `+` must end the previous
# line; a line that starts with `+` is parsed as a separate statement and the
# theme is never added.
ggplot(crickets, aes(x = species, y = rate, color = species)) +
  geom_boxplot() +
  scale_color_brewer(palette = "Dark2") +
  theme_minimal()

# Faceting: one panel per subset of the data

# All species in a single panel
ggplot(data = crickets, mapping = aes(x = rate, fill = species)) +
  geom_histogram(bins = 15)

# One panel per species, default layout
ggplot(data = crickets, mapping = aes(x = rate, fill = species)) +
  geom_histogram(bins = 15) +
  facet_wrap(facets = ~species)

# One panel per species, stacked in a single column
ggplot(data = crickets, mapping = aes(x = rate, fill = species)) +
  geom_histogram(bins = 15) +
  facet_wrap(facets = ~species, ncol = 1)

# BOD data
library(dplyr) # attached here so that %>% is available below

View(BOD)
ggplot(BOD, aes(Time, demand)) + geom_point() + geom_smooth(method = "lm")

# %>% is called the pipe operator. It takes the output of the expression on
# its left and passes it as the first argument to the function on its right.
BOD %>%
  ggplot(aes(Time, demand)) +
  geom_point() +
  geom_smooth(method = "lm")

# Points joined by a solid red line, then by a dashed one (linetype = 2)
ggplot(BOD, aes(Time, demand)) + geom_point() + geom_line(color = "red")
ggplot(BOD, aes(Time, demand)) +
  geom_point() +
  geom_line(color = "red", linetype = 2)

# ggplot for the mtcars dataset

View(mtcars)

# Colour mapped to displacement
ggplot(mtcars, aes(x = hp, y = mpg, color = disp)) +
  geom_point() +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horsepower",
    y = "Miles per Gallon"
  )

# Add size: a constant belongs in the geom, not in aes(), where it would be
# treated as a mapped variable and get a legend.
ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point(size = 3) +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horsepower",
    y = "Miles per Gallon"
  )

# Add color
ggplot(mtcars, aes(x = hp, y = mpg, color = factor(cyl))) +
  geom_point() +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horsepower",
    y = "Miles per Gallon"
  )

# Add shape
ggplot(mtcars, aes(x = hp, y = mpg, shape = factor(cyl))) +
  geom_point() +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horsepower",
    y = "Miles per Gallon"
  )

# Add facet

# Facet layer: use facet_grid() when there is more than one discrete
# variable, otherwise use facet_wrap().
p <- ggplot(mtcars, aes(x = hp, y = mpg, shape = factor(cyl))) + geom_point()

# Separate rows according to transmission type. `am ~ .` puts am on the rows;
# the source's `~am` would have produced columns instead.
p + facet_grid(am ~ .) +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horsepower",
    y = "Miles per Gallon"
  )

# Separate columns according to cylinders
p + facet_grid(. ~ cyl) +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horsepower",
    y = "Miles per Gallon"
  )

# Coordinate layers (how data is mapped on the plotting plane, including axis
# scales, aspect ratios, transformations)

ggplot(data = mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  scale_y_continuous("Miles per Gallon", limits = c(2, 35), expand = c(0, 0)) +
  scale_x_continuous("Weight", limits = c(0, 25), expand = c(0, 0)) +
  coord_equal() +
  labs(
    title = "Miles per Gallon vs Weight",
    x = "Weight",
    y = "Miles per Gallon"
  )

# Add coord_cartesian() to zoom in properly
ggplot(mtcars, aes(x = wt, y = hp, color = am)) +
  geom_point() +
  geom_smooth() +
  coord_cartesian(xlim = c(3, 6))

# Theme layer
# NOTE(review): the theme element name was lost in extraction
# ("[Link] = element_rect(...)"); plot.background is an assumption --
# panel.background is the other plausible candidate. Confirm.
ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point() +
  facet_grid(~cyl) +
  theme(plot.background = element_rect(fill = "blue", colour = "gray")) +
  labs(title = "Miles per Gallon vs Horsepower")

# Facet example when there is more than one categorical variable
ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point() +
  facet_grid(am ~ cyl) +
  theme_gray() +
  labs(title = "Miles per Gallon vs Horsepower")

# Contour plots
# Create a 2D density contour plot for the mtcars dataset.
# after_stat(level) replaces the deprecated `..level..` notation.
ggplot(mtcars, aes(x = wt, y = mpg)) +
  stat_density_2d(
    aes(fill = after_stat(level)),
    geom = "polygon", color = "white"
  ) +
  scale_fill_viridis_c() +
  labs(
    title = "2D Density Contour Plot of mtcars Dataset",
    x = "Weight (wt)",
    y = "Miles per Gallon (mpg)",
    fill = "Density"
  ) +
  theme_minimal()

# Creating a panel of plots

library(ggplot2)
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra")
}
library(gridExtra)

# Select specific columns from the mtcars dataset. The source listed the
# column names twice and never created selected_data, which every plot below
# uses.
selected_cols <- c("mpg", "disp", "hp", "drat")
selected_data <- mtcars[, selected_cols]

# Create histograms for individual variables
hist_plot_mpg <- ggplot(selected_data, aes(x = mpg)) +
  geom_histogram(binwidth = 2, fill = "blue", color = "white") +
  labs(
    title = "Histogram: Miles per Gallon",
    x = "Miles per Gallon", y = "Frequency"
  )

hist_plot_disp <- ggplot(selected_data, aes(x = disp)) +
  geom_histogram(binwidth = 50, fill = "red", color = "white") +
  labs(
    title = "Histogram: Displacement",
    x = "Displacement", y = "Frequency"
  )

hist_plot_hp <- ggplot(selected_data, aes(x = hp)) +
  geom_histogram(binwidth = 20, fill = "green", color = "white") +
  labs(
    title = "Histogram: Horsepower",
    x = "Horsepower", y = "Frequency"
  )

hist_plot_drat <- ggplot(selected_data, aes(x = drat)) +
  geom_histogram(binwidth = 0.5, fill = "orange", color = "white") +
  labs(title = "Histogram: Drat", x = "Drat", y = "Frequency")

# Arrange the plots in a 2-column grid
grid.arrange(
  hist_plot_mpg, hist_plot_disp, hist_plot_hp, hist_plot_drat,
  ncol = 2
)

# Save and extract R plots

# Create a plot. The variable keeps the source's name `plot`; it does not
# stop plot() calls from working, because R looks up functions separately.
plot <- ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point() +
  labs(title = "Miles per Gallon vs Horsepower")

# NOTE(review): both file names were lost in extraction ("[Link]"); the names
# below are assumed from the PNG / PDF comments -- confirm.

# Save the plot as an image file (e.g., PNG)
ggsave("plot.png", plot)

# Save the plot as a PDF file
ggsave("plot.pdf", plot)

# Extract the plot as a variable for further use
extracted_plot <- plot
plot

# Making maps
# We need shape files to plot maps.
# Useful websites to download shape files of different countries:
# NOTE(review): the two URLs listed here were lost in extraction ("[Link]");
# restore them from the original document.

# Install the mapping packages that are missing, then attach them.
# Installing has to come before library().
map_pkgs <- c("ggmap", "sf", "ggplot2", "dplyr", "readxl")
map_pkgs_present <- vapply(
  map_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(map_pkgs_present)) {
  install.packages(map_pkgs[!map_pkgs_present])
}

library(ggmap)
library(sf)
library(ggplot2)
library(dplyr)
library(readxl)

# Replace with the path to your shapefile or GeoJSON file
india_map <- st_read("path_to_india_shapefile_or_geojson")

ggplot(data = india_map) +
  geom_sf() +
  theme_minimal() +
  labs(title = "Political Map of India")

# Load your data (e.g., literacy rates)
literacy_data <- read_excel("path_to_literacy_data.xlsx")

# Merge spatial data with your dataset.
# Adjust the column names in `by` as needed.
india_data <- india_map %>%
  left_join(literacy_data, by = c("ST_NM" = "State_Name"))

# Plot the map with additional data
ggplot(data = india_data) +
  geom_sf(aes(fill = Literacy_Rate)) +
  scale_fill_viridis_c() +
  theme_minimal() +
  labs(title = "Literacy Rate by State in India", fill = "Literacy Rate")

# Taxi trip data
# NOTE(review): several names in this section were lost in extraction
# ("[Link]"). The file names and the trip_coords object name below are
# reconstructions -- TODO confirm each against the original script and disk.

# Earlier attempt, kept as a comment (object name lost):
# <name>.2012 <- read_csv('Users/abhishek/Downloads/yellow_tripdata_2015-01.csv')
#   # 'data/san_diego_crime_2012.rds'

library(readxl)
yellow_tripdata_2015_01 <- read_excel(
  "Downloads/yellow_tripdata_2015-01.xlsx" # assumed file name
)
View(yellow_tripdata_2015_01)

yellow_tripdata_2015_01_copy <- read_excel(
  "Downloads/yellow_tripdata_2015-01 copy.xlsx" # assumed file name
)
View(yellow_tripdata_2015_01_copy)

# Extract data for coordinates on the map.
# Earlier attempt, kept as a comment:
# <name> <- filter(yellow_tripdata_2015_01_copy, -70 <= lon & lon <= 75)
#   # , 39 <= latt & latt <= 41)
#
# The source called filter() on the bare `lon` vector with no condition,
# which cannot work. Keeping the rows that have both coordinates is an
# assumption about the intent -- TODO confirm.
trip_coords <- dplyr::filter(
  yellow_tripdata_2015_01_copy,
  !is.na(lon), !is.na(latt)
)
View(trip_coords)

# Quick plot
qmplot(x = lon, y = latt, data = yellow_tripdata_2015_01_copy)

# Child mortality rate data

library(dplyr)
library(tidyr)
library(readxl)

# The path was split across two lines in the source and the file name was
# lost ("[Link]").
# NOTE(review): "mortality.xlsx" is an assumed file name -- TODO confirm.
mort <- read_excel(
  "Documents/IIT Kharagpur /CEP/Data Visualization with R/mortality.xlsx",
  sheet = "Worksheet"
)
View(mort)

# Name the first column, which is read in as "...1"
mort <- mort %>% dplyr::rename(country = "...1")

# pivot_longer: reorganized (1760, 1761, 1762, 1763, 1764, ...) into
# (year, morts) [was 197x255, now 50038x3]
long <- mort %>%
  pivot_longer(cols = -country, names_to = "year", values_to = "morts")
View(long)

# mutate: converted 'year' from character to double (0 new NA)
long <- long %>% mutate(year = as.numeric(year))

sweden_long <- long %>% filter(country == "Sweden")

# The same series drawn as points, as lines, and with a smoother
ggplot(sweden_long, aes(x = year, y = morts)) + geom_point()
ggplot(sweden_long, aes(x = year, y = morts)) +
  geom_point(shape = 2, color = "red")
ggplot(sweden_long, aes(x = year, y = morts)) + geom_line()
ggplot(sweden_long, aes(x = year, y = morts)) + geom_line(linetype = 2)
ggplot(sweden_long, aes(x = year, y = morts)) + geom_line() + geom_smooth()

# Keep five countries for the comparison plots
sub <- long %>%
  filter(
    country %in% c(
      "United States", "United Kingdom", "Sweden",
      "Afghanistan", "Rwanda"
    )
  )

# One line per country, then the same grouping drawn as boxplots
ggplot(sub, aes(x = year, y = morts, group = country)) +
  geom_line()
ggplot(sub, aes(x = year, y = morts, group = country)) +
  geom_boxplot()

# Titles and axis labels
ggplot(sub, aes(x = year, y = morts, group = country)) +
  geom_line() +
  labs(
    x = "Year", y = "Mortality Rate", title = "Child Mortality Rates",
    subtitle = "Stratified by Country"
  )

# Restrict both axes
ggplot(sub, aes(x = year, y = morts, group = country)) +
  geom_line() +
  labs(
    x = "Year", y = "Mortality Rate", title = "Child Mortality Rates",
    subtitle = "Stratified by Country"
  ) +
  xlim(c(1900, 2000)) +
  ylim(c(0, 1.5))

# Custom axis breaks
ggplot(sub, aes(x = year, y = morts, group = country)) +
  geom_line() +
  labs(
    x = "Year", y = "Mortality Rate", title = "Child Mortality Rates",
    subtitle = "Stratified by Country"
  ) +
  scale_x_continuous(breaks = seq(1750, 2100, by = 50)) +
  scale_y_continuous(breaks = seq(0, 5, by = 0.5))

# Point shape by country: default shapes, then shapes chosen by hand
sub %>%
  ggplot(aes(x = year, y = morts)) +
  geom_point(aes(shape = country))

sub %>%
  ggplot(aes(x = year, y = morts)) +
  geom_point(aes(shape = country)) +
  scale_shape_manual(
    values = c(
      "United States" = 1, "United Kingdom" = 2,
      "Sweden" = 3, "Afghanistan" = 4,
      "Rwanda" = 5
    )
  )

# Line type by country: default types, then types chosen by hand
sub %>%
  ggplot(aes(x = year, y = morts)) +
  geom_line(aes(linetype = country))

sub %>%
  ggplot(aes(x = year, y = morts)) +
  geom_line(aes(linetype = country)) +
  scale_linetype_manual(
    values = c(
      "United States" = 1, "United Kingdom" = 2,
      "Sweden" = 3, "Afghanistan" = 4,
      "Rwanda" = 5
    )
  )

# Colour by country: default colours, then colours chosen by hand
ggplot(sub, aes(x = year, y = morts)) + geom_point(aes(color = country))
ggplot(sub, aes(x = year, y = morts)) +
  geom_point(aes(color = country)) +
  scale_color_manual(
    values = c(
      "United States" = 1, "United Kingdom" = 2,
      "Sweden" = 3, "Afghanistan" = 4,
      "Rwanda" = 5
    )
  )

# Legend placement.
# NOTE(review): the theme argument name was lost in extraction ("[Link]");
# legend.position is assumed from the values "top" and c(0.9, 0.9) -- confirm.
ggplot(sub, aes(x = year, y = morts)) +
  geom_point(aes(color = country)) +
  theme(legend.position = "top")
# NOTE(review): recent ggplot2 releases may warn about a numeric
# legend.position -- check against the installed version.
ggplot(sub, aes(x = year, y = morts)) +
  geom_point(aes(color = country)) +
  theme(legend.position = c(0.9, 0.9))

# Facet: one panel per country. The colour legend is turned off with
# guide = "none", which replaces the deprecated guide = FALSE.

ggplot(sub, aes(x = year, y = morts)) +
  geom_point() +
  geom_line(aes(color = country)) +
  facet_wrap(~country) +
  scale_color_discrete(guide = "none")

# Same plot with all panels in a single row
ggplot(sub, aes(x = year, y = morts)) +
  geom_point() +
  geom_line(aes(color = country)) +
  facet_wrap(~country, nrow = 1) +
  scale_color_discrete(guide = "none")

You might also like