| | mpg | cyl | disp | am | gear |
---|---|---|---|---|---|
Mazda RX4 | 21 | 6 | 160 | 1 | 4 |
Mazda RX4 Wag | 21 | 6 | 160 | 1 | 4 |
Datsun 710 | 22.8 | 4 | 108 | 1 | 4 |
Hornet 4 Drive | 21.4 | 6 | 258 | 0 | 3 |
Hornet Sportabout | 18.7 | 8 | 360 | 0 | 3 |
Valiant | 18.1 | 6 | 225 | 0 | 3 |
Duster 360 | 14.3 | 8 | 360 | 0 | 3 |
mtcars$mpg
Returns a vector of the values of the mpg
variable in top-to-bottom row order.
For a comma-separated values file you can simply call:
mydata <- read.csv("patterns.csv")
Import functions exist for Excel
library(xlsx)
mydata <- read.xlsx("patterns.xlsx", sheetName = "all-data")
and SPSS, SAS and several other file formats.
We can get dataframe information by column, row or create a slice.
# by column
mtcars$gear
mtcars[,"gear"]
# by row
mtcars[1,]
mtcars["Fiat 128",]
# dataframe slice
mtcars[c("gear", "mpg")]
Let's plot the number of car models by their cylinder count.
p <- ggplot(mtcars, aes(x=factor(cyl))) + geom_bar()
plot(p)
This plots the count of cars that share each cylinder count.
Let's plot miles per gallon against cylinder count as points.
p <- ggplot(mtcars, aes(x=factor(cyl), y=mpg)) + geom_point()
plot(p)
A point geometry requires both `x` and `y` aesthetics.
Let's plot miles per gallon against weight as a line.
p <- ggplot(mtcars, aes(x=wt, y=mpg)) + geom_line()
plot(p)
p <- ggplot(mtcars, aes(x=factor(cyl), y=mpg))
p + geom_boxplot()
ggplot(mtcars, aes(x=mpg)) + geom_histogram(binwidth=1)
A piechart is a barchart with a single stacked bar
pie <- ggplot(mtcars, aes(x = 1, fill = factor(cyl))) + geom_bar(position="stack")
Plotted on a different coordinate space i.e. polar.
pie <- ggplot(mtcars, aes(x = 1, fill = factor(cyl))) + geom_bar(position="stack") + coord_polar(theta = "y")
Plot a histogram with an overlaid normal distribution:
# Histogram of mpg on the density scale with a normal curve overlaid.
# The curve's mean and sd are computed from the mpg column itself.
ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(aes(y = ..density..), binwidth = 1) +
  stat_function(
    fun = dnorm,
    # Colour is mapped inside aes() rather than set as a constant, so the
    # curve gets a legend entry (positioned by theme() below).
    aes(colour = "red"),
    # `args` is documented as a list of extra arguments passed on to `fun`.
    args = list(mean = mean(mtcars$mpg), sd = sd(mtcars$mpg))
  ) +
  labs(x = "Miles per gallon") +
  # Legend placement is a theme setting. Passing it to labs() would not move
  # the legend, because labs() only sets labels.
  theme(legend.position = "bottom", legend.direction = "horizontal")
Or similarly, let's use mean mpg as our `y` aesthetic. First we have to reshape our data.
# library() stops with an error if a package is missing; require() would only
# return FALSE and let the script fail later with a less obvious message.
library(ggplot2)
library(reshape2)

# Mean mpg per cylinder count, melted into a long data frame with the
# columns `cyl` and `mean`.
plot.data <- melt(
  tapply(mtcars$mpg, factor(mtcars$cyl), mean),
  varnames = "cyl",
  value.name = "mean"
)

# stat = "identity" draws each bar at the precomputed mean, not a row count.
ggplot(plot.data, aes(x = factor(cyl), y = mean)) +
  geom_bar(stat = "identity")
Or
# Let stat_summary() compute the mean mpg per cylinder group and draw it as
# bars. `fun` replaces the deprecated `fun.y` argument (ggplot2 >= 3.3.0).
ggplot(mtcars, aes(y = mpg, x = factor(cyl), group = factor(cyl))) +
  stat_summary(fun = mean, geom = "bar")
ggplot(mtcars, aes(x=mpg, y=disp)) + geom_point()
ggplot(mtcars, aes(x=mpg, y=disp)) + geom_point() + geom_smooth()
ggplot(mtcars, aes(x=mpg, y=disp)) + geom_point() + geom_smooth() + coord_flip()
ggplot(mtcars, aes(x=mpg, y=disp)) + geom_point(aes(color=factor(am))) + geom_smooth() + coord_flip()
ggplot(mtcars, aes(x=mpg, y=disp)) + geom_point(aes(color=factor(am))) + stat_smooth(method="lm") + coord_flip()
# Plot a fitted linear regression with its key statistics in the title.
#
# fit: a fitted model as returned by lm(). The response is read from the
#      first column of fit$model and the predictor from the second, so only
#      the first predictor of the model is plotted.
# Returns a ggplot object: the model data as points, a red "lm" smooth, and a
# title giving adjusted R-squared, intercept, slope and the slope's p-value.
ggplotRegression <- function (fit) {
  model_data <- fit$model
  y_name <- names(model_data)[1]
  x_name <- names(model_data)[2]

  # Summarise once and use the coef() accessors rather than `$coef`, which
  # only works through partial matching of `coefficients`.
  fit_summary <- summary(fit)
  estimates <- coef(fit)
  coef_table <- coef(fit_summary)

  # .data[[name]] looks columns up by their literal name. aes_string() is
  # deprecated and would parse a name such as "log(mpg)" as an expression.
  ggplot(model_data, aes(x = .data[[x_name]], y = .data[[y_name]])) +
    geom_point() +
    stat_smooth(method = "lm", col = "red") +
    # Axis labels are set explicitly so they stay the plain column names.
    labs(x = x_name, y = y_name,
         title = paste("Adj R2 = ", signif(fit_summary$adj.r.squared, 5),
                       "Intercept =", signif(estimates[[1]], 5),
                       " Slope =", signif(estimates[[2]], 5),
                       " P =", signif(coef_table[2, 4], 5)))
}
fit <- lm(mpg~disp, data=mtcars)
ggplotRegression(fit)
ggplot(mtcars, aes(x=mpg, y=disp)) + geom_point()
This works because of sane defaults.
> p <- ggplot(mtcars, aes(x=cyl))
> summary(p)
data: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb, x [32x12]
mapping: x = cyl
faceting: facet_null()
We can add `count` as a statistic layer to our plot:
> q <- p + stat_count()
> summary(q)
data: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb, x [32x12]
mapping: x = cyl
faceting: facet_null()
-----------------------------------
geom_bar: na.rm = FALSE, width = NULL
stat_count: na.rm = FALSE, width = NULL
position_stack
We can choose a geometry to combine with our data and statistic:
> q <- p + geom_bar()
> summary(q)
data: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb, x [32x12]
mapping: x = cyl
faceting: facet_null()
-----------------------------------
geom_bar: na.rm = FALSE, width = NULL
stat_count: na.rm = FALSE, width = NULL
position_stack
We've made no difference yet, as adding `stat_count` gives a default of `geom_bar`.
# Three labelled points used to compare how each geometry draws the same data.
df <- data.frame(
  x = c(3, 1, 5),
  y = c(2, 4, 6),
  label = c("a", "b", "c")
)

# Shared base plot with both axis titles removed; each line below adds one
# geometry and uses that geometry's name as the plot title.
p <- ggplot(df, aes(x = x, y = y, label = label)) + labs(x = NULL, y = NULL)

p + geom_point() + ggtitle("geom_point")
p + geom_bar(stat = "identity") + ggtitle("geom_bar(stat=\"identity\")")
p + geom_line() + ggtitle("geom_line")
p + geom_area() + ggtitle("geom_area")
p + geom_path() + ggtitle("geom_path")
p + geom_text() + ggtitle("geom_text")
p + geom_tile() + ggtitle("geom_tile")
p + geom_polygon() + ggtitle("geom_polygon")
The "cheat sheet" has many more types.
Adding a scale modifies how an aesthetic is drawn, i.e. the axes or the legend:
p + scale_fill_brewer()
Implementing many of Tufte's guidelines becomes:
p + geom_bar(stat="identity") + labs(title = "geom_bar(stat=\"identity\")") + theme_minimal()
p + geom_bar(stat="identity") + labs(title = "geom_bar(stat=\"identity\")") + theme_bw()
or my favourite
# Draw each row of df as a hand-drawn rectangle spanning x to x + 1
# horizontally and 0 to y vertically, then apply the xkcd theme.
# The leading `+` characters on the continuation lines were console prompts,
# not operators; left in place they make the call a syntax error.
p + xkcdrect(
  aes(xmin = x, xmax = x + 1, ymin = 0, ymax = y),
  df
) + theme_xkcd()
There are three exercises for you to tackle, available at http://aidandelaney.github.io/handouts/2016DiagramsRTutorial-questions.pdf.
Becker, R, and J Chambers. 1984. S: An Interactive Environment for Data Analysis and Graphics. Wadsworth & Brooks/Cole.
Ihaka, R, and R Gentleman. 1996. “R: A Language for Data Analysis and Graphics.” Journal of Computational and Graphical Statistics, no. 5: 299–314.
Wickham, Hadley. 2010. “Ggplot2: Elegant Graphics for Data Analysis.” Journal of Statistical Software 35 (1).
Wilkinson, Leland. 2005. The Grammar of Graphics. 2nd ed. Springer-Verlag New York.