While boxplots have become the de facto standard for plotting the distribution of data, they are a vast oversimplification and may not show everything needed to evaluate the variation of data. This is particularly important for datasets which do not follow the Gaussian (“Normal”) distribution that most researchers have become accustomed to.
While density plots are helpful in this regard, they can be less aesthetically pleasing than boxplots and harder to interpret for those familiar with boxplots. Often the only ways to compare multiple groups of data with density plots are to facet slices of the data across plotting panes or to overlay density curves distinguished by colours and a legend. This approach is jarring for new users and leads to cluttered plots that are difficult to present to a wider audience.
Therefore violin plots are a powerful tool to assist researchers to visualise data, particularly in the quality checking and exploratory parts of an analysis. Violin plots have many benefits:
As shown below for the `iris` dataset, violin plots show distribution information that the boxplot is unable to.
# Load the built-in iris data and draw one box of sepal lengths per species.
data(iris)
with(iris, boxplot(
  Sepal.Length[Species == "setosa"],
  Sepal.Length[Species == "versicolor"],
  Sepal.Length[Species == "virginica"],
  names = c("setosa", "versicolor", "virginica")
))
library("vioplot")
# Sepal lengths of the three iris species drawn as violins, default settings.
with(iris, vioplot(
  Sepal.Length[Species == "setosa"],
  Sepal.Length[Species == "versicolor"],
  Sepal.Length[Species == "virginica"],
  names = c("setosa", "versicolor", "virginica")
))
However, as we can see here, the plot defaults are not aesthetically pleasing, with a rather glaring colour scheme unsuitable for professional or academic usage. Thus the plot default colours have been changed as shown here:
Plot colours can be further customised, as with the original `vioplot` package, using the `col` argument:
# One fill colour shared by every violin, set through col.
with(iris, vioplot(
  Sepal.Length[Species == "setosa"],
  Sepal.Length[Species == "versicolor"],
  Sepal.Length[Species == "virginica"],
  names = c("setosa", "versicolor", "virginica"),
  main = "Sepal Length",
  col = "lightblue"
))
However, the `vioplot` (0.2) function is unable to colour each violin separately, so this is enabled with a vectorised `col` argument in `vioplot` (0.3):
# A vector passed to col gives each violin its own fill colour; the legend
# repeats the same colours in the same species order.
with(iris, vioplot(
  Sepal.Length[Species == "setosa"],
  Sepal.Length[Species == "versicolor"],
  Sepal.Length[Species == "virginica"],
  names = c("setosa", "versicolor", "virginica"),
  main = "Sepal Length",
  col = c("lightgreen", "lightblue", "palevioletred")
))
legend(
  "topleft",
  legend = c("setosa", "versicolor", "virginica"),
  fill = c("lightgreen", "lightblue", "palevioletred"),
  cex = 0.5
)
Colours can also be customised for the violin fill and border separately using the `col` and `border` arguments:
# Violin fill (col) and violin outline (border) are coloured independently.
with(iris, vioplot(
  Sepal.Length[Species == "setosa"],
  Sepal.Length[Species == "versicolor"],
  Sepal.Length[Species == "virginica"],
  names = c("setosa", "versicolor", "virginica"),
  main = "Sepal Length",
  col = "lightblue",
  border = "royalblue"
))
Similarly, the arguments `lineCol` and `rectCol` specify the colours of the boxplot outline and rectangle fill. For simplicity, the box and whiskers of the boxplot will always have the same colour.
# Colour the inner boxplot: rectCol for the rectangle fill, lineCol for its
# outline.
with(iris, vioplot(
  Sepal.Length[Species == "setosa"],
  Sepal.Length[Species == "versicolor"],
  Sepal.Length[Species == "virginica"],
  names = c("setosa", "versicolor", "virginica"),
  main = "Sepal Length",
  rectCol = "palevioletred",
  lineCol = "violetred"
))
The same applies to the colour of the median point with `colMed`:
# colMed sets the colour of the median point.
with(iris, vioplot(
  Sepal.Length[Species == "setosa"],
  Sepal.Length[Species == "versicolor"],
  Sepal.Length[Species == "virginica"],
  names = c("setosa", "versicolor", "virginica"),
  main = "Sepal Length",
  colMed = "violet"
))
### Combined customisation
These customised colours can be combined:
# All of the colour arguments used together in a single call.
with(iris, vioplot(
  Sepal.Length[Species == "setosa"],
  Sepal.Length[Species == "versicolor"],
  Sepal.Length[Species == "virginica"],
  names = c("setosa", "versicolor", "virginica"),
  main = "Sepal Length",
  col = "lightblue",
  border = "royalblue",
  rectCol = "palevioletred",
  lineCol = "violetred",
  colMed = "violet"
))
These colour and shape settings can also be customised separately for each violin:
# Every colour argument, and the median symbol (pchMed), is given as a
# length-3 vector so each violin is styled separately. areaEqual is spelled
# TRUE rather than T, because T is an ordinary variable that can be
# reassigned.
vioplot(
  iris$Sepal.Length[iris$Species == "setosa"],
  iris$Sepal.Length[iris$Species == "versicolor"],
  iris$Sepal.Length[iris$Species == "virginica"],
  names = c("setosa", "versicolor", "virginica"),
  main = "Sepal Length (Equal Area)",
  areaEqual = TRUE,
  col = c("lightgreen", "lightblue", "palevioletred"),
  border = c("darkolivegreen4", "royalblue4", "violetred4"),
  rectCol = c("forestgreen", "blue", "palevioletred3"),
  lineCol = c("darkolivegreen", "royalblue", "violetred4"),
  colMed = c("green", "cyan", "magenta"),
  pchMed = c(15, 17, 19)
)
This should be sufficient to customise the violin plot but further examples are given in the areaEqual vioplot vignette including how violin plots are useful for comparing variation when data does not follow the same distribution. This document also compares the violin plot with other established methods to plot data variation.
Here we demonstrate additional annotation features to display outliers and group sizes.
Note that the y-axis limits need to be adjusted to avoid overlaying text.
# Violin plot with a legend and group-size labels. The y-axis runs from 0 to
# 10% above the largest sepal length. iris stays attached because later code
# refers to its columns by bare name.
data("iris")
attach(iris)
with(iris, vioplot(
  Sepal.Length[Species == "setosa"],
  Sepal.Length[Species == "versicolor"],
  Sepal.Length[Species == "virginica"],
  main = "Sepal Length",
  ylab = "",
  col = c("lightgreen", "lightblue", "palevioletred"),
  ylim = c(0, max(Sepal.Length) * 1.1)
))
legend(
  "bottomright",
  legend = c("setosa", "versicolor", "virginica"),
  fill = c("lightgreen", "lightblue", "palevioletred"),
  cex = 0.8
)
add_labels(unlist(iris$Sepal.Length), iris$Species, height = 0.5, cex = 0.8)
#### Plotting outliers and medians
Here we add outliers and show annotation features.
# add outliers to demo data
# Two extreme observations per species are appended in a single rbind() of a
# properly typed data frame. Binding character vectors row by row would turn
# every measurement column into character, so the columns keep their numeric
# type here and Species keeps the original factor levels.
iris2 <- rbind(
  iris,
  data.frame(
    Sepal.Length = c(7, 1, 9, 2, 10, 12),
    Sepal.Width = c(1, 10, 2, 12, 1, 7),
    Petal.Length = 0,
    Petal.Width = 0,
    Species = factor(
      rep(c("setosa", "versicolor", "virginica"), each = 2),
      levels = levels(iris$Species)
    )
  )
)
table(iris2$Species)
##
## setosa versicolor virginica
## 52 52 52
This adds outliers to the plot.
## The following objects are masked from iris:
##
## Petal.Length, Petal.Width, Sepal.Length, Sepal.Width, Species
# Violin plot of the outlier-augmented data (iris2) with medians, outliers and
# group labels. Values and grouping both come from iris2: indexing iris2 with
# the shorter iris$Species recycles the mask and puts rows in the wrong
# violin. Explicit iris2$ references also make the result independent of
# whichever data frame is currently attached.
vioplot(
  iris2$Sepal.Length[iris2$Species == "setosa"],
  iris2$Sepal.Length[iris2$Species == "versicolor"],
  iris2$Sepal.Length[iris2$Species == "virginica"],
  main = "Sepal Length",
  col = c("lightgreen", "lightblue", "palevioletred"),
  ylim = c(min(iris2$Sepal.Length) * 0.9, max(iris2$Sepal.Length) * 1.1),
  names = c("setosa", "versicolor", "virginica")
)
# One median per species, in factor-level order to match violins 1 to 3.
Sepal.medians <- vapply(
  levels(iris2$Species),
  function(sp) median(iris2$Sepal.Length[iris2$Species == sp]),
  numeric(1)
)
# highlights medians
points(
  x = seq_along(Sepal.medians), y = Sepal.medians,
  pch = 21, cex = 1.25, lwd = 2,
  col = "white", bg = c("forestgreen", "lightblue4", "palevioletred4")
)
# plots outliers above 2 SD
add_outliers(
  unlist(iris2$Sepal.Length), iris2$Species,
  cutoff = 2,
  col = "black", bars = "grey85", lwd = 2,
  fill = c("palegreen3", "lightblue3", "palevioletred3")
)
legend(
  "bottomright",
  legend = c("setosa", "versicolor", "virginica"),
  fill = c("lightgreen", "lightblue", "palevioletred"),
  cex = 0.6
)
add_labels(unlist(iris2$Sepal.Length), iris2$Species, height = 0.5, cex = 0.8)
Annotations on split violins are shown here. See the split violin plot vignette for details on these parameters.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.800 3.000 3.151 3.400 12.000
##
## FALSE TRUE
## 97 59
# Split iris2 at its mean sepal width: rows strictly above the mean go to
# iris_large, all remaining rows (ties included) go to iris_small.
is_wide <- iris2$Sepal.Width > mean(iris2$Sepal.Width)
iris_large <- iris2[is_wide, ]
iris_small <- iris2[!is_wide, ]
# Expose the columns of iris_large by bare name for the code that follows.
attach(iris_large)
## The following objects are masked from iris2:
##
## Petal.Length, Petal.Width, Sepal.Length, Sepal.Width, Species
## The following objects are masked from iris:
##
## Petal.Length, Petal.Width, Sepal.Length, Sepal.Width, Species
# Right-hand half violins for the wide-sepal subset (iris_large). The y-axis
# range is computed from the full iris2 data.
vioplot(
  iris_large$Sepal.Length[iris_large$Species == "setosa"],
  iris_large$Sepal.Length[iris_large$Species == "versicolor"],
  iris_large$Sepal.Length[iris_large$Species == "virginica"],
  plotCentre = "line", side = "right",
  col = c("lightgreen", "lightblue", "palevioletred"),
  ylim = c(min(iris2$Sepal.Length) * 0.9, max(iris2$Sepal.Length) * 1.1),
  names = c("setosa", "versicolor", "virginica")
)
# One median per species, in factor-level order to match violins 1 to 3.
# Species is taken from iris_large explicitly rather than from the search path.
Sepal.medians <- vapply(
  levels(iris_large$Species),
  function(sp) median(iris_large$Sepal.Length[iris_large$Species == sp]),
  numeric(1)
)
# highlights medians
points(
  x = seq_along(Sepal.medians), y = Sepal.medians,
  pch = 21, cex = 1.25, lwd = 2,
  col = "white", bg = c("forestgreen", "lightblue4", "palevioletred4")
)
# plots outliers above 2 SD
# Values and grouping factor both come from iris_large so their lengths match.
add_outliers(
  unlist(iris_large$Sepal.Length), iris_large$Species,
  cutoff = 2,
  col = c("palegreen3", "lightblue3", "palevioletred3"),
  bars = "grey85", lwd = 2,
  fill = "grey85"
)
legend(
  "bottomright",
  legend = c("setosa", "versicolor", "virginica"),
  fill = c("lightgreen", "lightblue", "palevioletred"),
  cex = 0.6
)
add_labels(unlist(iris2$Sepal.Length), iris2$Species, height = 0.5, cex = 0.8)
# Put the columns of iris_small on the search path; they mask the identically
# named columns attached earlier (see the messages that follow).
attach(iris_small)
## The following objects are masked from iris_large:
##
## Petal.Length, Petal.Width, Sepal.Length, Sepal.Width, Species
## The following objects are masked from iris2:
##
## Petal.Length, Petal.Width, Sepal.Length, Sepal.Width, Species
## The following objects are masked from iris:
##
## Petal.Length, Petal.Width, Sepal.Length, Sepal.Width, Species
# Left-hand half violins for the narrow-sepal subset (iris_small), added on
# top of the existing plot. add is spelled TRUE rather than the reassignable
# T, and ylim uses the same iris2-based range as the first call.
vioplot(
  iris_small$Sepal.Length[iris_small$Species == "setosa"],
  iris_small$Sepal.Length[iris_small$Species == "versicolor"],
  iris_small$Sepal.Length[iris_small$Species == "virginica"],
  plotCentre = "line", side = "left", add = TRUE,
  col = c("palegreen1", "lightblue1", "palevioletred1"),
  ylim = c(min(iris2$Sepal.Length) * 0.9, max(iris2$Sepal.Length) * 1.1),
  names = c("setosa", "versicolor", "virginica")
)
## Warning in vioplot.default(iris_small$Sepal.Length[iris_small$Species == : Warning: names can only be changed on first call of vioplot (when add = FALSE)
# One median per species, in factor-level order to match violins 1 to 3.
# Species is taken from iris_small explicitly rather than from the search path.
Sepal.medians <- vapply(
  levels(iris_small$Species),
  function(sp) median(iris_small$Sepal.Length[iris_small$Species == sp]),
  numeric(1)
)
# highlights medians
points(
  x = seq_along(Sepal.medians), y = Sepal.medians,
  pch = 21, cex = 1.25, lwd = 2,
  col = "white", bg = c("forestgreen", "lightblue4", "palevioletred4")
)
# plots outliers above 2 SD
add_outliers(
  unlist(iris2$Sepal.Length), iris2$Species,
  cutoff = 2,
  col = c("palegreen3", "lightblue3", "palevioletred3"),
  bars = "grey85", lwd = 2,
  fill = "grey50"
)
legend(
  "bottomright",
  legend = c("setosa", "versicolor", "virginica"),
  fill = c("lightgreen", "lightblue", "palevioletred"),
  cex = 0.6
)
add_labels(unlist(iris2$Sepal.Length), iris2$Species, height = 0.5, cex = 0.8)
# add legend and titles
legend(
  "topleft",
  fill = c("lightblue2", "lightblue3"),
  legend = c("small", "large"),
  title = "Sepal Width"
)
title(xlab = "Species", ylab = "Sepal Length")