Today I learnt in class: we started learning about interactive visualization using R. The tidyverse will be used to build the data science workflow.
# Attach every package this notebook relies on, installing any that are missing.
# requireNamespace() only checks whether a package is available (it does not
# attach it), so library() below is the single place where attaching happens
# and it stops with an error if the installation did not succeed.
packages <- c("DT", "ggiraph", "plotly", "tidyverse")
for (p in packages) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
# Read the exam results from the project-relative CSV file, then preview the
# first ten rows.
exam_data <- read_csv(file = "data/Exam_data.csv")
head(x = exam_data, n = 10)
# A tibble: 10 x 7
ID CLASS GENDER RACE ENGLISH MATHS SCIENCE
<chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 Student321 3I Male Malay 21 9 15
2 Student305 3I Female Malay 24 22 16
3 Student289 3H Male Chinese 26 16 16
4 Student227 3F Male Chinese 27 77 31
5 Student318 3I Male Malay 27 11 25
6 Student306 3I Female Malay 31 16 16
7 Student313 3I Male Chinese 31 21 25
8 Student316 3I Male Malay 31 18 27
9 Student312 3I Male Malay 33 19 15
10 Student297 3H Male Indian 34 49 37
# Per-column summary: length/class/mode for character columns, five-number
# summary plus mean for numeric columns.
summary(object = exam_data)
ID CLASS GENDER
Length:322 Length:322 Length:322
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
RACE ENGLISH MATHS SCIENCE
Length:322 Min. :21.00 Min. : 9.00 Min. :15.00
Class :character 1st Qu.:59.00 1st Qu.:58.00 1st Qu.:49.25
Mode :character Median :70.00 Median :74.00 Median :65.00
Mean :67.18 Mean :69.33 Mean :61.16
3rd Qu.:78.00 3rd Qu.:85.00 3rd Qu.:74.75
Max. :96.00 Max. :99.00 Max. :96.00
The dataset is made up of 7 variables with 322 observations:
# - ID is the unique key for each student.
# - CLASS, GENDER, RACE are categorical variables expressed as character.
# - ENGLISH, MATHS, SCIENCE are continuous variables defined as numeric values
#   with decimal points.

# Stacked histogram of MATHS scores with one fill colour per GENDER.
# 20 bins are used, with a bin edge forced at 50 and a dark grey outline on
# every bar.
math_hist2 <- ggplot(data = exam_data, aes(x = MATHS, fill = GENDER)) +
  geom_histogram(
    bins = 20,
    boundary = 50,
    color = "grey30",
    position = "stack"
  ) +
  ggtitle("Distribution of Math Scores by Gender")
# Convert the static ggplot into an interactive plotly widget.
ggplotly(math_hist2)
# Bar chart counting the number of students in each RACE category.
race_bar <- ggplot(exam_data, aes(x = RACE))
race_bar <- race_bar +
  geom_bar() +
  ggtitle("Distribution of Race")
# Hand the finished ggplot to plotly to make it interactive.
ggplotly(race_bar)
# Static dot plot of MATHS scores, dots coloured by RACE and stacked across
# the RACE groups.
# method = "histodot" uses fixed-width bins that are common to every group, so
# the stacked dots line up in shared columns; with the default "dotdensity"
# method each group gets its own bin centres and the stacks do not align.
# This is also the binning used by the interactive version of this plot.
math_dot <- ggplot(data = exam_data, aes(x = MATHS, fill = RACE)) +
  geom_dotplot(
    method = "histodot",
    binwidth = 2.5,
    dotsize = 0.5,
    stackgroups = TRUE
  ) +
  # The y axis of a dot plot binned along x is not meaningful, so hide its
  # title and breaks.
  scale_y_continuous(NULL, breaks = NULL) +
  ggtitle("Distribution of Math Scores by Race")
math_dot

# Box plot of MATHS scores for each GENDER, with the individual observations
# drawn on top as small jittered points.
math_box <- ggplot(exam_data, aes(x = GENDER, y = MATHS))
math_box <- math_box +
  geom_boxplot() +
  geom_point(size = 0.5, position = "jitter") +
  ggtitle("Distribution of Math Scores by Gender")
# Convert to an interactive plotly widget.
ggplotly(math_box)
# Interactive dot plot of MATHS scores coloured by RACE. Each dot carries the
# student's ID as its tooltip.
math_dot_interactive <- ggplot(exam_data, aes(x = MATHS, fill = RACE)) +
  geom_dotplot_interactive(
    aes(tooltip = ID),
    stackgroups = TRUE,
    dotsize = 0.5,
    binwidth = 2.5,
    method = "histodot"
  ) +
  # Drop the y-axis title and breaks.
  scale_y_continuous(name = NULL, breaks = NULL) +
  ggtitle("Distribution of Math Scores by Race")
# Render the plot as a girafe widget; the SVG height is 0.618 times its width.
girafe(ggobj = math_dot_interactive, width_svg = 6, height_svg = 6 * 0.618)
# Interactive dot plot of MATHS scores in which every dot is tagged with the
# student's CLASS, used both as the tooltip text and as the data_id.
# Note: this reassigns math_dot_interactive.
math_dot_interactive <- ggplot(exam_data, aes(x = MATHS)) +
  geom_dotplot_interactive(
    aes(tooltip = CLASS, data_id = CLASS),
    stackgroups = TRUE,
    binwidth = 1,
    method = "histodot"
  ) +
  # Drop the y-axis title and breaks.
  scale_y_continuous(name = NULL, breaks = NULL) +
  ggtitle("Distribution of Math Scores by Class")
# Render the plot as a girafe widget; the SVG height is 0.618 times its width.
girafe(ggobj = math_dot_interactive, width_svg = 6, height_svg = 6 * 0.618)