TIL: Week 8 In-Class Exercise

Today I learnt in class: We started learning about interactive visualization using R. The tidyverse is used to build the data science workflow.

Kelly Koh https://www.linkedin.com/in/kellykkw/ (School of Computing and Information Systems, Singapore Management University) https://scis.smu.edu.sg/
06-26-2021

Installing packages and loading data

Install the necessary packages if they are not already in the library

# Packages required by this exercise.
packages <- c("DT", "ggiraph", "plotly", "tidyverse")

# Install any package that is not yet available, then attach each one.
for (p in packages) {
  # requireNamespace() only checks whether the package can be loaded; it does
  # not attach it. Attaching is left to library(), which stops with an error
  # if the package still cannot be loaded after the installation attempt.
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  # TRUE is spelled out because T is an ordinary variable that can be
  # reassigned.
  library(p, character.only = TRUE)
}

Import data from csv and preview data

# Read the exam results from the CSV file.
exam_data <- read_csv(file = "data/Exam_data.csv")

# Show the first ten records as a quick check of the import.
head(x = exam_data, n = 10)
# A tibble: 10 x 7
   ID         CLASS GENDER RACE    ENGLISH MATHS SCIENCE
   <chr>      <chr> <chr>  <chr>     <dbl> <dbl>   <dbl>
 1 Student321 3I    Male   Malay        21     9      15
 2 Student305 3I    Female Malay        24    22      16
 3 Student289 3H    Male   Chinese      26    16      16
 4 Student227 3F    Male   Chinese      27    77      31
 5 Student318 3I    Male   Malay        27    11      25
 6 Student306 3I    Female Malay        31    16      16
 7 Student313 3I    Male   Chinese      31    21      25
 8 Student316 3I    Male   Malay        31    18      27
 9 Student312 3I    Male   Malay        33    19      15
10 Student297 3H    Male   Indian       34    49      37
# Per-column summary of the imported data.
summary(object = exam_data)
      ID               CLASS              GENDER         
 Length:322         Length:322         Length:322        
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
     RACE              ENGLISH          MATHS          SCIENCE     
 Length:322         Min.   :21.00   Min.   : 9.00   Min.   :15.00  
 Class :character   1st Qu.:59.00   1st Qu.:58.00   1st Qu.:49.25  
 Mode  :character   Median :70.00   Median :74.00   Median :65.00  
                    Mean   :67.18   Mean   :69.33   Mean   :61.16  
                    3rd Qu.:78.00   3rd Qu.:85.00   3rd Qu.:74.75  
                    Max.   :96.00   Max.   :99.00   Max.   :96.00  

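The dataset is made up of 7 variables and 322 observations: four character variables (ID, CLASS, GENDER, RACE) and three numeric score variables (ENGLISH, MATHS, SCIENCE).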

Basic charts using ggplot

Plot histogram of the Math scores

# Histogram of MATHS: 20 bins aligned to a bin boundary at 50, with the
# x axis fixed to 0-100 and a tick every 5 marks.
math_hist <- ggplot(exam_data, aes(x = MATHS)) +
  geom_histogram(
    fill = "light blue",
    colour = "black",
    boundary = 50,
    bins = 20
  ) +
  scale_x_continuous(
    name = "MATHS",
    breaks = seq(from = 0, to = 100, by = 5),
    limits = c(0, 100)
  ) +
  labs(title = "Distribution of Math Scores")

# Convert the static ggplot into a plotly object.
ggplotly(p = math_hist)

Plot histogram of the math scores by gender

# Histogram of MATHS with the bars filled by GENDER and stacked on top of
# each other; same binning as the overall histogram (20 bins, boundary at 50).
math_hist2 <- ggplot(exam_data, aes(x = MATHS, fill = GENDER)) +
  geom_histogram(
    position = "stack",
    colour = "grey30",
    boundary = 50,
    bins = 20
  ) +
  labs(title = "Distribution of Math Scores by Gender")

# Convert the static ggplot into a plotly object.
ggplotly(p = math_hist2)

Plot bar chart of race

# Bar chart counting the number of records in each RACE category.
race_bar <- ggplot(exam_data, aes(x = RACE)) +
  geom_bar() +
  labs(title = "Distribution of Race")

# Convert the static ggplot into a plotly object.
ggplotly(p = race_bar)

Plot dotplot of math scores by race

# Dot plot of MATHS with the dots filled by RACE and stacked across groups.
math_dot <- ggplot(data = exam_data,
                   aes(x = MATHS, fill = RACE)) +
  # method = "histodot" uses fixed-width bins shared by every group. With the
  # default "dotdensity" method the bin positions are computed per group, so
  # dots stacked across RACE groups would not line up. This also matches the
  # interactive version of this plot.
  geom_dotplot(method = "histodot",
               binwidth = 2.5,
               dotsize = 0.5,
               stackgroups = TRUE) +
  # The y axis of a dot plot carries no meaningful scale, so drop its title
  # and breaks.
  scale_y_continuous(NULL,
                     breaks = NULL) +
  ggtitle("Distribution of Math Scores by Race")

# Render the static plot.
math_dot

Plot boxplot of math scores by gender

# Box plot of MATHS for each GENDER, with the individual records overlaid as
# small jittered points.
math_box <- ggplot(exam_data, aes(x = GENDER, y = MATHS)) +
  geom_boxplot() +
  geom_point(size = 0.5, position = "jitter") +
  labs(title = "Distribution of Math Scores by Gender")

# Convert the static ggplot into a plotly object.
ggplotly(p = math_box)

Interactive ggplot using ggiraph

Plot interactive dotplot of math scores by race using ggiraph tooltip

# Interactive dot plot of MATHS filled by RACE; ID is mapped to the ggiraph
# tooltip aesthetic of each dot.
math_dot_interactive <- ggplot(exam_data, aes(x = MATHS, fill = RACE)) +
  geom_dotplot_interactive(
    aes(tooltip = ID),
    stackgroups = TRUE,
    dotsize = 0.5,
    binwidth = 2.5,
    method = "histodot"
  ) +
  # Drop the y axis title and breaks.
  scale_y_continuous(name = NULL, breaks = NULL) +
  labs(title = "Distribution of Math Scores by Race")

# Render as an SVG widget; the height is 0.618 times the width.
girafe(ggobj = math_dot_interactive, width_svg = 6, height_svg = 6 * 0.618)

Plot interactive dotplot of math scores using ggiraph data_id

# Interactive dot plot of MATHS; CLASS is mapped to both the ggiraph data_id
# and tooltip aesthetics of each dot.
math_dot_interactive <- ggplot(exam_data, aes(x = MATHS)) +
  geom_dotplot_interactive(
    aes(tooltip = CLASS, data_id = CLASS),
    stackgroups = TRUE,
    binwidth = 1,
    method = "histodot"
  ) +
  # Drop the y axis title and breaks.
  scale_y_continuous(name = NULL, breaks = NULL) +
  labs(title = "Distribution of Math Scores by Class")

# Render as an SVG widget; the height is 0.618 times the width.
girafe(ggobj = math_dot_interactive, width_svg = 6, height_svg = 6 * 0.618)