TIL: Week 10 In-Class Exercise

Today I learnt in class: how to use graph analytics to represent relationships and find connected components.

Kelly Koh https://www.linkedin.com/in/kellykkw/ (School of Computing and Information Systems, Singapore Management University) https://scis.smu.edu.sg/
07-10-2021

Installing packages and loading data

Install the necessary packages if they are not already installed, then load them

# Packages required by this exercise.
packages <- c("tidyverse",
              # Handling time variables
              "lubridate", "clock",
              # Network graphing
              "network", "igraph", "tidygraph", "ggraph", "visNetwork")

# Install any package that is missing, then attach every package in order.
for (p in packages) {
  # requireNamespace() only tests whether the package is installed;
  # the attaching itself is left to library(), which errors loudly on failure.
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}

Data cleaning

Import data from csv and preview data

# Read the three CSV inputs; the paths are relative to the working directory.
# NOTE(review): cc_data is not referenced again in the visible part of this
# file - confirm whether it is still needed.
cc_data <- read_csv("data/cc_data.csv")
# Email edge list and the accompanying node table.
GAStech_edges <- read_csv("data/GAStech_email_edge-v2.csv")
GAStech_nodes <- read_csv("data/GAStech_email_node.csv")

# Preview the columns and their types in the edge table.
glimpse(GAStech_edges)
Rows: 9,063
Columns: 8
$ source      <dbl> 43, 43, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, ~
$ target      <dbl> 41, 40, 51, 52, 53, 45, 44, 46, 48, 49, 47, 54, ~
$ SentDate    <chr> "6/1/2014", "6/1/2014", "6/1/2014", "6/1/2014", ~
$ SentTime    <time> 08:39:00, 08:39:00, 08:58:00, 08:58:00, 08:58:0~
$ Subject     <chr> "GT-SeismicProcessorPro Bug Report", "GT-Seismic~
$ MainSubject <chr> "Work related", "Work related", "Work related", ~
$ sourceLabel <chr> "Sven.Flecha", "Sven.Flecha", "Kanon.Herrero", "~
$ targetLabel <chr> "Isak.Baza", "Lucas.Alcazar", "Felix.Resumir", "~
glimpse(GAStech_nodes)
Rows: 54
Columns: 4
$ id         <dbl> 1, 2, 3, 4, 5, 6, 7, 44, 45, 46, 8, 9, 10, 11, 12~
$ label      <chr> "Mat.Bramar", "Anda.Ribera", "Rachel.Pantanal", "~
$ Department <chr> "Administration", "Administration", "Administrati~
$ Title      <chr> "Assistant to CEO", "Assistant to CFO", "Assistan~

SentDate was read in as a character column rather than a date, so it needs to be converted.

Change the format type of SentDate and add a new Weekday field

# Convert the day/month/year strings in SentDate to Date values, then derive
# the unabbreviated weekday name from the parsed date.
GAStech_edges <- GAStech_edges %>%
  mutate(
    SentDate = lubridate::dmy(SentDate),
    Weekday = lubridate::wday(SentDate, label = TRUE, abbr = FALSE)
  )

# Check the resulting column types.
glimpse(GAStech_edges)
Rows: 9,063
Columns: 9
$ source      <dbl> 43, 43, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, ~
$ target      <dbl> 41, 40, 51, 52, 53, 45, 44, 46, 48, 49, 47, 54, ~
$ SentDate    <date> 2014-01-06, 2014-01-06, 2014-01-06, 2014-01-06,~
$ SentTime    <time> 08:39:00, 08:39:00, 08:58:00, 08:58:00, 08:58:0~
$ Subject     <chr> "GT-SeismicProcessorPro Bug Report", "GT-Seismic~
$ MainSubject <chr> "Work related", "Work related", "Work related", ~
$ sourceLabel <chr> "Sven.Flecha", "Sven.Flecha", "Kanon.Herrero", "~
$ targetLabel <chr> "Isak.Baza", "Lucas.Alcazar", "Felix.Resumir", "~
$ Weekday     <ord> Monday, Monday, Monday, Monday, Monday, Monday, ~

Summarize the data to find the weight of the relationships

# Edge weights: the number of work-related emails for each
# sender / recipient / weekday combination.
GAStech_edges_aggregated <- GAStech_edges %>%
  # Keep work-related emails and discard emails sent to oneself up front;
  # source and target are grouping keys, so this yields the same groups.
  filter(MainSubject == "Work related", source != target) %>%
  group_by(source, target, Weekday) %>%
  summarise(Weight = n()) %>%
  # Keep only pairs that exchanged more than one email on that weekday.
  filter(Weight > 1) %>%
  # summarise() leaves the result grouped, so drop the grouping explicitly.
  ungroup()

glimpse(GAStech_edges_aggregated)
Rows: 1,456
Columns: 4
$ source  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
$ target  <dbl> 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6~
$ Weekday <ord> Monday, Tuesday, Wednesday, Friday, Monday, Tuesday,~
$ Weight  <int> 4, 3, 5, 8, 4, 3, 5, 8, 4, 3, 5, 8, 4, 3, 5, 8, 4, 3~

Using tbl_graph to build tidygraph data model

# Build the tidygraph object from the node table and the aggregated edges.
#
# tbl_graph() interprets numeric edge endpoints as ROW POSITIONS in `nodes`,
# not as values of the `id` column. GAStech_nodes is not sorted by id (its id
# column runs ..., 7, 44, 45, 46, 8, ...), so passing source/target directly
# would attach edges to the wrong people. The ids are therefore translated to
# row positions first.
# Assumes source/target hold GAStech_nodes$id values - the check below fails
# fast if that is not the case.
stopifnot(
  all(GAStech_edges_aggregated$source %in% GAStech_nodes$id),
  all(GAStech_edges_aggregated$target %in% GAStech_nodes$id)
)

GAStech_graph <- tbl_graph(
  nodes = GAStech_nodes,
  edges = GAStech_edges_aggregated %>%
    mutate(from = match(source, GAStech_nodes$id),
           to = match(target, GAStech_nodes$id)) %>%
    select(from, to, Weekday, Weight),
  directed = TRUE
)
GAStech_graph
# A tbl_graph: 54 nodes and 1456 edges
#
# A directed multigraph with 1 component
#
# Node Data: 54 x 4 (active)
     id label            Department     Title                         
  <dbl> <chr>            <chr>          <chr>                         
1     1 Mat.Bramar       Administration Assistant to CEO              
2     2 Anda.Ribera      Administration Assistant to CFO              
3     3 Rachel.Pantanal  Administration Assistant to CIO              
4     4 Linda.Lagos      Administration Assistant to COO              
5     5 Ruscella.Mies.H~ Administration Assistant to Engineering Grou~
6     6 Carla.Forluniau  Administration Assistant to IT Group Manager 
# ... with 48 more rows
#
# Edge Data: 1,456 x 4
   from    to Weekday   Weight
  <int> <int> <ord>      <int>
1     1     2 Monday         4
2     1     2 Tuesday        3
3     1     2 Wednesday      5
# ... with 1,453 more rows

Visualizing graph data

Visualize the data using ggraph which is wrapped over tidygraph

# Basic network plot using the Fruchterman-Reingold ("fr") layout.
# The colours and the node size are fixed values, so they are set outside
# aes(). Inside aes() they would be treated as data: ggplot2 would map the
# strings to its default palette and add meaningless legends.
g <- ggraph(GAStech_graph, layout = "fr") +
  geom_edge_link(edge_colour = "grey80") +
  geom_node_point(colour = "grey40", size = 3)

# Dark background with white text.
g + theme_graph(background = "grey10",
                text_colour = "white")

Modifying the node by referring to respective departments

# Colour each node by its department, using the "nicely" layout.
# Department is a data column, so it is mapped inside aes(); the node size is
# a fixed value and is set outside aes() so that it is applied literally and
# does not produce a legend.
g <- ggraph(GAStech_graph, layout = "nicely") +
  geom_edge_link() +
  geom_node_point(aes(colour = Department), size = 3)

g + theme_graph()

Modifying the edges by mapping their width to the Weight of the relationship

# "nicely" layout with edge width proportional to the email count (Weight).
g <- ggraph(GAStech_graph, layout = "nicely") +
  # Semi-transparent edges keep heavily overlapping links readable.
  geom_edge_link(aes(width = Weight), alpha = 0.2) +
  # Limit how thin and thick the edges can be drawn.
  scale_edge_width(range = c(0.1, 5)) +
  # Department is mapped from the data; the fixed node size is set outside
  # aes() so that it is applied literally and does not produce a legend.
  geom_node_point(aes(colour = Department), size = 3)

g + theme_graph()

Working with facet_edges()

# Small multiples of the network, one panel per weekday ("nicely" layout).
# set_graph_style() switches the session's default theme to the graph theme.
set_graph_style()

# NOTE(review): size and alpha are given inside aes(), so 0.5 is treated as a
# data value (scaled, with a legend entry) rather than as a literal point size
# or transparency. Move them outside aes() if fixed values are intended.
g <- ggraph(GAStech_graph, layout = "nicely") +
  geom_edge_link(aes(width = Weight), alpha = 0.2) +
  scale_edge_width(range = c(0.1, 5)) +
  geom_node_point(aes(colour = Department, size = 0.5, alpha = 0.5))

# Split the edges by Weekday, frame each panel, and put the legend underneath.
g +
  facet_edges(~Weekday) +
  th_foreground(foreground = "grey80", border = TRUE) +
  theme(legend.position = "bottom")

Working with facet_nodes()

# Small multiples of the network, one panel per department ("nicely" layout).
# set_graph_style() switches the session's default theme to the graph theme.
set_graph_style()

# NOTE(review): size and alpha are given inside aes(), so 0.5 is treated as a
# data value (scaled, with a legend entry) rather than as a literal point size
# or transparency. Move them outside aes() if fixed values are intended.
g <- ggraph(GAStech_graph, layout = "nicely") +
  geom_edge_link(aes(width = Weight), alpha = 0.2) +
  scale_edge_width(range = c(0.1, 5)) +
  geom_node_point(aes(colour = Department, size = 0.5, alpha = 0.5))

# Split the nodes by Department, frame each panel, and put the legend underneath.
g +
  facet_nodes(~Department) +
  th_foreground(foreground = "grey80", border = TRUE) +
  theme(legend.position = "bottom")

Computing centrality indices

# Store each node's betweenness centrality as a node attribute, then plot the
# graph with node size driven by that attribute.
# With ggraph 2.0 the explicit mutate() step is optional: the centrality
# function can be called directly inside aes(size = ...).
g <- ggraph(
  mutate(GAStech_graph, betweenness_centrality = centrality_betweenness()),
  layout = "fr"
) +
  geom_edge_link(aes(width = Weight), alpha = 0.2) +
  scale_edge_width(range = c(0.1, 5)) +
  # Colour by department; larger points mark nodes with higher betweenness.
  geom_node_point(aes(colour = Department, size = betweenness_centrality))
g

Computing centrality indices directly inside the ggraph call

# Same plot as the mutate() version, but the betweenness centrality is
# computed on the fly inside aes() instead of being stored on the graph first.
g <- ggraph(GAStech_graph, layout = "fr") +
  geom_edge_link(aes(width = Weight), alpha = 0.2) +
  scale_edge_width(range = c(0.1, 5)) +
  # Colour by department; point size follows betweenness centrality.
  geom_node_point(aes(colour = Department, size = centrality_betweenness()))
g

Visualizing community

# Detect communities with the edge-betweenness algorithm and colour the nodes
# by the community they were assigned to.
# NOTE(review): igraph documents edge-betweenness weights as distances rather
# than connection strengths - confirm that email counts are the intended
# weighting here.
g <- ggraph(
  mutate(
    GAStech_graph,
    # The group id is converted to a factor so that it is coloured as a
    # category rather than on a continuous scale.
    community = as.factor(group_edge_betweenness(weights = Weight,
                                                 directed = TRUE))
  ),
  layout = "fr"
) +
  geom_edge_link(aes(width = Weight), alpha = 0.2) +
  scale_edge_width(range = c(0.1, 5)) +
  # One colour per detected community.
  geom_node_point(aes(colour = community))
g

We can also use other detection algorithms to visualize community (besides group_edge_betweenness), including group_leading_eigen, group_fast_greedy, group_louvain, group_walktrap, group_label_prop, group_infomap, group_spinglass, and group_optimal.

Data prep and plot interactive network graph using visNetwork

# Rebuild the edge table in the from / to / weight shape passed to visNetwork.
# This overwrites the weekday-level GAStech_edges_aggregated created earlier.
GAStech_edges_aggregated <- GAStech_edges %>%
  # Only work-related emails are counted.
  filter(MainSubject == "Work related") %>%
  # Look up the sender's node id via its label.
  left_join(GAStech_nodes, by = c("sourceLabel" = "label")) %>%
  rename(from = id) %>%
  # Look up the recipient's node id the same way.
  left_join(GAStech_nodes, by = c("targetLabel" = "label")) %>%
  rename(to = id) %>%
  # Number of emails per sender / recipient pair.
  group_by(from, to) %>%
  summarise(weight = n()) %>%
  # Drop self-addressed emails and pairs with a single email.
  filter(from != to, weight > 1) %>%
  ungroup()

# Rename Department to `group` - presumably the column visNetwork uses to
# colour nodes and build the legend; confirm against the visNetwork docs.
# This overwrites GAStech_nodes, so after this point the table no longer has a
# Department column.
GAStech_nodes <- rename(GAStech_nodes, group = Department)

# Interactive network widget: nodes come from GAStech_nodes, edges from the
# from / to / weight table.
visNetwork(GAStech_nodes,
           GAStech_edges_aggregated, width = "100%") %>%
  # Node positions are computed by igraph's Fruchterman-Reingold layout, so
  # the seed has to be given here for the layout to be reproducible;
  # visLayout(randomSeed = ) alone does not control the igraph computation.
  visIgraphLayout(layout = "layout_with_fr", randomSeed = 123) %>%
  visLegend() %>%
  visLayout(randomSeed = 123) %>%
  # Highlight a node's neighbours on click and add a node-selection dropdown.
  visOptions(highlightNearest = TRUE,
             nodesIdSelection = TRUE)