Today I learnt in class: how to use graph analytics to represent relationships and find connected components.
# Packages used throughout this exercise.
packages <- c(
  "tidyverse",
  # Handling time variables
  "lubridate", "clock",
  # Network graphing
  "network", "igraph", "tidygraph", "ggraph", "visNetwork"
)

# Install any package that is missing, then attach every package.
for (p in packages) {
  # requireNamespace() only checks availability, so each package is attached
  # exactly once, by the library() call below.
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
# Load the three CSV files from the local data/ folder.
cc_data <- read_csv(file = "data/cc_data.csv")
GAStech_edges <- read_csv(file = "data/GAStech_email_edge-v2.csv")
GAStech_nodes <- read_csv(file = "data/GAStech_email_node.csv")

# Inspect the columns and types of the edge table.
glimpse(GAStech_edges)
Rows: 9,063
Columns: 8
$ source <dbl> 43, 43, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, ~
$ target <dbl> 41, 40, 51, 52, 53, 45, 44, 46, 48, 49, 47, 54, ~
$ SentDate <chr> "6/1/2014", "6/1/2014", "6/1/2014", "6/1/2014", ~
$ SentTime <time> 08:39:00, 08:39:00, 08:58:00, 08:58:00, 08:58:0~
$ Subject <chr> "GT-SeismicProcessorPro Bug Report", "GT-Seismic~
$ MainSubject <chr> "Work related", "Work related", "Work related", ~
$ sourceLabel <chr> "Sven.Flecha", "Sven.Flecha", "Kanon.Herrero", "~
$ targetLabel <chr> "Isak.Baza", "Lucas.Alcazar", "Felix.Resumir", "~
# Inspect the columns and types of the node table.
glimpse(GAStech_nodes)
Rows: 54
Columns: 4
$ id <dbl> 1, 2, 3, 4, 5, 6, 7, 44, 45, 46, 8, 9, 10, 11, 12~
$ label <chr> "Mat.Bramar", "Anda.Ribera", "Rachel.Pantanal", "~
$ Department <chr> "Administration", "Administration", "Administrati~
$ Title <chr> "Assistant to CEO", "Assistant to CFO", "Assistan~
SentDate is read as a “Character” column rather than a date, so it needs to be converted to a proper date type.
# Convert SentDate from day/month/year text into a Date, then derive the
# unabbreviated weekday label from the parsed date.
GAStech_edges <- GAStech_edges %>%
  mutate(
    SentDate = lubridate::dmy(SentDate),
    Weekday = lubridate::wday(SentDate, label = TRUE, abbr = FALSE)
  )

# Confirm the new column types.
glimpse(GAStech_edges)
Rows: 9,063
Columns: 9
$ source <dbl> 43, 43, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, ~
$ target <dbl> 41, 40, 51, 52, 53, 45, 44, 46, 48, 49, 47, 54, ~
$ SentDate <date> 2014-01-06, 2014-01-06, 2014-01-06, 2014-01-06,~
$ SentTime <time> 08:39:00, 08:39:00, 08:58:00, 08:58:00, 08:58:0~
$ Subject <chr> "GT-SeismicProcessorPro Bug Report", "GT-Seismic~
$ MainSubject <chr> "Work related", "Work related", "Work related", ~
$ sourceLabel <chr> "Sven.Flecha", "Sven.Flecha", "Kanon.Herrero", "~
$ targetLabel <chr> "Isak.Baza", "Lucas.Alcazar", "Felix.Resumir", "~
$ Weekday <ord> Monday, Monday, Monday, Monday, Monday, Monday, ~
# Count work-related emails per sender/recipient pair and weekday.
GAStech_edges_aggregated <- GAStech_edges %>%
  # Drop non-work mail and self-addressed messages before counting
  filter(MainSubject == "Work related", source != target) %>%
  group_by(source, target, Weekday) %>%
  # .groups = "drop" returns an ungrouped tibble explicitly, instead of relying
  # on summarise()'s default regrouping followed by a separate ungroup()
  summarise(Weight = n(), .groups = "drop") %>%
  # Keep only pairs that exchanged more than one email on a given weekday
  filter(Weight > 1)

glimpse(GAStech_edges_aggregated)
Rows: 1,456
Columns: 4
$ source <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
$ target <dbl> 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6~
$ Weekday <ord> Monday, Tuesday, Wednesday, Friday, Monday, Tuesday,~
$ Weight <int> 4, 3, 5, 8, 4, 3, 5, 8, 4, 3, 5, 8, 4, 3, 5, 8, 4, 3~
# Build the directed email graph.
# tbl_graph() reads integer edge endpoints as row positions in `nodes`, not as
# values of the id column, and GAStech_nodes is not ordered by id. Translate
# the ids into row positions first so each edge attaches to the right employee.
GAStech_graph <- tbl_graph(
  nodes = GAStech_nodes,
  edges = GAStech_edges_aggregated %>%
    mutate(
      from = match(source, GAStech_nodes$id),
      to = match(target, GAStech_nodes$id)
    ) %>%
    # The raw id columns are no longer needed once from/to exist
    select(-source, -target),
  directed = TRUE
)

# Print a summary of the graph's node and edge tables.
GAStech_graph
# A tbl_graph: 54 nodes and 1456 edges
#
# A directed multigraph with 1 component
#
# Node Data: 54 x 4 (active)
id label Department Title
<dbl> <chr> <chr> <chr>
1 1 Mat.Bramar Administration Assistant to CEO
2 2 Anda.Ribera Administration Assistant to CFO
3 3 Rachel.Pantanal Administration Assistant to CIO
4 4 Linda.Lagos Administration Assistant to COO
5 5 Ruscella.Mies.H~ Administration Assistant to Engineering Grou~
6 6 Carla.Forluniau Administration Assistant to IT Group Manager
# ... with 48 more rows
#
# Edge Data: 1,456 x 4
from to Weekday Weight
<int> <int> <ord> <int>
1 1 2 Monday 4
2 1 2 Tuesday 3
3 1 2 Wednesday 5
# ... with 1,453 more rows
# Basic network plot using the Fruchterman-Reingold ("fr") layout.
# Constant colours and sizes are set outside aes(): inside aes() they would be
# treated as data, mapped through the default scales and shown in a legend
# instead of being drawn in the requested grey.
g <- ggraph(GAStech_graph, layout = "fr") +
  geom_edge_link(colour = "grey80") +
  geom_node_point(colour = "grey40", size = 3)

g + theme_graph(background = "grey10", text_colour = "white")

# "nicely" layout with nodes coloured by Department.
# The fixed point size is set outside aes() so it is applied literally rather
# than mapped through a size scale (which would also add a legend entry).
g <- ggraph(GAStech_graph, layout = "nicely") +
  geom_edge_link() +
  geom_node_point(aes(colour = Department), size = 3)

g + theme_graph()

# "nicely" layout with edge width proportional to Weight and nodes coloured
# by Department.
g <- ggraph(GAStech_graph, layout = "nicely") +
  geom_edge_link(aes(width = Weight), alpha = 0.2) +
  scale_edge_width(range = c(0.1, 5)) +
  # Fixed point size belongs outside aes(); only Department is a data mapping
  geom_node_point(aes(colour = Department), size = 3)

g + theme_graph()

# "nicely" layout, one panel per Weekday (edges are split across the panels).
set_graph_style()

g <- ggraph(GAStech_graph, layout = "nicely") +
  geom_edge_link(aes(width = Weight), alpha = 0.2) +
  scale_edge_width(range = c(0.1, 5)) +
  # Fixed size and transparency are set outside aes() so they are applied
  # literally instead of being mapped through scales and added to the legend
  geom_node_point(aes(colour = Department), size = 0.5, alpha = 0.5)

g +
  facet_edges(~Weekday) +
  th_foreground(foreground = "grey80", border = TRUE) +
  theme(legend.position = "bottom")

# "nicely" layout, one panel per Department (nodes are split across the panels).
set_graph_style()

g <- ggraph(GAStech_graph, layout = "nicely") +
  geom_edge_link(aes(width = Weight), alpha = 0.2) +
  scale_edge_width(range = c(0.1, 5)) +
  # Fixed size and transparency are set outside aes() so they are applied
  # literally instead of being mapped through scales and added to the legend
  geom_node_point(aes(colour = Department), size = 0.5, alpha = 0.5)

g +
  facet_nodes(~Department) +
  th_foreground(foreground = "grey80", border = TRUE) +
  theme(legend.position = "bottom")

# Store each node's betweenness centrality as a node column, then use that
# column to size the nodes. (With ggraph 2.0 the explicit column is optional:
# centrality_betweenness() can be called directly inside aes().)
g <- ggraph(
  mutate(GAStech_graph, betweenness_centrality = centrality_betweenness()),
  layout = "fr"
) +
  geom_edge_link(aes(width = Weight), alpha = 0.2) +
  scale_edge_width(range = c(0.1, 5)) +
  # Node size grows with betweenness centrality
  geom_node_point(
    aes(colour = Department, size = betweenness_centrality)
  )
g

# Same plot as above, but the centrality is computed on the fly inside aes()
# instead of being stored as a node column first.
g <- ggraph(GAStech_graph, layout = "fr") +
  geom_edge_link(aes(width = Weight), alpha = 0.2) +
  scale_edge_width(range = c(0.1, 5)) +
  # Node size grows with betweenness centrality
  geom_node_point(
    aes(colour = Department, size = centrality_betweenness())
  )
g

# Detect communities with edge betweenness and colour the nodes by community.
g <- ggraph(
  mutate(
    GAStech_graph,
    # Stored as a factor so the community id is treated as categorical
    community = as.factor(
      group_edge_betweenness(weights = Weight, directed = TRUE)
    )
  ),
  layout = "fr"
) +
  geom_edge_link(aes(width = Weight), alpha = 0.2) +
  scale_edge_width(range = c(0.1, 5)) +
  # One colour per detected community
  geom_node_point(aes(colour = community))
g

Besides group_edge_betweenness, we can also use other community detection algorithms to identify and visualize communities, including group_leading_eigen, group_fast_greedy, group_louvain, group_walktrap, group_label_prop, group_infomap, group_spinglass, and group_optimal.
# Rebuild the aggregated edge list with from/to/weight columns for visNetwork.
# Node ids are looked up by employee label.
GAStech_edges_aggregated <- GAStech_edges %>%
  filter(MainSubject == "Work related") %>%
  # Join only the id column so no unused node attributes (and no .x/.y
  # suffixed duplicates) are carried through the pipeline
  left_join(
    select(GAStech_nodes, from = id, label),
    by = c("sourceLabel" = "label")
  ) %>%
  left_join(
    select(GAStech_nodes, to = id, label),
    by = c("targetLabel" = "label")
  ) %>%
  # Drop self-addressed emails before counting
  filter(from != to) %>%
  group_by(from, to) %>%
  # .groups = "drop" returns an ungrouped tibble explicitly, instead of relying
  # on summarise()'s default regrouping followed by a separate ungroup()
  summarise(weight = n(), .groups = "drop") %>%
  # Keep only pairs that exchanged more than one email
  filter(weight > 1)
# visNetwork reads node grouping (used for colouring and the legend) from a
# column named "group", so expose Department under that name.
GAStech_nodes <- GAStech_nodes %>%
  rename(group = Department)

# Interactive network with a Fruchterman-Reingold layout.
visNetwork(GAStech_nodes, GAStech_edges_aggregated, width = "100%") %>%
  # The node positions are computed by igraph inside visIgraphLayout(), so the
  # seed must be passed here for the layout to be reproducible between runs
  visIgraphLayout(layout = "layout_with_fr", randomSeed = 123) %>%
  visLegend() %>%
  visLayout(randomSeed = 123) %>%
  visOptions(highlightNearest = TRUE, nodesIdSelection = TRUE)