# Today I learned in class: how to use different R packages to visualize text
# with methods such as tag clouds and word trees.
# Packages required for this analysis.
# Fixes vs. original: 'tidyverse' was listed twice; `=` used for assignment;
# `T` used for TRUE; `require()` used for the installed check (it attaches the
# package as a side effect and merely returns FALSE on failure) — replaced
# with requireNamespace(), then a single library() call to attach.
packages <- c(
  # Data wrangling and interactive tables
  "tidyverse", "DT",
  # Handling time variables
  "lubridate", "hms",
  # Text analytics
  "tidytext", "widyr", "wordcloud",
  "ggwordcloud", "textplot", "igraph", "ggraph"
)
# Install any missing package, then attach it.
for (p in packages) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
# Import the 2001 corpus extract as a tibble.
data_2001 <- read_csv(file = "data/csv-2001-2131.csv")

# Root folder that holds one sub-folder per newsgroup.
news20 <- "data/20news/"
# Read every file in `infolder` into a tibble with one row per text line,
# keyed by the file's base name (`id`).
read_folder <- function(infolder) {
  paths <- dir(infolder, full.names = TRUE)
  tibble(file = paths) %>%
    mutate(text = map(file, read_lines)) %>%
    transmute(id = basename(file), text) %>%
    unnest(text)
}
# Read in all the messages from the 20news folder: one row per line of text,
# tagged with its newsgroup and message id.
raw_text <- tibble(folder = dir(news20, full.names = TRUE)) %>%
  # map() comes from purrr (attached via tidyverse)
  mutate(folder_out = map(folder, read_folder)) %>%
  unnest(cols = c(folder_out)) %>%
  transmute(newsgroup = basename(folder), id, text)

# Cache the assembled corpus so later sessions can skip the raw read.
write_rds(raw_text, "data/rds/news20.rds")
# Bar chart: number of distinct messages per newsgroup.
raw_text %>%
  group_by(newsgroup) %>%
  summarize(messages = n_distinct(id)) %>%
  ggplot(aes(x = messages, y = newsgroup)) +
  geom_col(fill = "lightblue") +
  labs(y = NULL)

# Strip message headers and signatures, then drop quoted and boilerplate
# lines, in a single pipeline.
cleaned_text <- raw_text %>%
  group_by(newsgroup, id) %>%
  # Keep only lines after the first blank line (past the header block) and
  # stop once a signature delimiter ("--") is reached.
  filter(
    cumsum(text == "") > 0,
    cumsum(str_detect(text, "^--")) == 0
  ) %>%
  ungroup() %>%
  # Keep blank lines and unquoted lines with real content; drop "writes:"
  # attribution lines and "In article <" citation lines.
  filter(
    str_detect(text, "^[^>]+[A-Za-z\\d]") | text == "",
    !str_detect(text, "writes(:|\\.\\.\\.)$"),
    !str_detect(text, "^In article <")
  )
# Split each line into word tokens with unnest_tokens(), keep tokens ending
# in a letter or apostrophe, and remove stop-words (e.g. "the", "of", "to").
usenet_words <- cleaned_text %>%
  unnest_tokens(word, text) %>%
  filter(
    str_detect(word, "[a-z']$"),
    !word %in% stop_words$word
  )

# Most frequent words across the whole corpus.
usenet_words %>%
  count(word, sort = TRUE)
# Example console output (commented out so the script can be sourced):
# # A tibble: 5,542 x 2
#    word           n
#    <chr>      <int>
#  1 people        57
#  2 time          50
#  3 jesus         47
#  4 god           44
#  5 message       40
#  6 br            27
#  7 bible         23
#  8 drive         23
#  9 homosexual    23
# 10 read          22
# # ... with 5,532 more rows
# Aggregate word counts within each newsgroup.
words_by_newsgroup <- usenet_words %>%
  count(newsgroup, word, sort = TRUE) %>%
  ungroup()

# Static word cloud of the most frequent words (capped at 300 words).
wordcloud(
  words = words_by_newsgroup$word,
  freq = words_by_newsgroup$n,
  max.words = 300
)

# ggplot-based word cloud, one facet per newsgroup.
# Seed fixed so the word placement is reproducible.
set.seed(1234)
words_by_newsgroup %>%
  filter(n > 0) %>%
  ggplot(aes(label = word, size = n)) +
  geom_text_wordcloud() +
  theme_minimal() +
  facet_wrap(~newsgroup)

# Compute tf-idf of each word within its newsgroup, highest first.
tf_idf <- words_by_newsgroup %>%
  bind_tf_idf(word, newsgroup, n) %>%
  arrange(desc(tf_idf))

# Interactive table with per-column filters; round the metric columns to
# 3 decimals and tighten the row height for a compact display.
DT::datatable(tf_idf, filter = "top") %>%
  formatRound(columns = c("tf", "idf", "tf_idf"), digits = 3) %>%
  formatStyle(0, target = "row", lineHeight = "25%")
# Facet bar charts of the top-12 tf-idf words in each science newsgroup.
tf_idf %>%
  filter(str_detect(newsgroup, "^sci\\.")) %>%
  group_by(newsgroup) %>%
  slice_max(tf_idf, n = 12) %>%
  ungroup() %>%
  # Order bars by tf-idf within the plot
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(x = tf_idf, y = word, fill = newsgroup)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ newsgroup, scales = "free") +
  labs(x = "tf-idf", y = NULL)

# Pairwise correlation of word-count profiles between newsgroups.
newsgroup_cors <- words_by_newsgroup %>%
  pairwise_cor(newsgroup, word, n, sort = TRUE)

# Network graph of newsgroups whose vocabularies correlate; seed fixed so
# the force-directed layout is reproducible.
set.seed(2017)
newsgroup_cors %>%
  filter(correlation > 0.025) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(alpha = correlation, width = correlation)) +
  geom_node_point(size = 6, color = "lightblue") +
  geom_node_text(
    aes(label = name),
    color = "red",
    repel = TRUE  # keep labels from overlapping
  ) +
  theme_void()

# Tokenize the cleaned text into bigrams (pairs of consecutive words).
bigrams <- cleaned_text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

# unnest_tokens() emits a true NA bigram for lines with fewer than two words.
# BUG FIX: the original `filter(bigrams != 'NA')` referenced the whole data
# frame (there is no `bigrams` column) and compared against the literal
# string "NA" instead of testing for missing values, so NA bigrams leaked
# through (visible as "NA ->NA" edges in the graph output). Test the
# `bigram` column with is.na() instead.
bigrams_separated <- bigrams %>%
  filter(!is.na(bigram)) %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Drop bigrams in which either word is a stop-word.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# Count each remaining word pair, most frequent first.
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
# Build a directed igraph network from bigrams occurring more than 3 times,
# using graph_from_data_frame() of igraph.
bigram_graph <- bigram_counts %>%
  filter(n > 3) %>%
  graph_from_data_frame()

# Print a summary of the graph (vertex/edge counts and sample edges).
bigram_graph
# Example console output (commented out so the script can be sourced):
# IGRAPH c2de036 DN-- 41 25 --
# + attr: name (v/c), n (e/n)
# + edges from c2de036 (vertex names):
#  [1] NA         ->NA          1          ->2
#  [3] 1          ->3           static     ->void
#  [5] time       ->pad         1          ->4
#  [7] infield    ->fly         mat        ->28
#  [9] vv         ->vv          1          ->5
# [11] cock       ->crow        noticeshell->widget
# [13] 27         ->1993        3          ->4
# [15] child      ->molestation cock       ->crew
# + ... omitted several edges
# Basic force-directed layout of the bigram network; seed fixed for a
# reproducible layout.
set.seed(1234)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

# Directed version of the bigram network: closed arrowheads show the
# direction of each word relationship; edge transparency scales with count.
set.seed(1234)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n),
    show.legend = FALSE,
    arrow = a,
    # stop edges short of the node so the arrowhead stays visible
    end_cap = circle(0.07, "inches")
  ) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()
