TIL: Week 10 In-Class Exercise 2

Today I learnt in class: How to use different R packages to visualize text using different methods such as word clouds, tf-idf bar charts, and word network graphs.

Kelly Koh https://www.linkedin.com/in/kellykkw/ (School of Computing and Information Systems, Singapore Management University)https://scis.smu.edu.sg/
07-11-2021

Install packages and loading data

Install the necessary packages if they are not already installed, then load them.

# Packages required for this analysis. Install any that are missing,
# then attach all of them.
# (Fixes: `<-` instead of `=` for assignment, TRUE instead of T,
# and the duplicated 'tidyverse' entry removed.)
packages <- c('tidyverse', 'DT',
              # Handling time variables
              'lubridate', 'hms',
              # Text analytics
              'tidytext', 'widyr', 'wordcloud',
              'ggwordcloud', 'textplot', 'igraph', 'ggraph')

for (p in packages) {
  # require() returns FALSE (rather than erroring) when a package is
  # missing, which lets us install it on the fly before loading.
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}

Data cleaning

Import data from csv and preview data

# Import one raw CSV export for preview
# (appears unused in the rest of this document — kept for reference)
data_2001 <- read_csv("data/csv-2001-2131.csv")

# Path to the parent folder holding one sub-folder per newsgroup
news20 <- "data/20news/"

# Read every file in `infolder` into a tibble with one row per line of
# text, keyed by the file's base name (id = file name, text = line).
read_folder <- function(infolder) {
  paths <- dir(infolder, full.names = TRUE)
  tibble(file = paths) %>%
    mutate(text = map(file, read_lines)) %>%
    transmute(id = basename(file), text) %>%
    unnest(text)
}

# Walk every newsgroup sub-folder, read all of its messages, and stack
# them into a single tibble of (newsgroup, id, text).
raw_text <- tibble(folder = dir(news20, full.names = TRUE)) %>%
  # map() comes from purrr (loaded as part of the tidyverse)
  mutate(folder_out = map(folder, read_folder)) %>%
  unnest(cols = c(folder_out)) %>%
  transmute(newsgroup = basename(folder), id, text)

# Cache the combined corpus so later sessions can skip re-parsing
write_rds(raw_text, "data/rds/news20.rds")

Checking if the data loaded is correct

# Sanity check: bar chart of the number of distinct messages loaded
# for each newsgroup
raw_text %>%
  distinct(newsgroup, id) %>%
  count(newsgroup, name = "messages") %>%
  ggplot(aes(messages, newsgroup)) +
  geom_col(fill = "lightblue") +
  labs(y = NULL)

Cleaning text data

# Strip message headers and signatures before tokenisation. Within each
# message:
#   - cumsum(text == "") > 0 keeps only lines AFTER the first blank
#     line (i.e. drops the header block);
#   - cumsum(str_detect(text, "^--")) == 0 drops everything from the
#     first signature marker ("--") onward.
# Both cumulative sums are evaluated over the full message, so the two
# conditions must stay inside a single filter() call.
cleaned_text <- raw_text %>%
                group_by(newsgroup, id) %>%
                # Refer to regular expressions to indicate patterns
                filter(cumsum(text == "") > 0, 
                       cumsum(str_detect(text, "^--")) == 0) %>%
                ungroup()

# Remove quoted/cited material. Each condition is row-wise independent,
# so the filters can be chained one per rule.
cleaned_text <- cleaned_text %>%
  # Keep lines with real content (not just quoted ">" text), or blanks
  filter(str_detect(text, "^[^>]+[A-Za-z\\d]") | text == "") %>%
  # Drop attribution lines such as "... writes:" / "... writes..."
  filter(!str_detect(text, "writes(:|\\.\\.\\.)$")) %>%
  # Drop citation headers of the form "In article <...>"
  filter(!str_detect(text, "^In article <"))

Text Data Processing

Tokenization - extract words out of the string of text

# Tokenise each line into one row per word with unnest_tokens(), then
# drop tokens that do not end in a letter/apostrophe and remove common
# English stop-words ("the", "of", "to", ...) that add no signal.
usenet_words <- cleaned_text %>%
  unnest_tokens(word, text) %>%
  filter(str_detect(word, "[a-z']$")) %>%
  filter(!word %in% stop_words$word)

# Overall word frequencies, most common first
count(usenet_words, word, sort = TRUE)
# A tibble: 5,542 x 2
   word           n
   <chr>      <int>
 1 people        57
 2 time          50
 3 jesus         47
 4 god           44
 5 message       40
 6 br            27
 7 bible         23
 8 drive         23
 9 homosexual    23
10 read          22
# ... with 5,532 more rows
# Word frequencies per newsgroup, most common first.
# count() already returns an ungrouped tibble, so the original trailing
# ungroup() was redundant and has been removed.
words_by_newsgroup <- usenet_words %>%
                       count(newsgroup, word, sort = TRUE)

Visualizing text data

Visualize the data using word cloud

# Classic tag cloud of the most frequent words (capped at 300 words)
with(words_by_newsgroup,
     wordcloud(word, n, max.words = 300))

Visualize the data using gg word cloud

# ggplot-based word cloud, one facet per newsgroup.
# Seed fixed so the random word placement is reproducible.
set.seed(1234)

ggplot(filter(words_by_newsgroup, n > 0),
       aes(label = word, size = n)) +
  geom_text_wordcloud() +
  theme_minimal() +
  facet_wrap(~newsgroup)

TF-IDF - term frequency-inverse document frequency, a statistic that reflects how important a word is to a document within a collection (corpus) of documents

# tf-idf of each word within its newsgroup, most distinctive first
tf_idf <- bind_tf_idf(words_by_newsgroup, word, newsgroup, n) %>%
  arrange(desc(tf_idf))

# Interactive, filterable table of the tf-idf scores, with the numeric
# columns rounded and compact row heights
tf_idf %>%
  DT::datatable(filter = 'top') %>%
  formatRound(columns = c('tf', 'idf', 'tf_idf'), digits = 3) %>%
  formatStyle(0, target = 'row', lineHeight = '25%')

Visualizing tf_idf within newsgroups

# Faceted bar charts of the 12 most distinctive (highest tf-idf) words
# in each science-related ("sci.*") newsgroup
tf_idf %>%
  filter(str_detect(newsgroup, "^sci\\.")) %>%
  group_by(newsgroup) %>%
  slice_max(tf_idf, n = 12) %>%
  ungroup() %>%
  # Order the bars by tf-idf within the plot
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(tf_idf, word, fill = newsgroup)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ newsgroup, scales = "free") +
  labs(x = "tf-idf", y = NULL)

Counting and correlating pairs of words with widyr package

# Pairwise correlation between newsgroups, based on how similarly they
# use words (widyr::pairwise_cor), strongest pairs first
newsgroup_cors <- pairwise_cor(words_by_newsgroup,
                               newsgroup, word, n,
                               sort = TRUE)

Visualizing correlation as a network

# Seed fixed so the force-directed ("fr") layout is reproducible
set.seed(2017)

# Network graph of newsgroups: nodes are newsgroups, edges connect
# pairs with word-usage correlation above 0.025; stronger correlations
# draw darker, thicker edges.
newsgroup_cors %>%
  filter(correlation > 0.025) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(alpha = correlation, width = correlation)) +
  geom_node_point(size = 6, color = "lightblue") +
  geom_node_text(aes(label = name),
                 color = "red",
                 repel = TRUE) +  # keeps the labels from overlapping
  theme_void()

Extract all terms in two-word form - bigrams

# Tokenise the cleaned text into overlapping two-word sequences
# (bigrams) using unnest_tokens() with the "ngrams" tokenizer
bigrams <- unnest_tokens(cleaned_text,
                         bigram, text,
                         token = "ngrams", n = 2)

# Split each bigram into its two component words.
# BUG FIX: the original wrote filter(bigrams != 'NA'), but there is no
# `bigrams` column — that name resolved to the data frame itself, not
# to the missing-bigram rows it meant to drop. unnest_tokens() emits
# genuine NA bigrams for too-short lines, so the intended filter is
# !is.na(bigram).
bigrams_separated <- bigrams %>%
                     filter(!is.na(bigram)) %>%
                     separate(bigram, 
                              c("word1","word2"),
                              sep = " ")

# Drop bigrams in which either word is a stop-word
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word)

# Frequency of each remaining word pair, most common first
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)

Create network graph from bigram data frame

# Keep only word pairs seen more than three times and build a directed
# igraph object (word1 -> word2) with graph_from_data_frame()
bigram_graph <- bigram_counts %>%
  filter(n > 3) %>%
  graph_from_data_frame()

# Print a summary of the resulting graph
bigram_graph
IGRAPH c2de036 DN-- 41 25 -- 
+ attr: name (v/c), n (e/n)
+ edges from c2de036 (vertex names):
 [1] NA         ->NA          1          ->2          
 [3] 1          ->3           static     ->void       
 [5] time       ->pad         1          ->4          
 [7] infield    ->fly         mat        ->28         
 [9] vv         ->vv          1          ->5          
[11] cock       ->crow        noticeshell->widget     
[13] 27         ->1993        3          ->4          
[15] child      ->molestation cock       ->crew       
+ ... omitted several edges

Visualizing a network of bigrams with ggraph

set.seed(1234)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name),
                 vjust = 1,
                 hjust = 1)

Improved version of the network graph with arrows

# Seed fixed so the layout matches the plot above
set.seed(1234)

# Closed arrowhead used to show the direction of each word pair
a <- grid::arrow(type = "closed",
                 length = unit(.15, "inches"))

# Refined bigram network: edge transparency encodes pair frequency,
# and arrows (capped just short of each node) show word order
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n),
                 show.legend = FALSE,
                 arrow = a,
                 end_cap = circle(0.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()