Text Visualization

Published

July 11, 2021

Installing and launching R packages

# Install any packages that are missing, then load them all
packages = c("tidytext","widyr","wordcloud",
             "DT","ggwordcloud","textplot",
             "lubridate","hms","tidyverse",
             "tidygraph","ggraph","igraph")
for (p in packages){
  if(!require(p, character.only = TRUE)){
    install.packages(p)
  }
  library(p, character.only = TRUE)
}

Import Multiple Text Files from Multiple Folders

Step 1: Specifying the path to the data folder

news20 <- "data/20news/"

Step 2: Define a function to read all files from a folder into a data frame

read_folder <- function(infolder){
  # list every file in the folder
  tibble(file = dir(infolder,
                    full.names=TRUE)) %>%
    # read each file's lines into a list-column
    mutate(text=map(file,
                    read_lines)) %>%
    # keep the file name (as id) and the text
    transmute(id=basename(file),
              text) %>%
    # one row per line of text
    unnest(text)
}

Step 3: Reading in all the messages from the 20news folder

raw_text <- tibble(folder=dir(news20,
                              full.names = TRUE)) %>%
  # read every file in each newsgroup folder
  mutate(folder_out = map(folder,
                          read_folder)) %>%
  unnest(cols = c(folder_out)) %>%
  transmute(newsgroup=basename(folder),
            id, text)
# cache the result so the raw files need not be re-read
write_rds(raw_text, "data/rds/news20.rds")
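
On later runs, the cached copy can be loaded directly with readr's read_rds() instead of re-reading the raw files:

raw_text <- read_rds("data/rds/news20.rds")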

Initial EDA

raw_text %>%
  group_by(newsgroup) %>%
  summarise(messages = n_distinct(id))%>%
  ggplot(aes(messages,newsgroup))+
  geom_col(fill="lightblue")+
  labs(y=NULL)
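
It is also worth glancing at a few raw lines; the message headers visible here motivate the cleaning steps below:

raw_text %>%
  slice_head(n = 10)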

Cleaning Text Data

Removing headers and automated email signatures

cleaned_text <- raw_text %>%
  group_by(newsgroup,id) %>%
  # keep only lines after the first blank line (the end of the header)
  # and drop everything from the signature marker ("--") onwards
  filter(cumsum(text == "") > 0,
         cumsum(str_detect(
           text,"^--")) == 0) %>%
  ungroup()
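
As a quick sanity check, comparing line counts before and after shows how much text the filter stripped out:

bind_rows(
  raw_text %>% mutate(stage = "raw"),
  cleaned_text %>% mutate(stage = "cleaned")
) %>%
  count(stage)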

Removing lines with nested text representing quotes from other users.

cleaned_text <- cleaned_text%>%
  filter(str_detect(text,"^[^>]+[A-Za-z\\d]")
         |text == "",
         !str_detect(text,
                     "writes(:|\\.\\.\\.)$"),
         !str_detect(text,
                     "^In article <"))

Text Data Processing

usenet_words <- cleaned_text %>%
  unnest_tokens(word, text) %>%
  # keep tokens ending in a letter or apostrophe (drops numbers)
  # and remove common stop words
  filter(str_detect(word, "[a-z']$"),
         !word %in% stop_words$word)

Checking the frequency of words

usenet_words %>%
  count(word,sort=TRUE)
# A tibble: 5,542 x 2
   word           n
   <chr>      <int>
 1 people        57
 2 time          50
 3 jesus         47
 4 god           44
 5 message       40
 6 br            27
 7 bible         23
 8 drive         23
 9 homosexual    23
10 read          22
# … with 5,532 more rows

Counting words by newsgroup

words_by_newsgroup <- usenet_words %>%
  count(newsgroup, word, sort = TRUE) %>%
  ungroup()

Visualising Words in newsgroups

Using the wordcloud package

wordcloud(words_by_newsgroup$word,
          words_by_newsgroup$n,
          max.words = 300)
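
The ggwordcloud package, loaded earlier but not yet used, offers a ggplot2-native alternative. A minimal sketch, keeping the 100 most frequent rows (the cut-off is an arbitrary choice):

set.seed(1234)
words_by_newsgroup %>%
  slice_max(n, n = 100, with_ties = FALSE) %>%
  ggplot(aes(label = word, size = n)) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 12) +
  theme_minimal()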

Computing tf-idf within newsgroups

tf_idf <- words_by_newsgroup %>%
  bind_tf_idf(word, newsgroup, n) %>%
  arrange(desc(tf_idf))

Visualising tf-idf as an interactive table

DT::datatable(tf_idf,filter="top")%>%
  formatRound(columns = c('tf','idf','tf_idf'),
              digits=3)%>%
  formatStyle(0,target='row',lineHeight='25%')

Visualising tf-idf within newsgroups

tf_idf%>%
  filter(str_detect(newsgroup,"^sci\\."))%>%
  group_by(newsgroup)%>%
  slice_max(tf_idf,
            n=12)%>%
  ungroup()%>%
  mutate(word=reorder(word,tf_idf))%>%
  ggplot(aes(tf_idf,
             word,
             fill=newsgroup))+
  geom_col(show.legend=FALSE)+
  facet_wrap(~newsgroup,
             scales="free")+
  labs(x="tf-idf",
       y=NULL)
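
One caveat: reorder() ranks words across all facets at once, so bars within a single facet may appear out of order. tidytext's reorder_within() paired with scale_y_reordered() gives a per-facet ordering; a sketch of the same plot:

tf_idf%>%
  filter(str_detect(newsgroup,"^sci\\."))%>%
  group_by(newsgroup)%>%
  slice_max(tf_idf,
            n=12)%>%
  ungroup()%>%
  mutate(word=reorder_within(word,tf_idf,newsgroup))%>%
  ggplot(aes(tf_idf,
             word,
             fill=newsgroup))+
  geom_col(show.legend=FALSE)+
  scale_y_reordered()+
  facet_wrap(~newsgroup,
             scales="free")+
  labs(x="tf-idf",
       y=NULL)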

Counting and correlating pairs of words with the widyr package

newsgroup_cors <- words_by_newsgroup%>%
  pairwise_cor(newsgroup,
               word,
               n,
               sort=TRUE)
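
Because the result is sorted by correlation, a quick slice reveals the most similar newsgroup pairs:

newsgroup_cors %>%
  slice_head(n = 10)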

Visualising correlation as a network

set.seed(2017)

newsgroup_cors%>%
  filter(correlation > .025)%>%
  graph_from_data_frame()%>%
  ggraph(layout="fr") +
  geom_edge_link(aes(alpha=correlation,
                     width=correlation)) +
  geom_node_point(size=6,
                  color="lightblue")+
  geom_node_text(aes(label=name),
                 color = "red",
                 repel = TRUE)+
  theme_void()
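
The same pairwise_cor() verb also works at the word level, correlating words by the messages they co-occur in. A minimal sketch, where the minimum frequency of 20 is an assumed threshold:

word_cors <- usenet_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  ungroup() %>%
  pairwise_cor(word, id, sort = TRUE)
word_cors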

Bigrams

bigrams <- cleaned_text%>%
  unnest_tokens(bigram,
                text,
                token = "ngrams",
                n=2)

Counting bigrams

bigrams_count <- bigrams %>%
  # unnest_tokens() yields NA for lines too short to form a bigram
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)

Cleaning bigrams

bigrams_separated <- bigrams %>%
  filter(!is.na(bigram)) %>%
  separate(bigram, c("word1", "word2"),
           sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

Counting the filtered bigrams

bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
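
If the filtered bigrams are needed as single strings again, for example for another word cloud, tidyr's unite() recombines the two columns:

bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")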

Creating a network graph from bigrams

bigram_graph <- bigram_counts %>%
  filter(n > 3) %>%
  graph_from_data_frame()
bigram_graph
IGRAPH 2918050 DN-- 40 24 -- 
+ attr: name (v/c), n (e/n)
+ edges from 2918050 (vertex names):
 [1] 1          ->2           1          ->3          
 [3] static     ->void        time       ->pad        
 [5] 1          ->4           infield    ->fly        
 [7] mat        ->28          vv         ->vv         
 [9] 1          ->5           cock       ->crow       
[11] noticeshell->widget      27         ->1993       
[13] 3          ->4           child      ->molestation
[15] cock       ->crew        gun        ->violence   
+ ... omitted several edges

Visualising the network graph

set.seed(1234)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name),
                 vjust = 1,
                 hjust = 1)

Improved version

set.seed(1234)
a <- grid::arrow(type = "closed",
                 length = unit(.15,
                               "inches"))
ggraph(bigram_graph,
       layout = "fr") +
  geom_edge_link(aes(edge_alpha = n),
                 show.legend = FALSE,
                 arrow = a,
                 end_cap = circle(.07,
                                  'inches')) +
  geom_node_point(color = "lightblue",
                  size = 5) +
  geom_node_text(aes(label = name),
                 vjust = 1,
                 hjust = 1) +
  theme_void()