Text Visualization

Published

July 11, 2021

Installing and launching R packages

# Install any packages that are missing, then load them all
packages = c("tidytext","widyr","wordcloud",
             "DT","ggwordcloud","textplot",
             "lubridate","hms","tidyverse",
             "tidygraph","ggraph","igraph")
for (p in packages){
  if(!require(p, character.only = TRUE)){
    install.packages(p)
  }
  library(p, character.only = TRUE)
}

Import Multiple Text Files from Multiple Folders

Step 1: Specifying the path to the data folder

news20 <- "data/20news/"

Step 2: Define a function to read all files from a folder into a data frame

read_folder <- function(infolder){
  # list every file in the folder
  tibble(file = dir(infolder,
                    full.names=TRUE)) %>%
    # read each file's lines into a list-column
    mutate(text=map(file,
                    read_lines)) %>%
    # keep the file name (as id) and the text
    transmute(id=basename(file),
              text) %>%
    # one row per line of text
    unnest(text)
}

Step 3: Reading in all the messages from the 20news folder

raw_text <- tibble(folder=dir(news20,
                              full.names = TRUE)) %>%
  # read every file in each newsgroup folder
  mutate(folder_out = map(folder,
                          read_folder)) %>%
  unnest(cols = c(folder_out)) %>%
  transmute(newsgroup=basename(folder),
            id, text)
# cache the result so the raw files need not be re-read
write_rds(raw_text, "data/rds/news20.rds")
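
On later runs, the cached copy can be loaded directly with readr's read_rds() instead of re-reading the raw files:

raw_text <- read_rds("data/rds/news20.rds")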

Initial EDA

raw_text %>%
  group_by(newsgroup) %>%
  summarise(messages = n_distinct(id))%>%
  ggplot(aes(messages,newsgroup))+
  geom_col(fill="lightblue")+
  labs(y=NULL)
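
It is also worth glancing at a few raw lines; the message headers visible here motivate the cleaning steps below:

raw_text %>%
  slice_head(n = 10)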

Cleaning Text Data

Removing headers and automated email signatures

cleaned_text <- raw_text %>%
  group_by(newsgroup,id) %>%
  # keep only lines after the first blank line (the end of the header)
  # and drop everything from the signature marker ("--") onwards
  filter(cumsum(text == "") > 0,
         cumsum(str_detect(
           text,"^--")) == 0) %>%
  ungroup()
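
As a quick sanity check, comparing line counts before and after shows how much text the filter stripped out:

bind_rows(
  raw_text %>% mutate(stage = "raw"),
  cleaned_text %>% mutate(stage = "cleaned")
) %>%
  count(stage)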

Removing lines with nested text representing quotes from other users.

cleaned_text <- cleaned_text%>%
  filter(str_detect(text,"^[^>]+[A-Za-z\\d]")
         |text == "",
         !str_detect(text,
                     "writes(:|\\.\\.\\.)$"),
         !str_detect(text,
                     "^In article <"))

Text Data Processing

usenet_words <- cleaned_text %>%
  unnest_tokens(word, text) %>%
  # keep tokens ending in a letter or apostrophe (drops numbers)
  # and remove common stop words
  filter(str_detect(word, "[a-z']$"),
         !word %in% stop_words$word)

Checking the frequency of words

usenet_words %>%
  count(word,sort=TRUE)
# A tibble: 5,542 x 2
   word           n
   <chr>      <int>
 1 people        57
 2 time          50
 3 jesus         47
 4 god           44
 5 message       40
 6 br            27
 7 bible         23
 8 drive         23
 9 homosexual    23
10 read          22
# … with 5,532 more rows

Counting words by newsgroup

words_by_newsgroup <- usenet_words %>%
  count(newsgroup, word, sort = TRUE) %>%
  ungroup()

Visualising Words in newsgroups

Using the wordcloud package

wordcloud(words_by_newsgroup$word,
          words_by_newsgroup$n,
          max.words = 300)
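
The ggwordcloud package, loaded earlier but not yet used, offers a ggplot2-native alternative. A minimal sketch, keeping the 100 most frequent rows (the cut-off is an arbitrary choice):

set.seed(1234)
words_by_newsgroup %>%
  slice_max(n, n = 100, with_ties = FALSE) %>%
  ggplot(aes(label = word, size = n)) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 12) +
  theme_minimal()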

Computing tf-idf within newsgroups

tf_idf <- words_by_newsgroup %>%
  bind_tf_idf(word, newsgroup, n) %>%
  arrange(desc(tf_idf))

Visualising tf-idf as an interactive table

DT::datatable(tf_idf,filter="top")%>%
  formatRound(columns = c('tf','idf','tf_idf'),
              digits=3)%>%
  formatStyle(0,target='row',lineHeight='25%')

Visualising tf-idf within newsgroups

tf_idf%>%
  filter(str_detect(newsgroup,"^sci\\."))%>%
  group_by(newsgroup)%>%
  slice_max(tf_idf,
            n=12)%>%
  ungroup()%>%
  mutate(word=reorder(word,tf_idf))%>%
  ggplot(aes(tf_idf,
             word,
             fill=newsgroup))+
  geom_col(show.legend=FALSE)+
  facet_wrap(~newsgroup,
             scales="free")+
  labs(x="tf-idf",
       y=NULL)
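
One caveat: reorder() ranks words across all facets at once, so bars within a single facet may appear out of order. tidytext's reorder_within() paired with scale_y_reordered() gives a per-facet ordering; a sketch of the same plot:

tf_idf%>%
  filter(str_detect(newsgroup,"^sci\\."))%>%
  group_by(newsgroup)%>%
  slice_max(tf_idf,
            n=12)%>%
  ungroup()%>%
  mutate(word=reorder_within(word,tf_idf,newsgroup))%>%
  ggplot(aes(tf_idf,
             word,
             fill=newsgroup))+
  geom_col(show.legend=FALSE)+
  scale_y_reordered()+
  facet_wrap(~newsgroup,
             scales="free")+
  labs(x="tf-idf",
       y=NULL)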

Counting and correlating pairs of words with the widyr package

newsgroup_cors <- words_by_newsgroup%>%
  pairwise_cor(newsgroup,
               word,
               n,
               sort=TRUE)
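
Because the result is sorted by correlation, a quick slice reveals the most similar newsgroup pairs:

newsgroup_cors %>%
  slice_head(n = 10)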

Visualising correlation as a network

set.seed(2017)

newsgroup_cors%>%
  filter(correlation > .025)%>%
  graph_from_data_frame()%>%
  ggraph(layout="fr") +
  geom_edge_link(aes(alpha=correlation,
                     width=correlation)) +
  geom_node_point(size=6,
                  color="lightblue")+
  geom_node_text(aes(label=name),
                 color = "red",
                 repel = TRUE)+
  theme_void()
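
The same pairwise_cor() verb also works at the word level, correlating words by the messages they co-occur in. A minimal sketch, where the minimum frequency of 20 is an assumed threshold:

word_cors <- usenet_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  ungroup() %>%
  pairwise_cor(word, id, sort = TRUE)
word_cors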

Bigrams

bigrams <- cleaned_text%>%
  unnest_tokens(bigram,
                text,
                token = "ngrams",
                n=2)

Counting bigrams

bigrams_count <- bigrams %>%
  # unnest_tokens() yields NA for lines too short to form a bigram
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)

Cleaning bigrams

bigrams_separated <- bigrams %>%
  filter(!is.na(bigram)) %>%
  separate(bigram, c("word1", "word2"),
           sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

Counting the filtered bigrams

bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
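
If the filtered bigrams are needed as single strings again, for example for another word cloud, tidyr's unite() recombines the two columns:

bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")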

Creating a network graph from bigrams

bigram_graph <- bigram_counts %>%
  filter(n > 3) %>%
  graph_from_data_frame()
bigram_graph
IGRAPH 2918050 DN-- 40 24 -- 
+ attr: name (v/c), n (e/n)
+ edges from 2918050 (vertex names):
 [1] 1          ->2           1          ->3          
 [3] static     ->void        time       ->pad        
 [5] 1          ->4           infield    ->fly        
 [7] mat        ->28          vv         ->vv         
 [9] 1          ->5           cock       ->crow       
[11] noticeshell->widget      27         ->1993       
[13] 3          ->4           child      ->molestation
[15] cock       ->crew        gun        ->violence   
+ ... omitted several edges

Visualising the network graph

set.seed(1234)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name),
                 vjust = 1,
                 hjust = 1)

Improved version

set.seed(1234)
a <- grid::arrow(type = "closed",
                 length = unit(.15,
                               "inches"))
ggraph(bigram_graph,
       layout = "fr") +
  geom_edge_link(aes(edge_alpha = n),
                 show.legend = FALSE,
                 arrow = a,
                 end_cap = circle(.07,
                                  'inches')) +
  geom_node_point(color = "lightblue",
                  size = 5) +
  geom_node_text(aes(label = name),
                 vjust = 1,
                 hjust = 1) +
  theme_void()