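Text analysis of Usenet newsgroup messages with tidytext: word frequencies, tf-idf, correlations between newsgroups, and bigram networks.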
# Packages used throughout this post.
packages <- c(
  "tidytext", "widyr", "wordcloud",
  "DT", "ggwordcloud", "textplot",
  "lubridate", "hms", "tidyverse",
  "tidygraph", "ggraph", "igraph"
)

# Install any package that is not yet available, then attach it.
# requireNamespace() only checks availability (it does not attach), so each
# package is attached exactly once by library(), which errors loudly if the
# package still cannot be loaded after installation.
for (p in packages) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
Step 1: Specifying the folder that contains the newsgroup data
# Path (relative to the working directory) of the folder holding the 20news data.
news20 <- "data/20news/"
Step 2: Define a function to read all files from a folder into a data frame
Step 3: Reading in all the messages from the 20news folder
# Horizontal bar chart of the number of distinct message ids per newsgroup.
# `raw_text` is expected to provide `newsgroup` and `id` columns.
raw_text %>%
  group_by(newsgroup) %>%
  summarise(messages = n_distinct(id)) %>%
  ggplot(mapping = aes(x = messages, y = newsgroup)) +
  geom_col(fill = "lightblue") +
  ylab(NULL)

# Second cleaning pass over the message lines in `cleaned_text`.
cleaned_text <- cleaned_text %>%
  # Keep empty lines, and lines that start with one or more non-">"
  # characters followed by a letter or digit.
  filter(text == "" | str_detect(text, "^[^>]+[A-Za-z\\d]")) %>%
  # Drop lines ending in "writes:" or "writes...".
  filter(str_detect(text, "writes(:|\\.\\.\\.)$", negate = TRUE)) %>%
  # Drop lines beginning with "In article <".
  filter(str_detect(text, "^In article <", negate = TRUE))
# Tokenise the cleaned text into one word per row, keep only tokens that end
# in a lowercase letter or an apostrophe, and remove stop words.
usenet_words <- cleaned_text %>%
  unnest_tokens(output = word, input = text) %>%
  filter(str_detect(word, "[a-z']$")) %>%
  anti_join(stop_words, by = "word")

# Overall word frequencies, most frequent first.
count(usenet_words, word, sort = TRUE)
# A tibble: 5,542 x 2
word n
<chr> <int>
1 people 57
2 time 50
3 jesus 47
4 god 44
5 message 40
6 br 27
7 bible 23
8 drive 23
9 homosexual 23
10 read 22
# … with 5,532 more rows
# Count how often each word occurs within each newsgroup.
words_by_newsgroup <- usenet_words %>%
  count(newsgroup, word, sort = TRUE) %>%
  ungroup()

# Word cloud of at most 300 words.
# `words_by_newsgroup` has one row per (newsgroup, word) pair, so passing it
# to wordcloud() directly would draw a word once for every newsgroup it
# occurs in, each time with only that newsgroup's count. Sum the counts per
# word first so each word appears once, sized by its total frequency.
words_by_newsgroup %>%
  count(word, wt = n, name = "n") %>%
  with(wordcloud(word, n, max.words = 300))

# tf-idf of every word, treating each newsgroup as one document; rows are
# sorted from highest to lowest tf-idf.
tf_idf <- arrange(
  bind_tf_idf(words_by_newsgroup, term = word, document = newsgroup, n = n),
  desc(tf_idf)
)
# Top 12 words by tf-idf in each newsgroup whose name starts with "sci.",
# shown as one bar-chart facet per newsgroup.
tf_idf %>%
  filter(str_detect(newsgroup, "^sci\\.")) %>%
  group_by(newsgroup) %>%
  slice_max(tf_idf, n = 12) %>%
  ungroup() %>%
  # A plain reorder(word, tf_idf) builds one global factor ordering, so a
  # word that appears in several facets makes bars fall out of order within
  # a facet. reorder_within() orders words separately inside each newsgroup;
  # scale_y_reordered() strips the helper suffix from the axis labels.
  mutate(word = reorder_within(word, tf_idf, newsgroup)) %>%
  ggplot(aes(tf_idf, word, fill = newsgroup)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~newsgroup, scales = "free") +
  scale_y_reordered() +
  labs(x = "tf-idf", y = NULL)

# Pairwise correlation between newsgroups, computed from their per-word
# counts; pairs are returned sorted by correlation.
newsgroup_cors <- pairwise_cor(
  words_by_newsgroup,
  item = newsgroup,
  feature = word,
  value = n,
  sort = TRUE
)
# Network of newsgroups linked by word-usage correlation above 0.025.
# The seed is fixed before plotting, presumably so the "fr" layout is
# reproducible between runs.
set.seed(2017)
newsgroup_cors %>%
  filter(correlation > .025) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  # Edge opacity and thickness both scale with the correlation.
  geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation)) +
  geom_node_point(color = "lightblue", size = 6) +
  geom_node_text(aes(label = name), repel = TRUE, color = "red") +
  theme_void()

# Tokenise the cleaned text into bigrams (pairs of consecutive words),
# stored in a `bigram` column.
bigrams <- unnest_tokens(
  cleaned_text,
  output = bigram,
  input = text,
  token = "ngrams",
  n = 2
)
# Frequency of each bigram, most frequent first.
# Missing bigrams are removed with is.na(); comparing against the string
# "NA" only dropped them as a side effect of filter() discarding NA results.
bigrams_count <- bigrams %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)
# NOTE(review): `bigrams_filtered` is not defined in the visible part of this
# file; it presumably holds the bigrams split into `word1`/`word2` with stop
# words removed -- confirm against the full source.
# Count each (word1, word2) pair, most frequent first.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)

# Directed graph built from the word pairs that occur more than 3 times.
bigram_graph <- graph_from_data_frame(filter(bigram_counts, n > 3))

bigram_graph
IGRAPH 2918050 DN-- 40 24 --
+ attr: name (v/c), n (e/n)
+ edges from 2918050 (vertex names):
[1] 1 ->2 1 ->3
[3] static ->void time ->pad
[5] 1 ->4 infield ->fly
[7] mat ->28 vv ->vv
[9] 1 ->5 cock ->crow
[11] noticeshell->widget 27 ->1993
[13] 3 ->4 child ->molestation
[15] cock ->crew gun ->violence
+ ... omitted several edges
# Basic network plot of the bigram graph: plain edges, points for the words,
# and each word's name as a text label.
set.seed(1234)
ggraph(graph = bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(mapping = aes(label = name), hjust = 1, vjust = 1)

# Directed version of the bigram network: arrows point from the first word
# to the second, and edge opacity follows the pair count `n`.
set.seed(1234)

# Closed arrow head, 0.15 inches long, reused for every edge.
a <- grid::arrow(length = unit(.15, "inches"), type = "closed")

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(
    mapping = aes(edge_alpha = n),
    arrow = a,
    # End each edge 0.07 inches short of the target node.
    end_cap = circle(.07, "inches"),
    show.legend = FALSE
  ) +
  geom_node_point(size = 5, color = "lightblue") +
  geom_node_text(mapping = aes(label = name), hjust = 1, vjust = 1) +
  theme_void()
