A short description of the post.
packages = c("tidytext","widyr","wordcloud",
"DT","ggwordcloud","textplot",
"lubridate","hms","tidyverse",
"tidygraph","ggraph","igraph")
for (p in packages){
if(!require(p,character.only = T)){
install.packages(p)
}
library(p,character.only = T)
}
Step 1: Creating a folder list
news20 <- "data/20news/"
Step 2: Define a function to read all files from a folder into a data frame
read_folder <- function(infolder){
tibble(file = dir(infolder,
full.names=TRUE)) %>%
mutate(text=map(file,
read_lines)) %>%
transmute(id=basename(file),
text)%>%
unnest(text)
}
Step 3: Reading in all the messages from the 20news folder
raw_text <- tibble(folder=dir(news20,
full.names = TRUE)) %>%
mutate(folder_out = map(folder,
read_folder))%>%
unnest(cols = c(folder_out))%>%
transmute(newsgroup=basename(folder),
id,text)
write_rds(raw_text,"data/rds/news20.rds")
raw_text %>%
group_by(newsgroup) %>%
summarise(messages = n_distinct(id))%>%
ggplot(aes(messages,newsgroup))+
geom_col(fill="lightblue")+
labs(y=NULL)
cleaned_text <- raw_text %>%
group_by(newsgroup,id) %>%
filter(cumsum(text == "")>0,
cumsum(str_detect(
text,"^--")) == 0) %>%
ungroup()
cleaned_text <- cleaned_text%>%
filter(str_detect(text,"^[^>]+[A-Za-z\\d]")
|text == "",
!str_detect(text,
"writes(:|\\.\\.\\.)$"),
!str_detect(text,
"^In article <"))
usenet_words <- cleaned_text %>%
unnest_tokens(word, text) %>%
filter(str_detect(word, "[a-z']$"),
!word %in% stop_words$word)
usenet_words %>%
count(word,sort=TRUE)
# A tibble: 5,542 x 2
word n
<chr> <int>
1 people 57
2 time 50
3 jesus 47
4 god 44
5 message 40
6 br 27
7 bible 23
8 drive 23
9 homosexual 23
10 read 22
# … with 5,532 more rows
words_by_newsgroup <- usenet_words %>%
count(newsgroup, word, sort = TRUE) %>%
ungroup()
wordcloud(words_by_newsgroup$word,
words_by_newsgroup$n,
max.words = 300)
tf_idf <- words_by_newsgroup %>%
bind_tf_idf(word, newsgroup, n) %>%
arrange(desc(tf_idf))
DT::datatable(tf_idf,filter="top")%>%
formatRound(columns = c('tf','idf','tf_idf'),
digits=3)%>%
formatStyle(0,target='row',lineHeight='25%')
tf_idf%>%
filter(str_detect(newsgroup,"^sci\\."))%>%
group_by(newsgroup)%>%
slice_max(tf_idf,
n=12)%>%
ungroup()%>%
mutate(word=reorder(word,tf_idf))%>%
ggplot(aes(tf_idf,
word,
fill=newsgroup))+
geom_col(show.legend=FALSE)+
facet_wrap(~newsgroup,
scales="free")+
labs(x="tf-idf",
y=NULL)
newsgroup_cors <- words_by_newsgroup%>%
pairwise_cor(newsgroup,
word,
n,
sort=TRUE)
set.seed(2017)
newsgroup_cors%>%
filter(correlation > .025)%>%
graph_from_data_frame()%>%
ggraph(layout="fr") +
geom_edge_link(aes(alpha=correlation,
width=correlation)) +
geom_node_point(size=6,
color="lightblue")+
geom_node_text(aes(label=name),
color = "red",
repel = TRUE)+
theme_void()
bigrams <- cleaned_text%>%
unnest_tokens(bigram,
text,
token = "ngrams",
n=2)
bigrams_count <- bigrams %>%
filter(bigram != 'NA') %>%
count(bigram, sort = TRUE)
bigrams_separated <- bigrams %>%
filter(bigram != 'NA') %>%
separate(bigram, c("word1", "word2"),
sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_graph <- bigram_counts %>%
filter(n > 3) %>%
graph_from_data_frame()
bigram_graph
IGRAPH 2918050 DN-- 40 24 --
+ attr: name (v/c), n (e/n)
+ edges from 2918050 (vertex names):
[1] 1 ->2 1 ->3
[3] static ->void time ->pad
[5] 1 ->4 infield ->fly
[7] mat ->28 vv ->vv
[9] 1 ->5 cock ->crow
[11] noticeshell->widget 27 ->1993
[13] 3 ->4 child ->molestation
[15] cock ->crew gun ->violence
+ ... omitted several edges
set.seed(1234)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name),
vjust = 1,
hjust = 1)
set.seed(1234)
a <- grid::arrow(type = "closed",
length = unit(.15,
"inches"))
ggraph(bigram_graph,
layout = "fr") +
geom_edge_link(aes(edge_alpha = n),
show.legend = FALSE,
arrow = a,
end_cap = circle(.07,
'inches')) +
geom_node_point(color = "lightblue",
size = 5) +
geom_node_text(aes(label = name),
vjust = 1,
hjust = 1) +
theme_void()