Text Mining & Word Clouds

This section covers text mining and word clouds. I used two large data sets: Martin Luther King’s “I Have a Dream” speech and a Hunger Games book.

Text mining, creating word clouds, and word analysis with two different word populations.

MLK speech

File (.TXT of his speech)

Hunger Games

File (.TXT of book)

Packages and upload

# Load
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
library(Rcpp)

# Read the chosen .txt file into a character vector, one element per line.
# file.choose() opens an interactive file picker, so this step needs a
# user at the console.
text <- readLines(con = file.choose())

# Wrap the lines in a tm corpus; VectorSource treats each element of
# `text` as its own document.
docs <- Corpus(x = VectorSource(x = text))

Clean data

# inspect(docs)

# Transformer that swaps every match of `pattern` for a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")

# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove your own stopwords: replace the placeholder entries below with a
# character vector of corpus-specific words to drop
docs <- tm_map(docs, removeWords, c("blabla1", "blabla2"))
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Text stemming (disabled)
# docs <- tm_map(docs, stemDocument)
# Drop any "and" still present once punctuation is gone
docs <- tm_map(docs, removeWords, "and")
# Collapse runs of whitespace last: removeWords deletes words without
# touching the spaces around them, so stripping any earlier leaves gaps
# behind in the cleaned documents
docs <- tm_map(docs, stripWhitespace)

# inspect(docs)

# Term-document matrix built from the cleaned corpus.
dtm <- TermDocumentMatrix(x = docs)

# Dense copy of the counts so rowSums() can total each term.
m <- as.matrix(x = dtm)

# Total occurrences per term, most frequent first.
v <- sort(x = rowSums(x = m), decreasing = TRUE)

# Frequency table: the term and its overall count.
d <- data.frame(word = names(v), freq = v)

# Print the 20 most frequent and the 20 least frequent terms.
head(d, n = 20)

tail(d, n = 20)

Build Clouds

# Fix the RNG seed so the randomised parts of the cloud are repeatable.
set.seed(1234)
wordcloud(
  words = d$word,
  freq = d$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

# Terms that occur at least 4 times, then at least 5 times.
findFreqTerms(dtm, lowfreq = 4)

findFreqTerms(dtm, lowfreq = 5)

# Terms whose correlation with each probe word is at least 0.3.
# NOTE(review): "but" is normally part of stopwords("english"), which the
# cleaning step removes, so that query likely returns nothing - confirm.
findAssocs(dtm, terms = "district", corlimit = 0.3)
findAssocs(dtm, terms = "but", corlimit = 0.3)
findAssocs(dtm, terms = "thresh", corlimit = 0.3)

# Ten most frequent terms.
head(d, n = 10)

MLK “I Have a Dream” speech word cloud
Hunger Games Word Cloud

Bar Chart of word frequency

# Bar chart of the most frequent terms. head() takes at most ten rows, so
# a corpus with fewer than ten distinct terms is plotted as-is instead of
# being padded with NA rows the way d[1:10, ] would be.
top_terms <- head(d, 10)
barplot(top_terms$freq, las = 2, names.arg = top_terms$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")

MLK “I Have a Dream” speech word frequency
Hunger Games word frequency