### Text analysis of social media data
### (Day 3) course notes.
### Using the 'thebananagirl' facebook page data.
### (part of the healthy food movement).
## ensure packages are loaded
library("tm")
library("wordcloud")
library("RColorBrewer")
library("SnowballC")
library("ape")
library("dendextend")
library("stringr")
library("topicmodels")
library("slam")
library("Rmpfr")
# read in the dataset for this session
fbData <- read.csv("thebananagirl_facebook_data.csv")
# an alternative dataset we could use / test out, etc
# fbData <- read.csv("rawtil4_facebook_data.csv")
# view information about this dataframe (which was generated using SocialMediaLab)
str(fbData)
# we just want to keep the vector of comments, for this work.
fbData <- fbData$commentText
## some data cleaning
# we only want comments with text (many rows represent simply 'likes',
# which carry no comment text)
# keep comments that are non-empty and not NA.
# NOTE: logical subsetting is used here instead of fbData[-which(cond)]:
# when no element matches, which() returns integer(0), and
# x[-integer(0)] selects NOTHING (it is x[integer(0)]), silently
# deleting the entire dataset.
fbData <- fbData[!is.na(fbData) & fbData != ""]
length(fbData) # tells us how many comments we have to work with
# convert the character encoding to UTF-8. avoids errors relating to char encoding.
fbData <- iconv(fbData, to = "UTF-8")
# mac users should use this instead (run one or the other, not both —
# previously both conversions ran unconditionally):
# fbData <- iconv(fbData, to = "utf-8-mac")
# using 'tm' package we convert character vector to a VCorpus object (volatile corpus)
fbCorpus <- VCorpus(VectorSource(fbData))
# inspect the first 5 observations
inspect(fbCorpus[1:5])
# individual comments accessed via:
fbCorpus[[4]]
## now we do transformations of text using tm_map ('mapping to the corpus')
# NOTE: the ORDER of transformations matters. Stop words must be removed
# BEFORE stemming and before punctuation removal: stemming first mangles
# multi-character stop words (e.g. "because" -> "becaus") so they no longer
# match stopwords("english"), and stripping punctuation first breaks
# contraction stop words such as "don't".
# convert to all lowercase (stopword lists are lowercase, so do this first)
fbCorpus <- tm_map(fbCorpus, content_transformer(tolower))
# remove stop words (not always useful!)
fbCorpus <- tm_map(fbCorpus, removeWords, stopwords("english"))
# remove numbers (not always useful!)
fbCorpus <- tm_map(fbCorpus, removeNumbers)
# remove punctuation (not always useful! e.g. text emoticons)
fbCorpus <- tm_map(fbCorpus, removePunctuation)
# perform stemming (not always useful!)
fbCorpus <- tm_map(fbCorpus, stemDocument)
# eliminate extra whitespace left behind by the removals above
fbCorpus <- tm_map(fbCorpus, stripWhitespace)
# look at the difference now
fbCorpus[[4]]
# ... and we can also use our own stopwords (applied after stemming, so the
# stemmed form of plurals, e.g. "bananas" -> "banana", is caught as well)
myStopwords <- c("banana")
fbCorpus <- tm_map(fbCorpus, removeWords, myStopwords)
# now we can see this comment is missing "banana"!
fbCorpus[[4]]
# build a document-term matrix, keeping only terms of 3+ characters
dtm <- DocumentTermMatrix(fbCorpus, control = list(wordLengths = c(3, Inf)))
dtm
# drop sparse terms; choosing the sparsity threshold is art and science
removeSparseTerms(dtm, 0.99) # prints summary info for this threshold
# extremely sparse matrix; extreme threshold for removal
dtmSparseRemoved <- removeSparseTerms(dtm, 0.99)
## BEGIN ANALYSIS ########
# term frequencies across the corpus (column sums of the DTM)
freqTerms <- colSums(as.matrix(dtmSparseRemoved))
freqTerms
# indices of terms ordered from most to least frequent
orderTerms <- order(freqTerms, decreasing = TRUE)
# 5 most frequent terms
# (note: head(orderTerms) alone would return 6 — head()'s default):
freqTerms[head(orderTerms, 5)]
# 10 least frequent terms
# (note: (length(freqTerms)-10):length(freqTerms) spans 11 terms — off by one):
freqTerms[tail(orderTerms, 10)]
findFreqTerms(dtmSparseRemoved, 300) # occurred at least 300 times
# find correlations between terms
# note: for extremely sparse matrices, this may not be that useful
# if two words always appear together, then corr = 1
# if they never co-occur, corr is near 0 (or negative)
findAssocs(dtmSparseRemoved, "meat", corlimit = 0.25)
# word cloud of the most frequent terms
wordcloud(names(freqTerms), freqTerms, min.freq = 40, max.words = 100,
          colors = brewer.pal(6, "Dark2"), scale = c(5, .5))
#############################################
## TOPIC MODELING using package 'topicmodels'
# create a dtm (the fast way this time: all cleaning done via the control list)
dtmTopicModeling <- DocumentTermMatrix(
  fbCorpus,
  control = list(stemming = TRUE, stopwords = TRUE, wordLengths = c(3, Inf),
                 tolower = TRUE, removeNumbers = TRUE, removePunctuation = TRUE)
)
dtmTopicModeling <- removeSparseTerms(dtmTopicModeling, 0.998)
# Perform term frequency-inverse document frequency (tf-idf) weighting
# to identify (and keep) the more informative terms
summary(col_sums(dtmTopicModeling))
# mean tf per term * idf, computed from the simple_triplet_matrix
# internals ($v = counts, $i = doc index, $j = term index)
term_tfidf <- tapply(
  dtmTopicModeling$v / row_sums(dtmTopicModeling)[dtmTopicModeling$i],
  dtmTopicModeling$j,
  mean
) * log2(nDocs(dtmTopicModeling) / col_sums(dtmTopicModeling > 0))
summary(term_tfidf)
# keep only terms with tf-idf of at least 0.6
dtmTopicModeling <- dtmTopicModeling[, term_tfidf >= 0.6]
# remember which rows (documents) became empty after the column filtering,
# so the matching comments can be dropped from fbData later
toRemove <- which(row_sums(dtmTopicModeling) == 0)
# LDA requires every document to contain at least one term
dtmTopicModeling <- dtmTopicModeling[row_sums(dtmTopicModeling) > 0, ]
# now we need to calculate k (number of topics)
# we use an approach of model selection by harmonic mean
# see: http://stackoverflow.com/a/21394092/2589495

# Harmonic-mean estimator of the marginal likelihood from a vector of
# log-likelihoods, computed in high-precision arithmetic (Rmpfr) to avoid
# overflow when exponentiating large negative log-likelihoods.
#
# Args:
#   logLikelihoods: numeric vector of log-likelihood values
#   precision: bits of precision for the mpfr computation
# Returns:
#   the log harmonic-mean estimate, as a plain double
harmonicMean <- function(logLikelihoods, precision = 2000L) {
  # Rmpfr is attached at the top of the script; fail clearly if it is
  # missing rather than attaching a package from inside a function
  stopifnot(requireNamespace("Rmpfr", quietly = TRUE))
  llMed <- median(logLikelihoods)
  # shift by the median before exponentiating, for numerical stability
  as.double(llMed - log(mean(exp(-Rmpfr::mpfr(logLikelihoods,
                                              prec = precision) + llMed))))
}
# The log-likelihood values are determined by fitting models over a range of k.
# Gibbs sampler settings:
burnin <- 1000
iter <- 100 # better to be 1000, but speed is a factor here
keep <- 50
# candidate numbers of topics (step of 25 keeps the runtime manageable)
sequ <- seq(2, 100, by = 25)
# you can time how long this takes
ptm <- proc.time()
# fit one LDA model (Gibbs sampling) per candidate k
fittedModels <- lapply(sequ, function(k) {
  LDA(dtmTopicModeling, k = k, method = "Gibbs",
      control = list(burnin = burnin, iter = iter, keep = keep))
})
proc.time() - ptm
# extract the kept log-likelihoods from each model, discarding burn-in samples
modelLogLiks <- lapply(fittedModels, function(m) m@logLiks[-seq_len(burnin / keep)])
# harmonic-mean estimate for each candidate k
hm_many <- sapply(modelLogLiks, harmonicMean)
# plot the results to inspect visually
plot(sequ, hm_many, type = "l")
# the optimum number of topics maximises the harmonic mean
sequ[which.max(hm_many)]
# we just assign this directly to variable k
k <- sequ[which.max(hm_many)]
# set a seed number so we can reproduce the results
seedNum <- 2010
# lda <- LDA(dtmTopicModeling, k = k, method = "Gibbs",control = list(seed = seedNum))
lda <- LDA(dtmTopicModeling, k = k, method = "Gibbs",
           control = list(burnin = burnin, iter = iter, keep = keep, seed = seedNum))
# find out the top 5 terms for each topic
topFiveTermsEachTopic <- terms(lda, 5)
# find out which topic each document has been assigned to (highest probability)
topicsProb <- topics(lda, 1)
# let's look at one topic in particular. Topic 4 looks interesting
topFiveTermsEachTopic[, 4]
# get the indices of documents (i.e. comments) in topic 4
topic4Comments <- which(topicsProb == 4)
# map back to the original comments, dropping the rows removed after tf-idf
# filtering. Guard against toRemove being empty: fbData[-integer(0)] would
# select NOTHING rather than everything.
if (length(toRemove) > 0) {
  fbData_LDA <- fbData[-toRemove]
} else {
  fbData_LDA <- fbData
}
# now we can view all comments from topic 4
topic4CommentsText <- as.list(fbData_LDA[topic4Comments])
# let's view a sample
set.seed(500)
sample(topic4CommentsText, 10)
# The next steps will likely involve qualitative analysis....