# Slug of the TV show to download. For another show, look up on the website
# which slug its URL uses and put it here.
tvshow <- "orange-is-the-new-black"

# Local directory the scripts are written to; created (with parents) if it
# does not exist yet. Warnings (e.g. directory already present) are silenced.
directory <- paste0("~/Data Analysis/files/", tvshow)
dir.create(directory, showWarnings = FALSE, recursive = TRUE)

# Base URL of the site and the complete URL of the show's episode index page.
# NOTE(review): baseurl is an empty string in this file, so `url` ends up as a
# relative path - confirm the intended site address before running.
baseurl <- ""
url <- paste0(baseurl, "episode_scripts.php?tv-show=", tvshow)
# Fetch and parse the episode index page
scrape_url <- read_html(url)
# CSS selector of the nodes that carry the episode links
s_selector <- ".season-episode-title"
# Collect the href attribute of every node matching the selector
all_urls_season <- html_attr(html_nodes(scrape_url, s_selector), "href")
# Handy checks while developing:
# str(all_urls_season)
# head(all_urls_season)
# tail(all_urls_season)
# CSS selector of the node holding the script text on an episode page.
# Inspect the site first to check which node is needed.
script_selector <- ".scrolling-script-container"

# Return the last `n` characters of `x`.
#   x: character vector
#   n: number of trailing characters to keep
substrRight <- function(x, n) {
  substr(x, nchar(x) - n + 1, nchar(x))
}

# Loop through all episode urls: download each page, pull out the script text
# and store it in its own file inside `directory`.
for (i in all_urls_season) {
  uri <- read_html(paste(baseurl, i, sep = "/"))
  # Scrape all script text into a character vector
  text <- html_nodes(uri, script_selector) %>% html_text()
  # The last five characters of the url are used as the season/episode tag
  # that makes the file name unique
  seasons <- substrRight(i, 5)
  # Write each script to a separate text file. write.csv() is kept from the
  # original, so the file is csv-formatted despite its .txt extension.
  write.csv(
    text,
    file = paste0(directory, "/", tvshow, "_", seasons, ".txt"),
    row.names = FALSE
  )
}
# Path of the directory that holds the downloaded scripts
cname <- file.path(directory)
# File names found in that directory; reused further down to label the
# document axis of the term matrices
# (docname <- dir(cname))
docname <- dir(cname)
# Create a Corpus of the text files so we can do some analysis
docs <- Corpus(DirSource(cname), readerControl = list(id = docname))
# Show summary of the Corpus, we have all documents in our Corpus
# summary(docs)
# Inspect the first document, it has 26533 characters
# inspect(docs[1])

# Cleaning pipeline: lower-case, strip punctuation and digits, drop English
# stop words, stem, and collapse repeated whitespace.
# tolower() is a base function that returns a bare character vector rather
# than a text document, so it is wrapped in content_transformer(); the other
# steps are tm transformations and can be passed to tm_map() directly.
# Because every step now hands back proper documents, the former
# tm_map(docs, PlainTextDocument) re-wrap at the end is no longer needed.
# NOTE(review): if this changes the resulting term list, hard-coded row
# indices used later in the script may need to be re-checked.
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, stemDocument)
docs <- tm_map(docs, stripWhitespace)
# inspect(docs[1])
# Create a term-document matrix from the cleaned corpus
tdm <- TermDocumentMatrix(docs)
# Build readable document labels from the file names: drop the "<tvshow>_"
# prefix and the ".txt" extension, then prepend "s".
# The prefix is derived from `tvshow` and matched literally, so the labels
# stay correct when another show is configured; the extension pattern is
# escaped and anchored so "." only matches a real dot at the end of the name.
docname <- gsub(paste0(tvshow, "_"), "", docname, fixed = TRUE)
docname <- sub("\\.txt$", "", docname)
docname <- paste0("s", docname)
# Documents are the columns of the tdm
colnames(tdm) <- docname
# Show and inspect the tdm
# tdm
# inspect(tdm[1:10,1:6])
# Same data with documents as rows
dtm <- DocumentTermMatrix(docs)
rownames(dtm) <- docname
# dtm
# inspect(dtm[1:10,1:6])
# Total occurrences of each term over all documents, largest first
freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
tf <- data.frame(term = names(freq), freq = freq)
# head(tf,20)
# Turn the term column into a factor whose levels follow descending frequency
tf$term <- factor(tf$term, levels = tf$term[order(-tf$freq)])
# Bar chart of the terms counted more than 800 times, with the x axis labels
# rotated 45 degrees. The plot object is stored in `p`; it is not printed here.
p <- ggplot(subset(tf, freq > 800), aes(term, freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Keep only terms whose sparsity is below the threshold
# (0.1 was tried as an alternative value)
tdm.common <- removeSparseTerms(tdm, sparse = 0.001) # 0.1)
# tdm
# tdm.common
# inspect(tdm.common[1:10,1:6])
# Drop a hand-picked set of term rows by position.
# NOTE(review): these indices are tied to the exact term list produced
# upstream - re-check them whenever the corpus or the cleaning steps change.
tdm.common.mod <- tdm.common[
  c(-4, -5, -9, -10, -19, -22, -53, -60, -61, -68, -69, -72),
]
tdm.dense.mod <- as.matrix(tdm.common.mod)
# tdm.dense.mod
# Reshape to long format, one row per term/document pair, with the cell value
# stored in a column named "count" (the name the plot below refers to).
# The Terms and Docs columns used in aes() are assumed to come from the
# dimnames of the dense matrix - confirm against the melted data frame.
tdm.dense.mod.m <- melt(tdm.dense.mod, value.name = "count")
# Heatmap of log10 term counts: documents on x, terms on y
ggplot(tdm.dense.mod.m, aes(x = Docs, y = Terms, fill = log10(count))) +
  geom_tile(colour = "white") +
  scale_fill_gradient(high = "steelblue", low = "white") +
  ylab("") +
  theme(panel.background = element_blank()) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Correlation between the columns of the dense matrix (the documents),
# drawn as circles in the upper triangle
corr <- cor(tdm.dense.mod)
corrplot(
  corr,
  method = "circle", type = "upper",
  tl.col = "black", tl.cex = 0.7
)
# Transpose so the terms become the columns, then repeat the same plot for
# the correlation between terms
tdm.dense.mod.t <- t(tdm.dense.mod)
corr.t <- cor(tdm.dense.mod.t)
corrplot(
  corr.t,
  method = "circle", type = "upper",
  tl.col = "black", tl.cex = 0.7
)
# tdm.dense.mod.t
# Network plot of the term correlations with min_cor set to 0.35
network_plot(correlate(tdm.dense.mod.t), min_cor = 0.35)
# Same network plot with the stricter min_cor of 0.4
network_plot(correlate(tdm.dense.mod.t), min_cor = 0.4)