Text Mining of Series Scripts with R

In [1]:
library(rvest)

# Which tv show, if you want another show, first check on the website which tv show url is used. 
tvshow <- "orange-is-the-new-black"
 
# Creating download directory and change to it
directory = paste("~/Data Analysis/files/", tvshow, sep="")
dir.create(directory, recursive = TRUE, showWarnings = FALSE)
setwd(directory)
 
# Setting base url and complte url
baseurl <- "http://www.springfieldspringfield.co.uk/"
url <- paste(baseurl,"episode_scripts.php?tv-show=", tvshow, sep="")
Loading required package: xml2
In [2]:
# read the HTML page
scrape_url <- read_html(url)
# node selector
s_selector <- ".season-episode-title"
 
# scrape href nodes in .season-episode-title
all_urls_season <- html_nodes(scrape_url, s_selector) %>% html_attr("href")
In [3]:
# str(all_urls_season)
In [4]:
# head(all_urls_season)
In [5]:
# tail(all_urls_season)
In [6]:
# Loop through all season urls 
for (i in all_urls_season) {
  uri <- read_html(paste(baseurl, i, sep="/"))
  # same thing here first check which node we need to select, so forst do a inspect of the site
  script_selector <- ".scrolling-script-container"
  # scrape all script text to a variable
  text <- html_nodes(uri, script_selector) %>% html_text()
 
  # Get last five characters of all_urls_season as season for saving this to seperate text files
  substrRight <- function(x, n) {
    substr(x, nchar(x)-n+1, nchar(x))
  }
  seasons <- substrRight(i, 5)
  # Write each script to a seperate text file
  write.csv(text, file = paste(directory, "/", tvshow, "_", seasons, ".txt", sep=""), row.names = FALSE)
}
In [7]:
library(tm)

# set filepath to scripts
cname <- file.path(directory)
# see if the filepath contains our scripts
# (docname <- dir(cname))
docname <- dir(cname)
Loading required package: NLP
In [8]:
# Crete a Corpus of the text files so we can do some analysis
docs <- Corpus(DirSource(cname), readerControl = list(id=docname))
# Show summary of the Corpus, we have all documents in our Corpus
# summary(docs)
In [9]:
# Inspect the first document, it has 26533 characters
# inspect(docs[1])
In [10]:
docs <- tm_map(docs, tolower)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("english"))
In [11]:
library(SnowballC)

docs <- tm_map(docs, stemDocument)
docs <- tm_map(docs, stripWhitespace)
In [12]:
# inspect(docs[1])
In [13]:
docs <- tm_map(docs, PlainTextDocument)
In [14]:
# Create a tdm
tdm <- TermDocumentMatrix(docs)
# Add readable columnnames, in our case the document filename
docname <- gsub("orange-is-the-new-black_", "",docname)
docname <- gsub(".txt", "",docname)
# docname <- paste(docname, sep="")
docname <- paste("s",docname, sep="")
colnames(tdm) <- docname
# Show and inspect the tdm
# tdm
In [15]:
# inspect(tdm[1:10,1:6])
In [16]:
dtm <- DocumentTermMatrix(docs)
rownames(dtm) <- docname
# dtm
In [17]:
# inspect(dtm[1:10,1:6])
In [18]:
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq,20)
know
2589
like
2335
dont
2281
get
1705
right
1688
just
1614
youre
1594
yeah
1564
got
1426
gonna
1301
thats
1179
well
1160
okay
1119
now
1115
can
1068
think
1020
shit
999
hey
945
one
895
good
851
In [19]:
tf <- data.frame(term=names(freq), freq=freq)     
# head(tf,20)
In [20]:
# descending sort of teh tf by freq
tf$term <- factor(tf$term, levels = tf$term[order(-tf$freq)])
library(ggplot2)
Attaching package: ‘ggplot2’

The following object is masked from ‘package:NLP’:

    annotate

In [21]:
p <- ggplot(subset(tf, freq>800), aes(term, freq))    
p <- p + geom_bar(stat="identity")   
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))   
p
In [22]:
library(tm)

tdm.common = removeSparseTerms(tdm, sparse = 0.001) #0.1)
# tdm
In [23]:
# tdm.common
In [24]:
dim(tdm)
  1. 16175
  2. 52
In [25]:
dim(tdm.common)
  1. 72
  2. 52
In [26]:
# inspect(tdm.common[1:10,1:6])
In [27]:
tdm.common.mod <- tdm.common[c(-4,-5,-9,-10,-19,-22,-53,-60,-61,-68,-69,-72),]
dim(tdm.common.mod)
  1. 60
  2. 52
In [28]:
tdm.dense.mod <- as.matrix(tdm.common.mod)
dim(tdm.dense.mod)
  1. 60
  2. 52
In [29]:
# tdm.dense.mod
In [30]:
library(reshape2)
tdm.dense.mod.m <- melt(tdm.dense.mod, value.name = "count")
head(tdm.dense.mod.m)
TermsDocscount
back s01e01 9
bad s01e01 4
call s01e01 7
cause s01e01 1
come s01e0131
day s01e01 3
In [31]:
library(ggplot2)
ggplot(tdm.dense.mod.m, aes(x = Docs, y = Terms, fill = log10(count))) +
     geom_tile(colour = "white") +
     scale_fill_gradient(high="steelblue" , low="white")+
     ylab("") +
     theme(panel.background = element_blank()) +
     theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
In [32]:
corr <- cor(tdm.dense.mod)
library(corrplot)
corrplot(corr, method = "circle", type = "upper", tl.col="black", tl.cex=0.7)
In [33]:
tdm.dense.mod.t <- t(tdm.dense.mod)
corr.t <- cor(tdm.dense.mod.t)
corrplot(corr.t,method = "circle", type = "upper", tl.col="black", tl.cex=0.7)