IMPORTANT: To use this notebook, you'll need to run
ipython notebook
in the same directory where the notebook and the scripts were put.
This work is licensed under a Creative Commons Attribution 4.0 International License.
# All these imports are needed for the following cells to run
import nltk
import os
import re
# import csv
import itertools
from nltk.tokenize import sent_tokenize, word_tokenize
import networkx as nx
import matplotlib.pyplot as plt
# from matplotlib.patches import Ellipse, Polygon
import numpy as np
# import math
# import random
import codecs
from textblob import TextBlob
import pandas as pd
%matplotlib inline
%load_ext autoreload
# Read the two speeches as Unicode text (UTF-8 on disk).
# Each handle is opened in a `with` block so it is closed as soon as it has
# been used; the original left all four handles open.
with codecs.open('HillaryClintonCampaignLaunch6-12-2015.txt', "r", encoding="utf-8") as f1:
    HClinton = f1.read()
with codecs.open('BernieSandersCampaignLaunch5-26-2015.txt', "r", encoding="utf-8") as f2:
    BSanders = f2.read()

# The scratch file "tempASCII" receives the two file *names* (not the speech
# text), exactly as before. The writes are now sequential instead of going
# through two simultaneously open handles on the same path, so the final
# content no longer depends on which buffer happens to be flushed last.
# NOTE(review): nothing in the visible code reads "tempASCII" -- confirm
# whether this scratch file is still needed.
with codecs.open("tempASCII", "w", encoding="ascii", errors="ignore") as f1Temp:
    f1Temp.write('HillaryClintonCampaignLaunch6-12-2015.txt')
with codecs.open("tempASCII", "w", encoding="ascii", errors="ignore") as f2Temp:
    f2Temp.write('BernieSandersCampaignLaunch5-26-2015.txt')
# Noun phrases are extracted from the two speeches using TextBlob
blobHClinton = TextBlob(HClinton)
blobBSanders = TextBlob(BSanders)
nounHC=blobHClinton.np_counts
nounBS = blobBSanders.np_counts
dfnp1=pd.DataFrame(columns=["H Clinton's noun phrases", "Frequencies"])
dfnp2=pd.DataFrame(columns=["B Sanders' noun phrases", "Frequencies"])
u=1
for l in nounHC:
dfnp1.loc[u]=[l,nounHC[l]]
u+=1
u=1
for l in nounBS:
dfnp2.loc[u]=[l,nounBS[l]]
u+=1
print "The total number of noun phrases in H Clinton's speech is", len(nounHC)
dfnp1.head(5)
# To see all the noun phrases in H Clinton's speech just run dfnp1 above
print "The total number of noun phrases in B Sanders' speech is", len(nounBS)
dfnp2.head(5)
# To see all the noun phrases in B Sanders' speech just run dfnp2 above
# Noun phrases that occur in both speeches.
#   common[p]  -> the smaller of the two per-speech frequencies
#   ncommon[p] -> (frequency in Clinton's speech, frequency in Sanders' speech)
#   dfc        -> the same information as a table, rows labelled from 1
dfc = pd.DataFrame(columns=["Common noun phrases", "H Clinton", "B Sanders"])
common = {}
ncommon = {}
u = 0
for l in nounHC:
    if l not in nounBS:
        continue  # phrase used by Clinton only
    u += 1
    hc_freq = nounHC[l]
    bs_freq = nounBS[l]
    common[l] = min(hc_freq, bs_freq)
    ncommon[l] = (hc_freq, bs_freq)
    dfc.loc[u] = [l, hc_freq, bs_freq]
dfc
# Drop phrases that are not meaningful as common topics.
to_be_removed = [u'well', u'thank', u'york']
# Iterate over the short removal list rather than over `common` itself:
# deleting from a dict while looping over its keys only works when keys()
# returns a copy (Python 2) and fails on Python 3. pop(..., None) silently
# skips phrases that are not present, matching the original behaviour.
for key in to_be_removed:
    common.pop(key, None)
# The remaining common phrases, as a list, in the dict's iteration order.
ccommon = list(common)
# This function counts co-occurrences of the common noun phrases in the sentences of a speech
def occurrences(source, terms):
    """Count the sentences in which one term is followed by another.

    `source` is split into sentences with `sent_tokenize`. For every pair of
    distinct positions in `terms`, each sentence is searched
    case-insensitively for the first term followed later in the same sentence
    by the second term, and separately for the reverse order. Matching is on
    raw substrings (no word boundaries are required).

    Args:
        source: The text to analyse.
        terms: Iterable of strings. Each term is matched literally; regex
            metacharacters inside a term have no special meaning.

    Returns:
        A dict mapping "<A> - <B>" to the number of sentences in which A
        occurs before B. A sentence contributes at most 1 to a given key.
        Ordered pairs that never occur are absent from the dict.
    """
    sentences = sent_tokenize(source)
    flags = re.DOTALL | re.IGNORECASE
    occurdict = {}
    # Every combination is examined. The previous index loop started at 1 and
    # therefore silently skipped the first pair.
    for first, second in itertools.combinations(terms, 2):
        # Compile once per pair (not once per sentence) and escape the terms
        # so characters such as '.', '(' or '+' are taken literally. The terms
        # are used as-is instead of being passed through str(), which raised
        # on non-ASCII Unicode terms under Python 2.
        ordered_patterns = (
            (first + ' - ' + second,
             re.compile(re.escape(first) + '(.*?)' + re.escape(second), flags)),
            (second + ' - ' + first,
             re.compile(re.escape(second) + '(.*?)' + re.escape(first), flags)),
        )
        for sentence in sentences:
            for key, pattern in ordered_patterns:
                if pattern.search(sentence):
                    occurdict[key] = occurdict.get(key, 0) + 1
    return occurdict
# Sentence-level co-occurrence counts of the shared noun phrases, one dict per speech
HCdict = occurrences(HClinton,ccommon)
BSdict = occurrences(BSanders,ccommon)
# Builds the graph of common noun phrases linked by their co-occurrences in the sentences of a speech
def makegraph(occurrences):
    """Turn a co-occurrence dict into an undirected networkx graph.

    Args:
        occurrences: Mapping whose keys look like "<phrase A> - <phrase B>"
            and whose values are stored as the edge attribute `weight`.

    Returns:
        An `nx.Graph` with one node per phrase (each carrying a `label`
        attribute equal to its own name) and one edge per key.

    NOTE(review): when both "A - B" and "B - A" are present they address the
    same undirected edge, so the weight kept is presumably the one processed
    last -- confirm whether the two counts should be combined instead.
    """
    G = nx.Graph()
    for pair_label in occurrences:
        pair_weight = occurrences[pair_label]
        pieces = pair_label.split(' - ')
        left, right = pieces[0], pieces[1]
        G.add_edge(left, right, weight=pair_weight)
        G.add_node(left, label=left)
        G.add_node(right, label=right)
    return G
# Build one undirected co-occurrence graph per speech
GHC = makegraph(HCdict)
GBS = makegraph(BSdict)
# Show Clinton's edges together with their attribute dicts for inspection
print GHC.edges(data=True)
# Draw the network of common phrases for H Clinton's speech.
from tools import draw_network

sstt = "Hillary Clinton's wordnet"  # title; also reused by the community cell below
# Node positions come from Graphviz; nx.spring_layout(GHC) is the alternative
# the original kept commented out.
# NOTE(review): nx.graphviz_layout needs Graphviz bindings and may not be
# exposed at the top level of newer networkx releases -- confirm against the
# installed version.
pos = nx.graphviz_layout(GHC)
possit = draw_network(
    GHC,
    sstt,
    pos=pos,
    with_edgewidth=True,
    withLabels=True,
    labfs=15,
    labelfont=15,
    valpha=0.2,
    ealpha=0.7,
)
# Centrality measures of the common phrases in H Clinton's graph.
# The result is used by the next cell as a dict of dicts:
# measure name -> {phrase -> value}.
from tools import draw_centralities_subplots

centrali = draw_centralities_subplots(
    GHC,
    pos,
    withLabels=False,
    labfs=5,
    figsi=(15, 22),
    ealpha=1,
    vals=True,
)
# The table of centralities of common phrases in the graph of H Clinton's speech
# `centrali` is used as: measure name -> {phrase -> centrality value}.
dfchc = pd.DataFrame()
# Fix the row order once, from the first measure's phrases. This replaces the
# Python-2-only `centrali.keys()[0]` indexing.
hc_phrases = list(next(iter(centrali.values()), {}))
dfchc.insert(0, 'Common Nouns', hc_phrases)
for i, k in centrali.items():
    # Look every value up by phrase so each row is guaranteed to belong to the
    # phrase in its 'Common Nouns' cell; the original relied on all dicts
    # iterating in the same order. Columns are still inserted at position 1,
    # as before, so the column order of the table is unchanged.
    dfchc.insert(1, i, [k[phrase] for phrase in hc_phrases])
dfchc
# Detecting communities of common phrases in the graph of H Clinton's speech
%autoreload 2
from tools import draw_comms, modul_arity, print_communities
part,nodper=print_communities(GHC,sstt)
d=0.8
dd=0.8
c=1.2
cc=1.4
alpha=0.2
ealpha=.7
vcc={}
sstta="The %s %s Communities" %(max(part.values())+1,sstt)
draw_comms(GHC,GHC.nodes(),[],[],[] ,part,part,d,dd,c,cc,alpha,ealpha,nodper,sstta,titlefont=20,labelfont=15,valpha=0.5)
# Draw the network of common phrases for B Sanders' speech.
sstt = "Bernie Sanders's wordnet"  # title; also reused by the community cell below
# Node positions come from Graphviz; nx.spring_layout(GBS) is the alternative
# the original kept commented out.
# NOTE(review): nx.graphviz_layout needs Graphviz bindings and may not be
# exposed at the top level of newer networkx releases -- confirm against the
# installed version.
pos = nx.graphviz_layout(GBS)
posit = draw_network(
    GBS,
    sstt,
    pos=pos,
    with_edgewidth=True,
    withLabels=True,
    labfs=15,
    labelfont=15,
    valpha=0.2,
    ealpha=0.7,
)
# Centrality measures of the common phrases in B Sanders' graph.
# The result is used by the next cell as a dict of dicts:
# measure name -> {phrase -> value}.
centrali = draw_centralities_subplots(
    GBS,
    pos,
    withLabels=False,
    labfs=5,
    figsi=(15, 22),
    ealpha=1,
    vals=True,
)
# The table of centralities of common phrases in the graph of B Sanders' speech
# `centrali` is used as: measure name -> {phrase -> centrality value}.
dfcbs = pd.DataFrame()
# Fix the row order once, from the first measure's phrases. This replaces the
# Python-2-only `centrali.keys()[0]` indexing.
bs_phrases = list(next(iter(centrali.values()), {}))
dfcbs.insert(0, 'Common Nouns', bs_phrases)
for i, k in centrali.items():
    # Look every value up by phrase so each row is guaranteed to belong to the
    # phrase in its 'Common Nouns' cell; the original relied on all dicts
    # iterating in the same order. Columns are still inserted at position 1,
    # as before, so the column order of the table is unchanged.
    dfcbs.insert(1, i, [k[phrase] for phrase in bs_phrases])
dfcbs
# Detecting communities of common phrases in the graph of B Sanders' speech
%autoreload 2
part,nodper=print_communities(GBS,sstt)
d=0.8
dd=0.8
c=1.2
cc=1.4
alpha=0.2
ealpha=.7
vcc={}
sstta="The %s %s Communities" %(max(part.values())+1,sstt)
draw_comms(GBS,GBS.nodes(),[],[],[] ,part,part,d,dd,c,cc,alpha,ealpha,nodper,sstta,titlefont=20,labelfont=15,valpha=0.5)
# Plotting the Hillary Clinton vs. Bernie Sanders wordnet
# The combined graph holds every edge from either speech; edges are then
# coloured by which speech(es) they come from.
G = nx.Graph()
G.add_edges_from(GHC.edges())
G.add_edges_from(GBS.edges())
# NOTE(review): nx.graphviz_layout needs Graphviz bindings and may not be
# exposed at the top level of newer networkx releases -- confirm against the
# installed version. nx.spring_layout(G) is the alternative the original kept
# commented out.
pos = nx.graphviz_layout(G)
sstt = "Hillary Clinton vs. Bernie Sanders wordnet \n Clinton's edges are cyan, Sanders' edges are green and common edges are red"
plt.figure(figsize=(15,15))


def _canonical_edges(graph):
    """Return the graph's edges as a set of (u, v) tuples with u <= v."""
    return set(tuple(sorted(edge)) for edge in graph.edges())


# Both graphs are undirected, so the same edge may be reported as (a, b) by
# one graph and (b, a) by the other. Comparing the raw tuples (as the original
# did) then fails to recognise the edge as common; sorting the endpoints first
# makes the set operations orientation-independent.
hc_edge_set = _canonical_edges(GHC)
bs_edge_set = _canonical_edges(GBS)

common_edges = hc_edge_set & bs_edge_set
HCandBSedges = list(common_edges)
Cedges = hc_edge_set - bs_edge_set
HCedges = list(Cedges)
Sedges = bs_edge_set - hc_edge_set
BSedges = list(Sedges)

# `withLabels` was dropped from this call: it is not an option of
# draw_networkx_nodes, and the labels are drawn by draw_networkx_labels below.
nx.draw_networkx_nodes(G, pos, node_color='b', node_size=1000, alpha=0.15)
nx.draw_networkx_edges(G, pos, edgelist=HCandBSedges, width=5, alpha=0.5, edge_color='r')
nx.draw_networkx_edges(G, pos, edgelist=HCedges, width=5, alpha=0.5, edge_color='c')
nx.draw_networkx_edges(G, pos, edgelist=BSedges, width=5, alpha=0.5, edge_color='g')
nx.draw_networkx_labels(G, pos=pos, font_size=15)
plt.title(sstt, fontsize=20)
kk = plt.axis('off')