from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
IMPORTANT: To use this notebook, you'll need to run
ipython notebook
in the same directory where the notebook and its accompanying scripts (such as tools.py) were put.
This work is licensed under a Creative Commons Attribution 4.0 International License.
# All these imports are needed for the following cells to run
import nltk
import os
import re
import itertools
from nltk.tokenize import sent_tokenize, word_tokenize
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import codecs
from textblob import TextBlob
import pandas as pd
%matplotlib inline
%load_ext autoreload
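Note: sent_tokenize/word_tokenize and TextBlob's noun-phrase extraction depend on NLTK data that is not installed by default. A minimal one-time setup sketch (the assumption that TextBlob's default extractor is trained on the Brown corpus is ours; both downloads are no-ops once the data is present):
nltk.download('punkt')   # tokenizer models used by sent_tokenize/word_tokenize
nltk.download('brown')   # corpus assumed to back TextBlob's noun-phrase extractor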
## Opening and reading the two speeches in NLTK
# t1.txt and t2.txt contain the 8 February and 11 July 2015 speeches respectively
f1 = codecs.open('t1.txt', "r", encoding="utf-8")
f2 = codecs.open('t2.txt', "r", encoding="utf-8")
HClinton=f1.read()
BSanders=f2.read()
# Keep ASCII-only copies of the speeches (characters outside ASCII are dropped)
f1Temp = codecs.open("t1_02_15.txt", "w", encoding="ascii", errors="ignore")
f2Temp = codecs.open("t2_07_15.txt", "w", encoding="ascii", errors="ignore")
f1Temp.write(HClinton)
f2Temp.write(BSanders)
f1Temp.close()
f2Temp.close()
# Noun phrases are extracted from the two speeches using TextBlob
blobHClinton = TextBlob(HClinton)
blobBSanders = TextBlob(BSanders)
nounHC=blobHClinton.np_counts
nounBS = blobBSanders.np_counts
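For reference, np_counts is a dict-like mapping from each lowercased noun phrase to its frequency. A toy illustration (the exact phrases extracted depend on TextBlob's extractor, so the output shown is indicative only):
TextBlob(u'Greece stays in Europe. Greece negotiates with Europe.').np_counts
# expected shape: {u'greece': 2, u'europe': 1}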
title1="Tsipras' 8 February 2015"
title2="Tsipras' 11 July 2015"
dfnp1=pd.DataFrame(columns=["%s noun phrases" %title1, "Frequencies"])
dfnp2=pd.DataFrame(columns=["%s noun phrases" %title2, "Frequencies"])
u=1
for l in nounHC:
    dfnp1.loc[u]=[l,nounHC[l]]
    u+=1
u=1
for l in nounBS:
    dfnp2.loc[u]=[l,nounBS[l]]
    u+=1
print "The total number of noun phrases in %s speech is %i. The following table shows the first 5 entries." %(title1,len(nounHC))
dfnp1.head(5)
# To see all the noun phrases in the first (8 February 2015) speech, display dfnp1 in full
print "The total number of noun phrases in %s speech is %i. The following table shows the first 5 entries." %(title2,len(nounBS))
dfnp2.head(5)
# To see all the noun phrases in the second (11 July 2015) speech, display dfnp2 in full
# Common noun phrases in the two speeches
common={}
ncommon={}
dfc=pd.DataFrame(columns=["Common noun phrases", title1, title2])
u=1
for l in nounHC:
    if l in nounBS:
        common[l]=min(nounHC[l],nounBS[l])
        ncommon[l]=(nounHC[l],nounBS[l])
        dfc.loc[u]=[l,nounHC[l],nounBS[l]]
        u+=1
dfc
# Remove uninformative noun phrases (forms of address) from the common set:
to_be_removed = ['prime','ladies','gentlemen']
for key in common.keys():
    if key in to_be_removed:
        del common[key]
ccommon=common.keys()
## This function counts co-occurrences of the common noun phrases in the sentences of a speech
def occurrences(source,terms):
    ALL_sentences=sent_tokenize(source)
    combinations_terms = list(itertools.combinations(terms,2))
    occurlist=[]
    for pair in combinations_terms:
        a,b=pair
        # Look for the pair in either order within a sentence; re.escape keeps
        # noun phrases containing regex metacharacters matched literally
        pat_ab=re.compile(re.escape(a)+'(.*?)'+re.escape(b), re.DOTALL | re.IGNORECASE)
        pat_ba=re.compile(re.escape(b)+'(.*?)'+re.escape(a), re.DOTALL | re.IGNORECASE)
        for sentence in ALL_sentences:
            if pat_ab.search(sentence):
                occurlist.append(pair)
            if pat_ba.search(sentence):
                occurlist.append(pair)
    # Tally the number of sentence co-occurrences per pair
    occurdict={}
    for pair in occurlist:
        occurdict[pair]=occurdict.get(pair,0)+1
    return occurdict
HCdict = occurrences(HClinton,ccommon)
BSdict = occurrences(BSanders,ccommon)
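A quick sanity check of occurrences on toy input (made-up terms): every sentence containing a pair, in either order, adds one to that pair's count.
occurrences(u'Athens talks to Europe. Europe answers Athens.', ['athens','europe'])
# -> {('athens', 'europe'): 2}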
## This function constructs the graph of common noun phrases tied by their co-occurrences in sentences of a speech
def makegraph(occurrences):
    # Nodes are the common noun phrases; edge weights are their sentence co-occurrence counts
    G = nx.Graph()
    for ed,wei in occurrences.items():
        G.add_edge(ed[0],ed[1],weight=wei)
        G.add_node(ed[0],label=ed[0])
        G.add_node(ed[1],label=ed[1])
    return G
GHC = makegraph(HCdict)
GBS = makegraph(BSdict)
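A minimal check of the graph construction on a toy co-occurrence dict (made-up keys): edge weights carry the co-occurrence counts.
toy=makegraph({('a','b'): 3, ('b','c'): 1})
toy['a']['b']['weight']
# -> 3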
# Plotting the graph of common phrases in the first (8 February 2015) speech
from tools import draw_network
try:
    pos=nx.graphviz_layout(GHC)
except Exception:
    # Fall back to a spring layout if pygraphviz is not installed
    pos=nx.spring_layout(GHC)
sstt="%s wordnet" %title1
posit=draw_network(GHC,sstt,pos=pos,with_edgewidth=True,withLabels=True,labfs=15,valpha=0.2,ealpha=0.7,labelfont=15)
# Computing centralities of common phrases in the graph of the first speech
from tools import draw_centralities_subplots
centrali=draw_centralities_subplots(GHC,pos,withLabels=False,labfs=5,figsi=(15,22),ealpha=1,vals=True)
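draw_centralities_subplots is a helper from the accompanying tools module; assuming it plots the standard networkx measures, the same values can be reproduced directly (a sketch, not the helper's actual code):
deg=nx.degree_centrality(GHC)
bet=nx.betweenness_centrality(GHC)
clo=nx.closeness_centrality(GHC)
eig=nx.eigenvector_centrality(GHC)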
# The table of centralities of common phrases in the graph of the first speech
dfchc=pd.DataFrame()
nodes=centrali[centrali.keys()[0]].keys()
dfchc.insert(0,'Common Nouns',nodes)
u=1
for i,k in centrali.items():
    # Align each measure's values with the shared node order
    dfchc.insert(u,i,[k[n] for n in nodes])
    u+=1
dfchc
# Detecting communities of common phrases in the graph of the first speech
%autoreload 2
from tools import draw_comms, modul_arity, print_communities
part,nodper=print_communities(GHC,sstt)
d=0.8
dd=0.8
c=1.2
cc=1.4
alpha=0.2
ealpha=.7
vcc={}
sstta="The %s %s Communities" %(max(part.values())+1,sstt)
draw_comms(GHC,GHC.nodes(),[],[],[],part,part,d,dd,c,cc,alpha,ealpha,nodper,sstta,titlefont=20,labelfont=15,valpha=0.5)
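print_communities and draw_comms are also project helpers. Assuming they wrap the standard Louvain method, an equivalent node-to-community partition is available from the python-louvain package (part_check is a hypothetical name); community labels are consecutive integers starting at 0, which is why max(part.values())+1 above counts the communities.
import community
part_check=community.best_partition(GHC)   # {node: community id}, edge weights respected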
# Plotting the graph of common phrases in the second (11 July 2015) speech
try:
    pos=nx.graphviz_layout(GBS)
except Exception:
    # Fall back to a spring layout if pygraphviz is not installed
    pos=nx.spring_layout(GBS)
sstt="%s wordnet" %title2
posit=draw_network(GBS,sstt,pos=pos,with_edgewidth=True,withLabels=True,labfs=15,valpha=0.2,ealpha=0.7,labelfont=15)
# Computing centralities of common phrases in the graph of the second speech
centrali=draw_centralities_subplots(GBS,pos,withLabels=False,labfs=5,figsi=(15,22),ealpha=1,vals=True)
# The table of centralities of common phrases in the graph of the second speech
dfcbs=pd.DataFrame()
nodes=centrali[centrali.keys()[0]].keys()
dfcbs.insert(0,'Common Nouns',nodes)
u=1
for i,k in centrali.items():
    # Skip any centrality measure that failed to compute (k may be None)
    try:
        dfcbs.insert(u,i,[k[n] for n in nodes])
        u+=1
    except (TypeError, KeyError):
        continue
dfcbs
# Detecting communities of common phrases in the graph of the second speech
%autoreload 2
part,nodper=print_communities(GBS,sstt)
d=0.8
dd=0.8
c=1.2
cc=1.4
alpha=0.2
ealpha=.7
vcc={}
sstta="The %s %s Communities" %(max(part.values())+1,sstt)
draw_comms(GBS,GBS.nodes(),[],[],[],part,part,d,dd,c,cc,alpha,ealpha,nodper,sstta,titlefont=20,labelfont=15,valpha=0.5)
# Plotting the combined wordnet of the two speeches
G = nx.Graph()
G.add_edges_from(GHC.edges())
G.add_edges_from(GBS.edges())
# pos=nx.graphviz_layout(G)
pos=nx.spring_layout(G)
sstt="%s vs. %s wordnet \n %s edges are cyan, %s edges are green and common edges are red" %(title1,title2,title1,title2)
plt.figure(figsize=(15,15))
# Undirected edges can be reported with their endpoints in either order,
# so normalize them as frozensets before comparing the two edge sets
HCedgeset=set(frozenset(e) for e in GHC.edges())
BSedgeset=set(frozenset(e) for e in GBS.edges())
HCandBSedges=[tuple(e) for e in HCedgeset & BSedgeset]
HCedges=[tuple(e) for e in HCedgeset - BSedgeset]
BSedges=[tuple(e) for e in BSedgeset - HCedgeset]
nx.draw_networkx_nodes(G,pos,node_color='b',node_size=1000,alpha=0.15)
nx.draw_networkx_edges(G,pos,edgelist=HCandBSedges,width=5,alpha=0.5,edge_color='r')
nx.draw_networkx_edges(G,pos,edgelist=HCedges,width=5,alpha=0.5,edge_color='c')
nx.draw_networkx_edges(G,pos,edgelist=BSedges,width=5,alpha=0.5,edge_color='g')
nx.draw_networkx_labels(G,pos=pos,font_size=15)
plt.title(sstt,fontsize=20)
kk=plt.axis('off')