IMPORTANT: To use this notebook, you'll need to run
ipython notebook
in the same directory where the notebook and its accompanying scripts were put.
This work is licensed under a Creative Commons Attribution 4.0 International License.
import random
import nltk
import codecs
from textblob import TextBlob
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import os
import re
import imp
import seaborn as sns # pip install seaborn
sns.set_style("white") # So that seaborn shows plot axes in the IPython Notebook
from lightning import Lightning
from numpy import asarray, sqrt, arctan2, pi, clip # numpy's random is left out so it does not shadow the stdlib random imported above
from seaborn import color_palette
from sklearn import datasets
from colorsys import hsv_to_rgb
import itertools
from nltk.tokenize import sent_tokenize, word_tokenize
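# NOTE: sent_tokenize and TextBlob's noun-phrase extraction rely on NLTK data
# that is not installed automatically. If a LookupError comes up below, a
# one-time download (standard NLTK corpus ids) should fix it:
# nltk.download('punkt')   # Punkt sentence tokenizer used by sent_tokenize
# nltk.download('brown')   # corpus used by TextBlob's default noun-phrase extractor
# or, for everything TextBlob needs:  python -m textblob.download_corpora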
# utilsdir='/Users/mosesboudourides/Dropbox/Python Projects/EUSN2016_LiteraryTextNetworksWorkshop/utils/'
# utilsdir='/home/mab/Dropbox/Python Projects/utils/'#tools.py'
utilsdir='/home/mab/Dropbox/Python Projects/EUSN2016_LiteraryTextNetworksWorkshop/utils/' # <-- point this to the workshop's utils/ directory on your machine
# utilsdir='/home/sergios-len/Dropbox/Python Projects (1)/EUSN2016_LiteraryTextNetworksWorkshop/utils/'
%matplotlib inline
%load_ext autoreload
# The transcript of Hillary Clinton’s Speech on National Security on 6-2-2016 was downloaded from
# https://www.hillaryclinton.com/briefing/statements/2016/06/02/transcript-hillary-clinton-delivers-major-national-security-address/ (accessed on 6-10-2016)
# The transcript of Donald Trump's Speech on National Security on 4-27-2016 was downloaded from
# http://www.nytimes.com/2016/04/28/us/politics/transcript-trump-foreign-policy.html?_r=0 (accessed on 6-10-2016)
filenameA = 'texts/HilaryClintonOnNationalScurity6-2-2016.txt'
filenameB = 'texts/DonaldTrumpOnNationalSecurity4-27-2016.txt'
titlenameA = "Hillary Clinton's Speech on National Security, 6-2-2016"
titlenameB = "Donald Trump's Speech on National Security, 4-27-2016"
ttA = "Hillary Clinton's speech"
ttB = "Donald Trump's speech"
nnA = "Hillary Clinton's noun phrases"
nnB = "Donald Trump's noun phrases"
wwA = "Hillary Clinton's Network of Common Noun Phrases"
wwB = "Donald Trump's Network of Common Noun Phrases"
fA = codecs.open(filenameA, "r", encoding="utf-8").read()
fB = codecs.open(filenameB, "r", encoding="utf-8").read()
num_linesA = 0
num_wordsA = 0
num_charsA = 0
for line in fA.splitlines(True):  # iterate over lines; iterating over the string itself would yield single characters
    words = line.split()
    num_linesA += 1
    num_wordsA += len(words)
    num_charsA += len(line)
print "%s has %i lines, %i words and %i characters" %(titlenameA,num_linesA,num_wordsA,num_charsA)
num_linesB = 0
num_wordsB = 0
num_charsB = 0
for line in fB.splitlines(True):  # iterate over lines; iterating over the string itself would yield single characters
    words = line.split()
    num_linesB += 1
    num_wordsB += len(words)
    num_charsB += len(line)
print "%s has %i lines, %i words and %i characters" %(titlenameB,num_linesB,num_wordsB,num_charsB)
blobA = TextBlob(fA)
blobB = TextBlob(fB)
npA = blobA.np_counts  # dict-like mapping of each (lowercased) noun phrase to its frequency
npB = blobB.np_counts
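# A quick peek at the five most frequent noun phrases in each speech, read
# straight off the np_counts dicts (the tables built below show the same data):
for name, np_dict in [(ttA, npA), (ttB, npB)]:
    print "%s: %s" % (name, sorted(np_dict.items(), key=lambda kv: -kv[1])[:5])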
dfnp1 = pd.DataFrame(columns=[nnA, "Frequencies"])
dfnp2 = pd.DataFrame(columns=[nnB, "Frequencies"])
u=1
for l in npA:  # one row per noun phrase in Clinton's speech
    dfnp1.loc[u]=[l,npA[l]]
    u+=1
u=1
for l in npB:  # one row per noun phrase in Trump's speech
    dfnp2.loc[u]=[l,npB[l]]
    u+=1
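# The same tables can be built in one call each (equivalent up to row order and
# index, just more idiomatic pandas):
# dfnp1 = pd.DataFrame(sorted(npA.items()), columns=[nnA, "Frequencies"])
# dfnp2 = pd.DataFrame(sorted(npB.items()), columns=[nnB, "Frequencies"])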
print "The total number of noun phrases in %s is %i" %(ttA,len(npA))
cdfnp1 = dfnp1.drop(dfnp1.index[[]])  # placeholder for dropping spurious rows by index (none dropped here)
sdfnp1 = cdfnp1.sort_values(by=['Frequencies'], ascending=False)
sdfnp1.head(10)
print "The total number of noun phrases in %s is %i" %(ttB,len(npB))
cdfnp2 = dfnp2.drop(dfnp2.index[[]])  # placeholder for dropping spurious rows by index (none dropped here)
sdfnp2 = cdfnp2.sort_values(by=['Frequencies'], ascending=False)
sdfnp2.head(10)
common={}
ncommon={}
dfc=pd.DataFrame(columns=["Common noun phrases", ttA, ttB])
u=1
for l in npA:
    if l in npB:  # keep only the noun phrases occurring in both speeches
        common[l]=min(npA[l],npB[l])
        ncommon[l]=(npA[l],npB[l])
        dfc.loc[u]=[l,npA[l],npB[l]]
        u+=1
# dfc = dfc.sort_values(by=[ttA], ascending=False)
dfc
cdfc = dfc.drop(dfc.index[[0,2,4,22,23,25,27,28,31]])  # indices of generic phrases (e.g. 'well', 'thank'), picked by inspecting the dfc table above
cdfc = cdfc.sort_values(by=[ttA], ascending=False)
cdfc
# to_be_removed = [u'well',u'thank',u'york']
# for key in common.keys():
# if key in to_be_removed:
# del common[key]
# ccommon=common.keys()
# print ccommon
ccommon=cdfc['Common noun phrases'].tolist()
def occurrences(source,terms):
    """Count, for every pair of terms, the sentences of source in which both terms co-occur."""
    ALL_sentences=sent_tokenize(source)
    combinations_terms = list(itertools.combinations(terms,2))
    n = len(combinations_terms)
    occurlist =[]
    for i in range(n):
        for j in ALL_sentences:
            temp= list(combinations_terms[i])
            # re.escape guards against regex metacharacters inside the noun phrases (e.g. dots)
            out = re.compile(re.escape(temp[0])+'(.*?)'+re.escape(temp[1]), re.DOTALL | re.IGNORECASE).findall(j)
            if out :
                occurlist.append(tuple(temp))
            out2 = re.compile(re.escape(temp[1])+'(.*?)'+re.escape(temp[0]), re.DOTALL | re.IGNORECASE).findall(j)
            if out2 :
                occurlist.append(tuple(temp))
    occurdict={}
    for i in occurlist:  # tally how many times each pair co-occurred
        if i not in occurdict:
            occurdict[i] = 1
        else:
            occurdict[i] = occurdict[i]+1
    return occurdict
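# A simpler, faster approximation of occurrences(): plain substring membership
# instead of two regex scans per sentence. Unlike the regex version it counts
# each pair at most once per sentence, so totals can differ slightly.
def occurrences_simple(source, terms):
    sentences = [s.lower() for s in sent_tokenize(source)]
    occurdict = {}
    for a, b in itertools.combinations(terms, 2):
        cnt = sum(1 for s in sentences if a.lower() in s and b.lower() in s)
        if cnt:
            occurdict[(a, b)] = cnt
    return occurdict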
HCdict = occurrences(fA,ccommon)
BSdict = occurrences(fB,ccommon)
def makegraph(occurrences):
    """Build an undirected graph whose edge weights are the pairwise co-occurrence counts."""
    G = nx.Graph()
    for ed,wei in occurrences.items():
        G.add_edge(ed[0],ed[1],weight=wei)
        G.add_node(ed[0],label=ed[0])
        G.add_node(ed[1],label=ed[1])
    return G
GHC = makegraph(HCdict)
GBS = makegraph(BSdict)
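# Basic size and density statistics of the two co-occurrence networks:
for name, g in [(wwA, GHC), (wwB, GBS)]:
    print "%s: %i nodes, %i edges, density %.3f" % (name, g.number_of_nodes(), g.number_of_edges(), nx.density(g))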
%autoreload 2
tool= imp.load_source('tools', utilsdir+'tools.py')  # also registers the module as 'tools' in sys.modules, which is what makes 'from tools import ...' work below
posHC=nx.spring_layout(GHC,scale=50,k=0.4,iterations=20)
# posHC=nx.graphviz_layout(GHC)
sstt=wwA
posit=tool.draw_network(GHC,sstt,pos=posHC,with_edgewidth=True,withLabels=True,labfs=15,valpha=0.2,ealpha=0.7,labelfont=15)
from tools import draw_centralities_subplots
centrali=draw_centralities_subplots(GHC,pos=posHC,withLabels=False,labfs=5,figsi=(15,22),ealpha=1,vals=True)
dfc=pd.DataFrame()
u=0
for i,k in centrali.items():  # one column per centrality measure returned by the helper
    dfc.insert(u,i,k.values())
    u+=1
dfc.insert(0,'Nodes',centrali[centrali.keys()[0]].keys())
dfc
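# The helper's figures can be spot-checked against networkx directly; degree and
# betweenness are assumed here to be among the measures it reports:
deg = nx.degree_centrality(GHC)
bet = nx.betweenness_centrality(GHC)
print "top degree: %s" % sorted(deg.items(), key=lambda kv: -kv[1])[:5]
print "top betweenness: %s" % sorted(bet.items(), key=lambda kv: -kv[1])[:5]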
%autoreload 2
from tools import draw_comms, modul_arity, print_communities
part,nodper=print_communities(GHC,sstt)
d=0.8
dd=0.8
c=1.2
cc=1.4
alpha=0.2
ealpha=.7
vcc={}
sstta="The %s Communities of %s" %(max(part.values())+1,sstt)
draw_comms(GHC,GHC.nodes(),[],[],[] ,part,part,d,dd,c,cc,alpha,ealpha,nodper,sstta,titlefont=20,labelfont=15,valpha=0.5)
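# For the raw partition without the workshop helper, the python-louvain package
# (pip install python-louvain) computes a modularity partition of the same kind;
# whether print_communities uses it internally is an assumption here:
# import community
# part2 = community.best_partition(GHC)
# print "%i communities, modularity %.3f" % (max(part2.values()) + 1, community.modularity(part2, GHC))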
posBS=nx.spring_layout(GBS,scale=50,k=0.4,iterations=20)
# posBS=nx.graphviz_layout(GHC)
sstt=wwB
posit=tool.draw_network(GBS,sstt,pos=posBS,with_edgewidth=True,withLabels=True,labfs=15,valpha=0.2,ealpha=0.7,labelfont=15)
centrali=draw_centralities_subplots(GBS,pos=posBS,withLabels=False,labfs=5,figsi=(15,22),ealpha=1,vals=True)
dfc=pd.DataFrame()
u=0
for i,k in centrali.items():
dfc.insert(u,i,k.values())
u+=1
dfc.insert(0,'Nodes',centrali[centrali.keys()[0]].keys())
dfc
part,nodper=print_communities(GBS,sstt)
# The drawing parameters d, dd, c, cc, alpha and ealpha set for the Clinton network above are reused here.
sstta="The %s Communities of %s" %(max(part.values())+1,sstt)
draw_comms(GBS,GBS.nodes(),[],[],[] ,part,part,d,dd,c,cc,alpha,ealpha,nodper,sstta,titlefont=20,labelfont=15,valpha=0.5)
G = nx.Graph()
G.add_edges_from(GHC.edges())
G.add_edges_from(GBS.edges())
pos=nx.spring_layout(G,scale=50,k=0.4,iterations=20)
# pos=nx.graphviz_layout(G)
sstt="Hilary Clinton vs. Donald Trump's Networks of Common Noun Phrases \n (Clinton's edges are cyan, Trump's edges are green and common edges are red)"
plt.figure(figsize=(15,15))
G1 = GHC.to_directed()  # directed copies contain both orientations of every edge,
G2 = GBS.to_directed()  # so the intersection below matches (u,v) as well as (v,u)
common_directed = set(G1.edges()).intersection(set(G2.edges()))
X=nx.Graph()
X.add_edges_from(list(common_directed))
HCandBSedges = list(X.edges())  # edges common to both speeches
HCedges = [e for e in GHC.edges() if e not in common_directed]  # Clinton-only edges
BSedges = [e for e in GBS.edges() if e not in common_directed]  # Trump-only edges
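# Cross-check of the edge classification above with orientation-free frozensets
# (it should agree with the to_directed()/intersection trick):
assert set(map(frozenset, HCandBSedges)) == set(map(frozenset, GHC.edges())) & set(map(frozenset, GBS.edges()))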
nx.draw_networkx_nodes(G,pos,node_color='b',node_size=1000,alpha=0.15)  # labels are drawn separately below; draw_networkx_nodes has no withLabels argument
# nx.draw_networkx_edges(G,pos,width=1.0,alpha=0.5)
nx.draw_networkx_edges(G,pos,edgelist=HCandBSedges,width=5,alpha=0.5,edge_color='red')
nx.draw_networkx_edges(G,pos,edgelist=HCedges,width=5,alpha=0.5,edge_color='c')
nx.draw_networkx_edges(G,pos,edgelist=BSedges,width=5,alpha=0.5,edge_color='g')
nx.draw_networkx_labels(G,pos=pos,font_size=15)
plt.title(sstt,fontsize=20)
kk=plt.axis('off')  # assigned to a throwaway variable to suppress the cell's text output
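# To keep the comparison figure as an image file (optional; the filename is just an example):
# plt.savefig('common_noun_phrase_networks.png', dpi=150, bbox_inches='tight')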