IMPORTANT: To use this notebook, you'll need to run
ipython notebook
in the same directory where the notebook and scripts were put.
This work is licensed under a Creative Commons Attribution 4.0 International License.
import random
import nltk
import codecs
from textblob import TextBlob
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import os
import imp
# Directory holding the helper scripts that are loaded further down with
# imp.load_source (tools.py and trajectories_t.py are appended to this path).
# NOTE(review): machine-specific absolute path -- point it at your own copy
# of the utils directory before running.
# utilsdir='/home/sergios-len/Dropbox/Python Projects (1)/utils/'#tools.py'
utilsdir='/home/mab/Dropbox/Python Projects/utils/'#tools.py'
%matplotlib inline
%load_ext autoreload
# Input text file and the title used in printed messages and plot titles.
filename = 'Hamlet: Entire Play.txt'
# Spelling fixed ("Shakespeare's") so the title agrees with the
# "Shakespeare's Hamlet selected terms" column key renamed later in this file.
titlename = "Shakespeare's Hamlet"

# First pass over the play: count lines, words and characters, and collect
# the raw speaker labels, i.e. the content of lines wrapped in asterisks.
num_lines = 0
num_words = 0
num_chars = 0
names = set()
with open(filename) as f:  # context manager closes the handle after the pass
    for line in f:
        num_lines += 1
        num_words += len(line.split())
        num_chars += len(line)

        lin = line.strip()
        # Only lines that both start and end with '*' carry speaker labels.
        if not (lin.startswith('*') and lin.endswith('*')):
            continue
        lin = lin[1:-1]
        if '*' in lin:
            # Several starred labels on one line: split on whitespace and
            # peel the remaining asterisks and any trailing colon.
            names.update(tok.strip('*').strip(':') for tok in lin.split())
        else:
            names.add(lin)

# Parenthesised single-argument form behaves the same on Python 2 and 3.
print("%s has number of words = %i and number of characters = %i" % (titlename, num_words, num_chars))
# Read the whole play as one unicode string.  The context manager closes the
# handle, which the original one-liner left open.
# NOTE(review): `f` is rebound to a fresh file object further down before this
# text is used anywhere in the visible code -- confirm it is still needed.
with codecs.open(filename, "r", encoding="utf-8") as play_file:
    f = play_file.read()
# "names" holds the labels found on lines of the text that are wrapped in
# asterisks (collected in the first pass above).
list(names)
# Maps every speaker label found in the text to the canonical character (or
# group) it should be counted as.  The case of the values is irrelevant:
# ndici below lower-cases and then capitalizes them.
dici = {
    'GUILDENSTERN': 'GUILDENSTERN',
    'First Priest': 'Priest',
    'Priest': 'Priest',
    'LUCIANUS': 'Players',
    'Players': 'Players',
    'OSRIC': 'Osric',
    'PRINCE FORTINBRAS': 'FORTINBRAS',
    'FORTINBRAS': 'FORTINBRAS',
    'First Ambassador': 'Ambassadors',
    'Ambassadors': 'Ambassadors',
    'MARCELLUS': 'Marcellus',  # was misspelled 'Marcelius'
    'ROSENCRANTZ': 'ROSENCRANTZ',
    'Captain': 'Captain',
    'Player King': 'Players',
    'Ghost': 'Ghost',
    'First Sailor': 'Sailors',
    'Sailor': 'Sailors',
    'HORATIO': 'Horatio',
    'BERNARDO': 'Bernardo',
    'Servant': 'Servant',
    'LAERTES': 'Laertes',
    'HAMLET': 'Hamlet',
    'Lord Hamlet': 'Hamlet',
    'Second Clown': 'Clowns',
    'First Player': 'Players',
    'First Clown': 'Clowns',
    'FRANCISCO': 'Francisco',
    'LORD POLONIUS': 'Polonius',
    'OPHELIA': 'Ophelia',
    'Danes': 'Danes',
    'REYNALDO': 'Reynaldo',
    'VOLTIMAND': 'VOLTIMAND',
    'KING CLAUDIUS': 'Claudius',
    'Claudius': 'Claudius',
    'Player Queen': 'Players',
    'QUEEN GERTRUDE': 'Gertrude',
    'Gertrude': 'Gertrude',
    'Messenger': 'Messenger',
    'CORNELIUS': 'Cornelius',
}

# Lower-cased label -> display name ("Hamlet", "Players", ...).
ndici = {label.lower(): target.lower().capitalize() for label, target in dici.items()}

# (first word, second word) of every multi-word label, e.g. ('lord', 'hamlet').
dnici = [tuple(label.split()[:2]) for label in ndici if len(label.split()) > 1]

# Snapshot of the lower-cased labels; a real list on both Python 2 and 3.
selectedTerms = list(ndici)

# One independent, initially empty dict per label; filled by the speech pass.
new_text = {label: {} for label in selectedTerms}
def _flush_speech(speakers, lines):
    """Store the collected speech under every (label, line index) pair."""
    if speakers and lines:
        # Join with a space so the last word of one text line is not fused
        # with the first word of the next one.
        text = ' '.join(lines)
        for label, idx in speakers:
            new_text[label][idx] = text


# Second pass over the play: attach each speech to the speaker label(s) that
# introduced it.  new_text[label][u] receives the speech text, where u is the
# value of the line counter at the speaker line.
chars = []          # (lower-cased label, line index) of the current speakers
speech_lines = []   # non-empty text lines of the speech being read
u = 0               # line counter; lines starting with '/' are not counted
with open(filename) as f:  # closes the handle the original left open
    for lin in f:
        lin = lin.strip()
        if '*' in lin:
            # A line containing '*' starts a new speech: store the previous
            # one, then look for known labels between the asterisks.
            _flush_speech(chars, speech_lines)
            chars = [(part.lower(), u)
                     for part in lin.split('*')
                     if part.lower() in selectedTerms]
            speech_lines = []
        elif lin.startswith('/'):
            # Lines starting with '/' are skipped entirely (and not counted).
            continue
        elif chars and lin:
            speech_lines.append(lin)
        u += 1

# The original loop never stored the speech that was still being collected
# when the file ended; flush it here.
_flush_speech(chars, speech_lines)
%autoreload 2
# Load the project's helper module tools.py from utilsdir as `tool`.
tool = imp.load_source('tools', utilsdir + 'tools.py')
create_pandas_dataframe_from_text_par = tool.create_pandas_dataframe_from_text_par
create_coo_graph = tool.create_coo_graph

# Build the per-term frequency table (dfst), the sentence/character graph
# (sec_prot), the co-occurrence and occurrence lists and the per-sentence
# frame (dflines) from the collected speeches.
dfst, sec_prot, coccurlist, occurlist, dflines = create_pandas_dataframe_from_text_par(
    new_text, selectedTerms, ndici, titlename)
co_graph = create_coo_graph(coccurlist)

# NOTE(review): rename() silently does nothing when the key is absent; the
# column name is presumably "<titlename> selected terms" -- confirm that the
# spelling of titlename agrees with this key.
dfst.rename(columns={"Shakespeare's Hamlet selected terms": "Shakespeare's Hamlet"}, inplace=True)
# dfst.rename(columns={"Arthur Conan Doyle's A Study in Scarlet selected terms":"Arthur Conan Doyle's A Study in Scarlet Characters"},inplace=True)

# Show the terms by descending frequency.  DataFrame.sort() no longer exists
# in pandas; a single sort_values call gives the same ordering.
dfst.sort_values(by='Frequencies', ascending=False)
# Per-sentence table of characters, polarity and subjectivity, restricted to
# sentences that contain more than `cuts` characters.
prot_pol_sub = dflines[['narrator', 'protagonists', '#_of_protagonists', 'polarity', 'subjectivity']].reset_index()
prot_pol_sub['sentence_id'] = prot_pol_sub.index
prot_pol_sub = prot_pol_sub[['sentence_id', 'narrator', 'protagonists', '#_of_protagonists', 'polarity', 'subjectivity']]

cuts = 1
prot_pol_sub = prot_pol_sub[prot_pol_sub['#_of_protagonists'] > cuts]

# Flat list of every character occurrence in the retained sentences.
lp = prot_pol_sub['protagonists'].tolist()
lpn = [character for sentence_chars in lp for character in sentence_chars]
# len(set(lpn))

# Parenthesised single-argument form behaves the same on Python 2 and 3.
print("The total number of sentences in %s with at least %i characters in each one of them is %i." % (titlename, cuts + 1, len(prot_pol_sub)))

# Rebind instead of inplace=True: prot_pol_sub is a filtered slice, and an
# in-place rename on it triggers pandas' chained-assignment warning.
prot_pol_sub = prot_pol_sub.rename(columns={'protagonists': 'Lists_of_Characters',
                                            '#_of_protagonists': '#_of_Characters',
                                            'polarity': 'Polarity',
                                            'subjectivity': 'Subjectivity'})
# DataFrame.sort() was removed from pandas; sort_values is the replacement.
prot_pol_sub.sort_values(by="#_of_Characters", ascending=False)

# Same table without the helper column; axis is passed by keyword because the
# positional form of drop() is no longer accepted by pandas.
ddff = prot_pol_sub.drop('sentence_id', axis=1)
ddff.index.name = 'Sentence_ID'
ddff
prot_pol_sub[['#_of_Characters', 'Polarity', 'Subjectivity']].describe()
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
def _add_hist_inset(parent_ax, data, zoom, loc, xlim, ylim):
    """Add a zoomed inset histogram of `data` to `parent_ax`; return the inset axes.

    `zoom` and `loc` are forwarded to zoomed_inset_axes; `xlim` and `ylim`
    are (low, high) pairs selecting the region of the parent plot to magnify.
    """
    inset = zoomed_inset_axes(parent_ax, zoom, loc=loc)
    # Limits are fixed before plotting, as in the original cell.
    inset.set_xlim(*xlim)
    inset.set_ylim(*ylim)
    data.plot.hist(ax=inset)
    inset.set_ylabel('Frequency')
    mark_inset(parent_ax, inset, loc1=2, loc2=4, fc="none", ec="0.5")
    return inset


# Histogram of the number of characters per sentence, with three magnified
# insets around x = 3, x = 2 and x = 4.
ndfl = dflines[dflines['#_of_protagonists'] > 0]
fig, ax = plt.subplots(figsize=[12, 10])
dflines['#_of_protagonists'].plot.hist(ax=ax)
ax.set_xlabel('#_of_Characters')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of # of characters')

# The insets use only the sentences with at least one character.
nonzero_counts = ndfl['#_of_protagonists']
axes2 = _add_hist_inset(ax, nonzero_counts, 12, 10, (3, 3.1), (0, 30))
axes3 = _add_hist_inset(ax, nonzero_counts, 10, 6, (2, 2.1), (0, 60))
# The original labelled axes3 a second time here and left axes4 unlabelled.
axes4 = _add_hist_inset(ax, nonzero_counts, 10, 7, (3.9, 4), (0, 20))
plt.show()
%autoreload 2
import math

draw_network_node_color = tool.draw_network_node_color
sstt = "%s Two-Mode Network of Sentences and Characters" % titlename

# Start from a spring layout, then overwrite every coordinate by hand:
# integer-labelled nodes go on two concentric arcs per quadrant, all other
# nodes on the vertical axis through the centre.
pos = nx.spring_layout(sec_prot)
nds = [nd for nd in sec_prot.nodes() if isinstance(nd, int)]
int_nodes = set(nds)  # set membership instead of scanning the list per node
prot = [nd for nd in sec_prot.nodes() if nd not in int_nodes]

qua = len(nds) / 4.
# Sign of (x, y) in quadrants 0..3, visited counter-clockwise.
quadrant_signs = ((1, 1), (-1, 1), (-1, -1), (1, -1))
for en, nd in enumerate(nds):
    # The original used strict '>' comparisons between branches, so a node
    # sitting exactly on a quadrant boundary (en == qua or en == 2*qua) fell
    # through to the last branch; computing the quadrant directly avoids that.
    quadrant = min(int(en / qua), 3)
    # Alternate between the outer (1.0) and inner (0.8) radius.
    rr = 1. if en % 2 == 0 else .8
    ens = en - quadrant * qua
    # NOTE(review): 190*72 is an unexplained constant from the original,
    # presumably tuned to the size of this graph -- confirm before reuse.
    the = math.pi * ens / (190 * 72. / len(nds))
    sign_x, sign_y = quadrant_signs[quadrant]
    pos[nd][0] = sign_x * rr * math.cos(the)
    pos[nd][1] = sign_y * rr * math.sin(the)

# Remaining nodes: evenly spaced on the line x = 0, starting at y = -0.95.
for en, nd in enumerate(prot):
    pos[nd][0] = 0
    pos[nd][1] = -0.95 + en * 2. / len(prot)

# Same graph drawn twice: nodes coloured by polarity, then by subjectivity.
possit = draw_network_node_color(sec_prot, sstt, pos=pos, with_edgewidth=False, withLabels=True, labfs=10, valpha=0.2,
                                 ealpha=0.3, labelfont=5, with_node_weight=False, node_size_fixer=10., node_col='polarity',
                                 node_size_def=None)
possit = draw_network_node_color(sec_prot, sstt, pos=pos, with_edgewidth=False, withLabels=True, labfs=12, valpha=0.2,
                                 ealpha=0.3, labelfont=15, with_node_weight=False, node_size_fixer=300.,
                                 node_col='subjectivity', colormat='Greens')
%autoreload 2
draw_network = tool.draw_network
make_graph_from_lists = tool.make_graph_from_lists

# Build the character graph from the per-sentence character lists and their
# polarity, character-count and subjectivity values.
plist = prot_pol_sub['Lists_of_Characters'].tolist()
pplist = prot_pol_sub['Polarity'].tolist()
nplist = prot_pol_sub['#_of_Characters'].tolist()
splist = prot_pol_sub['Subjectivity'].tolist()
G = make_graph_from_lists(plist, pplist, nplist, splist)

# One layout shared by every drawing of G below.
posg = nx.spring_layout(G, scale=50, k=0.55, iterations=20)
# posg=nx.spring_layout(G,scale=50)#,k=0.55)#,iterations=20)

sstt = "%s Network of Selected Characters \n(Sentences colored in polarity)" % titlename
possit = draw_network(G, sstt, pos=posg, with_edgewidth=True, withLabels=True, labfs=15, valpha=0.2, ealpha=0.7, labelfont=15,
                      with_edgecolor=True, edgecolor='polarity', colormat='Blues')
sstt = "%s Network of Selected Characters \n(Sentences Colored in Subjectivity)" % titlename
possit = draw_network(G, sstt, pos=posg, with_edgewidth=True, withLabels=True, labfs=15, valpha=0.2, ealpha=0.7, labelfont=15,
                      with_edgecolor=True, edgecolor='subjectivity', colormat='Greys')

draw_centralities_subplots = tool.draw_centralities_subplots
centrali = draw_centralities_subplots(G, pos=posg, withLabels=False, labfs=5, figsi=(15, 22), ealpha=1, vals=True)

# Table with one row per node and one column per centrality measure.
# centrali is used as a dict of {measure: {node: value}}.  Values are looked
# up per node so every column is aligned with 'Nodes' regardless of the
# iteration order of the individual dicts, and list(...) replaces the
# Python-2-only centrali.keys()[0] indexing.
measure_names = list(centrali)
node_order = list(centrali[measure_names[0]]) if measure_names else []
dfc = pd.DataFrame({'Nodes': node_order}, columns=['Nodes'])
for measure in measure_names:
    scores = centrali[measure]
    dfc[measure] = [scores[node] for node in node_order]
dfc
%autoreload 2
# Bind the community helpers from the tools module.
draw_comms, modul_arity, print_communities = (
    tool.draw_comms, tool.modul_arity, tool.print_communities)

# Detect and print the communities of the character graph G.
part, nodper = print_communities(G, sstt)

has_characters = dflines['#_of_protagonists'] > 0
ndfl = dflines[has_characters]

# Positional drawing parameters handed to draw_comms below.
d = dd = 0.8
c, cc = 1.2, 1.4
alpha = ealpha = 0.2
vcc = dict()

# The largest value in `part` plus one is reported as the community count.
community_count = max(part.values()) + 1
sstta = "The %s Communities of %s Network of Characters" % (community_count, titlename)
draw_comms(G, G.nodes(), [], [], [], part, part, d, dd, c, cc, alpha, ealpha, nodper, sstta,
           titlefont=20, labelfont=17, valpha=0.5)
%autoreload 2
# Load the trajectory helpers (trajectories_t.py inside utilsdir) as module `trj`.
trj= imp.load_source('trajectories', utilsdir+'trajectories_t.py')
def search_in_list(x, name='Hamlet'):
    """Return True when `name` is among the protagonists of row `x`.

    `x` is anything indexable by the key 'protagonists' (for example a
    DataFrame row handed over by DataFrame.apply with axis=1).  `name`
    defaults to 'Hamlet', so existing one-argument calls keep their meaning.
    """
    return name in x['protagonists']
# Trajectory analysis for one character: keep the sentences with more than
# one character in which search_in_list finds the character, then hand their
# character lists, polarity and subjectivity values to the trajectory helpers.
search_name = 'Hamlet'
qq, figi = 0, None

ndfl = dflines[dflines['#_of_protagonists'] > 1]
row_matches = ndfl.apply(search_in_list, axis=1)
dialogs = ndfl[row_matches == True]

protagonists = dialogs['protagonists'].tolist()
polarities = dialogs['polarity'].tolist()
subj = dialogs['subjectivity'].tolist()

# Consecutive 1-based (start, end) index pairs, one per retained sentence.
n_dialogs = len(protagonists)
start = range(1, n_dialogs + 1)
end = range(2, n_dialogs + 2)

G, ndls, pold, subjd = trj.creatTestGraph_pandas_bips(
    start, end, protagonists, search_name, polarities, subj)
trajpdfs = trj.main_work_search_name(
    G, ndls, qq, figi, search_name, verb=False, plot_first_mode=False)
trajpdfs["['Hamlet']"]
%autoreload 2
# Draw the trajectory graph of the current search_name with igraph.
import igraph as ig
igraph_draw_traj=tool.igraph_draw_traj
# GraphML file named after search_name.
# NOTE(review): presumably written by the trajectory helpers called earlier -- confirm.
filname='S_out_graphs/%s_graph.graphml' %search_name
# First drawing: built from the file and pold; also yields the layout.
g,visual_style,layout=igraph_draw_traj(filname,pold)
ig.plot(g, **visual_style)
# Second drawing: built from subjd with polar=False, reusing the layout
# returned by the first call, so the statement order matters.
g,visual_style,layout=igraph_draw_traj(filname,subjd,polar=False,layout=layout)
ig.plot(g, **visual_style)
def search_in_list(x, name='Gertrude'):
    """Return True when `name` is among the protagonists of row `x`.

    `x` is anything indexable by the key 'protagonists' (for example a
    DataFrame row handed over by DataFrame.apply with axis=1).  `name`
    defaults to 'Gertrude', so existing one-argument calls keep their meaning.
    """
    return name in x['protagonists']
# Trajectory analysis for one character: keep the sentences with more than
# one character in which search_in_list finds the character, then hand their
# character lists, polarity and subjectivity values to the trajectory helpers.
search_name = 'Gertrude'
qq, figi = 0, None

ndfl = dflines[dflines['#_of_protagonists'] > 1]
row_matches = ndfl.apply(search_in_list, axis=1)
dialogs = ndfl[row_matches == True]

protagonists = dialogs['protagonists'].tolist()
polarities = dialogs['polarity'].tolist()
subj = dialogs['subjectivity'].tolist()

# Consecutive 1-based (start, end) index pairs, one per retained sentence.
n_dialogs = len(protagonists)
start = range(1, n_dialogs + 1)
end = range(2, n_dialogs + 2)

G, ndls, pold, subjd = trj.creatTestGraph_pandas_bips(
    start, end, protagonists, search_name, polarities, subj)
trajpdfs = trj.main_work_search_name(
    G, ndls, qq, figi, search_name, verb=False, plot_first_mode=False)
trajpdfs["['Gertrude']"]
%autoreload 2
# Draw the trajectory graph of the current search_name with igraph.
# Relies on igraph_draw_traj having been bound from the tools module earlier.
import igraph as ig
# GraphML file named after search_name.
# NOTE(review): presumably written by the trajectory helpers called earlier -- confirm.
filname='S_out_graphs/%s_graph.graphml' %search_name
# Drop the reference to the previously drawn graph before loading the new one.
g=None
# First drawing: built from the file and pold; also yields the layout.
g,visual_style,layout=igraph_draw_traj(filname,pold)
ig.plot(g, **visual_style)
# Second drawing: built from subjd with polar=False, reusing the layout
# returned by the first call, so the statement order matters.
g,visual_style,layout=igraph_draw_traj(filname,subjd,polar=False,layout=layout)
ig.plot(g, **visual_style)