The Network among Sentential Co-Occurrences

of Noun Phrases in

Kirby's "The Hero of Esthonia"

By the DH in Estonia 2016 Conference Team

from IPython.display import HTML

# Embed a small script plus a button in the notebook output. The script
# hides every `div.input` element once the document is ready (the first
# call to code_toggle runs with code_show == true), and the submit button
# calls code_toggle again to flip the inputs between hidden and shown.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

Importing Python Modules

import random
import nltk
import codecs
from textblob import TextBlob
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import imp
from ipywidgets import widgets
import warnings
# Install a global "ignore" filter so no Python warnings are shown.
warnings.filterwarnings('ignore')

# IPython magics: use the inline matplotlib backend and load the autoreload
# extension (the `%autoreload 2` lines in later cells rely on it).
%matplotlib inline 
%load_ext autoreload

I. Importing the Text of Kirby's The Hero of Esthonia

filename = 'Kirby1.txt'
titlename = "Kirby's The Hero of Esthonia"

# Read the whole text as a list of unicode lines. The context manager closes
# the file handle as soon as the lines are in memory.
with codecs.open(filename, "r", encoding="utf-8") as text_file:
    f = text_file.readlines()

# Basic size statistics: whitespace-separated words and raw characters
# (line terminators included, since readlines() keeps them).
num_lines = len(f)
num_words = sum(len(line.split()) for line in f)
num_chars = sum(len(line) for line in f)
print("%s has number of words = %i (and number of characters/symbols = %i)" % (titlename, num_words, num_chars))

# NOTE(review): every element of f still ends with its own newline, so joining
# with "\n" puts an extra newline between consecutive lines. Kept unchanged so
# that the sentence segmentation downstream sees the same text as before.
blob = TextBlob("\n".join(f))

Kirby's The Hero of Esthonia has number of words = 27692 (and number of characters/symbols = 152768)

II. Extracting the Most Frequent Noun Phrases in Kirby's Kalevipoeg

# nltk.download()

# Tally how often each noun phrase is reported across all sentences of the
# text: every sentence contributes the entries of its 'noun_phrases' list.
all_sents = blob.sentences
occurdic = Counter()
for sentence in all_sents:
    occurdic.update(sentence.dict['noun_phrases'])

# One row per distinct noun phrase together with its number of occurrences.
# The frame is built in a single constructor call instead of appending rows
# one at a time through df.loc; the index stays 1-based as before.
df = pd.DataFrame(list(occurdic.items()), columns=["%s Noun Phrases" % titlename, "Frequencies"])
df.index = range(1, len(df) + 1)

print("The total number of noun phrases in %s is %i." % (titlename, len(df)))
# sort_values replaces DataFrame.sort, which newer pandas no longer provides.
df.sort_values(by="Frequencies", ascending=False)

# Keep only the noun phrases that occur more than `cut` times, most
# frequent first.
cut = 10
df = df[df['Frequencies'] > cut].sort_values(by="Frequencies", ascending=False)
print("The total number of noun phrases in %s with frequencies > %i is %i." % (titlename, cut, len(df)))
df.sort_values(by="Frequencies", ascending=False)

The total number of noun phrases in Kirby's The Hero of Esthonia is 1191.
The total number of noun phrases in Kirby's The Hero of Esthonia with frequencies > 10 is 17.

excluded = ['runo','footnote']

%autoreload 2

selectedTerms={}
# excluded = ['who','will','exactly','enough','shall','suppose','well']
for k in df["Kirby's The Hero of Esthonia Noun Phrases"].tolist(): #df["Plato's Phaedrus Noun Phrases"].tolist():
    if k not in excluded:
        selectedTerms[k] = k.capitalize()

# tool= imp.load_source('tools', utilsdir+'/tools.py')
import tools as tool
create_pandas_dataframe_from_text=tool.create_pandas_dataframe_from_text  
dfst,sec_prot,coccurlist,occurlist,dflines=create_pandas_dataframe_from_text(blob,selectedTerms,selectedTerms,titlename)
# print len(sec_prot.nodes()), sec_prot.nodes()
# dfst.sort_values(by='Frequencies').sort(["Frequencies"], ascending=[0])

# Per-sentence table: the selected noun phrases found in the sentence, how
# many there are, and the sentence's polarity and subjectivity columns.
prot_pol_sub = dflines[['protagonists', '#_of_protagonists', 'polarity', 'subjectivity']].reset_index()
prot_pol_sub['sentence_id'] = prot_pol_sub.index
prot_pol_sub = prot_pol_sub[['sentence_id', 'protagonists', '#_of_protagonists', 'polarity', 'subjectivity']]

# Keep only sentences holding more than `cuts` selected noun phrases,
# i.e. at least cuts + 1 of them.
cuts = 1
prot_pol_sub = prot_pol_sub[prot_pol_sub['#_of_protagonists'] > cuts]
print("The total number of sentences in %s with at least %i selected noun phrases in each one of them is %i." % (titlename, cuts + 1, len(prot_pol_sub)))
prot_pol_sub.rename(columns={'protagonists': 'list_of_selected_noun_phrases',
                             '#_of_protagonists': '#_of_selected_noun_phrases'},
                    inplace=True)

# The former `prot_pol_sub.sort(...)` call here discarded its result, so the
# table was never reordered; it is dropped rather than made effective so the
# row order (by sentence) stays the same for the cells that follow.

# Display copy: sentence_id moves from a column to the name of the index.
ddff = prot_pol_sub.drop('sentence_id', axis=1)
ddff.index.name = 'sentence_id'
ddff

The total number of sentences in Kirby's The Hero of Esthonia with at least 2 selected noun phrases in each one of them is 67.

from mpl_toolkits.axes_grid1.inset_locator import mark_inset, zoomed_inset_axes


def _fill_inset(parent, inset, counts, xlim, ylim):
    """Draw the histogram of *counts* in *inset*, cropped to *xlim*/*ylim*,
    and connect the cropped region to the inset on *parent*."""
    inset.set_xlim(*xlim)
    inset.set_ylim(*ylim)
    counts.plot.hist(ax=inset)
    inset.set_ylabel('Frequency')
    mark_inset(parent, inset, loc1=2, loc2=4, fc="none", ec="0.5")


# Sentences that contain at least one selected noun phrase (used by insets).
ndfl = dflines[dflines['#_of_protagonists'] > 0]

fig, ax = plt.subplots(figsize=[12, 10])
# First inset (6x zoom), anchored at location code 5.
axes2 = zoomed_inset_axes(ax, 6, loc=5)

# Main histogram over all sentences.
dflines['#_of_protagonists'].plot.hist(ax=ax)
ax.set_title('Histogram of # of noun phrases')
# NOTE(review): the x label says "Characters" while the title says "noun
# phrases"; the label text is kept as it was.
ax.set_xlabel('#_of_Characters')
ax.set_ylabel('Frequency')

# Zoom on the bar near x = 3.
_fill_inset(ax, axes2, ndfl['#_of_protagonists'], (2.9, 3.), (0, 25))

# Second inset (6x zoom), anchored at location code 10: the bar near x = 2.
axes3 = zoomed_inset_axes(ax, 6, loc=10)
_fill_inset(ax, axes3, ndfl['#_of_protagonists'], (2, 2.05), (0, 50))

plt.show()

%autoreload 2

draw_network_node_color=tool.draw_network_node_color
sstt="%s Two-Mode Network of Sentences and Selected Noun Phrases" %titlename
pos=nx.spring_layout(sec_prot)
# pos=DefaultDict
nds=[nd for nd in sec_prot.nodes() if isinstance(nd,int)]
prot=[nd for nd in sec_prot.nodes() if nd not in nds]
# ncont=dict(control_dic)
# print ncont
# ncont[u'Midas']=0
# prot.append('Midas')
# protag=[nd for nd in prot if int(ncont[nd])==0 ]
# pos={nd:{0:0,1:0} for nd in sec_prot}
# nprotag=[nd for nd in prot if  int(ncont[nd])==1 or int(ncont[nd])==2]
# print protag
# print nprotag
for en,nd in enumerate(nds):
    if en<len(nds)/2.:
        pos[nd][0]=-1
        pos[nd][1]=en*2./len(nds)
    else:
        pos[nd][0]=1
        pos[nd][1]=(en-len(nds)/2.)*2./len(nds)
for en ,nd in enumerate(prot):
    pos[nd][0]=0
    pos[nd][1]=en*1./len(prot)
# for en ,nd in enumerate(protag):
#     pos[nd][0]=.5
#     pos[nd][1]=en*1./len(protag)
# for en ,nd in enumerate(nprotag):
#     pos[nd][0]=-.5
#     pos[nd][1]=en*1./len(nprotag)
# print pos    
possit=draw_network_node_color(sec_prot,sstt,pos=pos,with_edgewidth=False,withLabels=True,labfs=12,valpha=0.2,
                               ealpha=0.4,labelfont=15,with_node_weight=False,node_size_fixer=300.,node_col='polarity')

possit=draw_network_node_color(sec_prot,sstt,pos=pos,with_edgewidth=False,withLabels=True,labfs=12,valpha=0.2,
                               ealpha=0.4,labelfont=15,with_node_weight=False,node_size_fixer=300.,
                               node_col='subjectivity',colormat='Greens')

III. Constructing the Network of Sententially Co-Occurring Noun Phrases in Kirby's Kalevipoeg

%autoreload 2

plist = prot_pol_sub['list_of_selected_noun_phrases'].tolist()
pplist=prot_pol_sub['polarity'].tolist()
nplist=prot_pol_sub['#_of_selected_noun_phrases'].tolist()
splist=prot_pol_sub['subjectivity'].tolist()

G = tool.make_graph_from_lists(plist,pplist,nplist,splist)
# print G.nodes(data=True)
# for nd in G.nodes():
#     G.add_node(nd,type=control_dic[nd])
posg=nx.spring_layout(G,scale=50)#,k=0.55)#,iterations=20)
# nodescolor={}
# npos={}
# d=40
# for nd in G.nodes():
#     opos=posg[nd]

#     if nd in protag:
#         nodescolor[nd]='r'
#         npos[nd]=[opos[0],opos[1]-d]
#     else:
#         nodescolor[nd]='g'
#         npos[nd]=[opos[0],opos[1]+d]
# npos={v:k for v,k in pos.items() if v in G}        

# sstt="%s Network of Selected Noun Phrases \n(Assortativity coefficient of Persons - Entities = %.4f)" %(titlename,nx.attribute_assortativity_coefficient(G,'type'))
sstt="%s Network of Selected Noun Phrases \n(Sentences colored in polarity)" %titlename
possit=tool.draw_network(G,sstt,pos=posg,with_edgewidth=True,withLabels=True,labfs=15,valpha=0.2,ealpha=0.7,labelfont=15,
                   with_edgecolor=True,edgecolor='polarity',colormat='Blues') #npos ,node_col=nodescolor False

sstt="%s Network of Selected Noun Phrases \n(Sentences colored in subjectivity)" %titlename
possit=tool.draw_network(G,sstt,pos=posg,with_edgewidth=True,withLabels=True,labfs=15,valpha=0.2,ealpha=0.7,labelfont=15,
                   with_edgecolor=True,edgecolor='subjectivity',colormat='Greys')

IV. Centralities of Nodes in the Network of Sententially Co-Occurring Noun Phrases in Kirby's Kalevipoeg

# Draw the centrality subplots for G on the shared posg layout and keep the
# helper's return value; the table cell below iterates over it with .items()
# and reads each value with .values()/.keys(), i.e. as a dict of dicts.
centrali=tool.draw_centralities_subplots(G,pos=posg,withLabels=False,labfs=5,figsi=(15,22),ealpha=1,vals=True)

The table of Centralities of Nodes in the Network of Sententially Co-Occurring Noun Phrases in Kirby's Kalevipoeg

# Turn the centrality results into one table: a row per node, a column per
# measure.
measures = list(centrali.items())
# Node order is taken from the first measure; every other measure is then
# looked up by node, so rows stay aligned even if the per-measure dicts
# iterate in different orders.
nodes = list(measures[0][1])

dfc = pd.DataFrame()
for position, (measure, scores) in enumerate(measures):
    dfc.insert(position, measure, [scores[node] for node in nodes])
dfc.insert(0, 'Nodes', nodes)

# NOTE(review): the names below are assigned by position, which assumes that
# centrali iterates its measures in exactly this order - confirm against
# tool.draw_centralities_subplots.
dfc.columns = ['Nodes', 'Closeness_Centrality', 'Katz_Centrality', 'Betweenness_Centrality',
               'PageRank', 'Eigenvector_Centrality', 'Degree_Centrality']
dfc = dfc[['Nodes', 'Degree_Centrality', 'Closeness_Centrality', 'Betweenness_Centrality',
           'Eigenvector_Centrality', 'Katz_Centrality', 'PageRank']]

# Highest betweenness first, ties broken by closeness. sort_values replaces
# DataFrame.sort, which newer pandas no longer provides.
dfc.sort_values(by=['Betweenness_Centrality', 'Closeness_Centrality'], ascending=[False, False])

V. Communities of Nodes in the Network of Sententially Co-Occurring Noun Phrases in Kirby's Kalevipoeg

%autoreload 2

part,nodper=tool.print_communities(G,sstt)

d=0.8 
dd=0.8
c=1.2
cc=1.4
alpha=0.2
ealpha=0.2
vcc={}
sstta="The %s Communities of %s Network of Selected Noun Phrases" %(max(part.values())+1,titlename)#sstt)

tool.draw_comms(G,G.nodes(),[],[],[] ,part,part,d,dd,c,cc,alpha,ealpha,nodper,sstta,titlefont=20,labelfont=17,valpha=0.5)

Number of communities of Kirby's The Hero of Esthonia Network of Selected Noun Phrases 
(Sentences colored in subjectivity) = 3
Community partition of Kirby's The Hero of Esthonia Network of Selected Noun Phrases 
(Sentences colored in subjectivity):
[[u'P\xf5rgu', u'Olev', u'Alevide', u'Finland', u'Sarvik', u'Esthonia', u'Taara', u'T\xfchi', u'Kalevide'], [u'Salme', u'Old woman', u'Kalev', u'Linda'], [u'Finnish', u'Esthonian']]
Community modularity of Kirby's The Hero of Esthonia Network of Selected Noun Phrases 
(Sentences colored in subjectivity) = 0.3560

	Kirby's The Hero of Esthonia Noun Phrases	Frequencies
570	kalevide	162.0
539	footnote	95.0
629	kalev	50.0
592	sarvik	44.0
567	linda	39.0
439	finland	21.0
865	alevide	19.0
1035	taara	19.0
851	esthonia	18.0
821	salme	17.0
676	runo	14.0
162	esthonian	14.0
1026	põrgu	13.0
208	olev	11.0
540	old woman	11.0
270	tühi	11.0
957	finnish	11.0

	list_of_selected_noun_phrases	#_of_selected_noun_phrases	polarity	subjectivity
sentence_id
2	[Kalev, Esthonia]	2.0	0.325000	0.437500
11	[Linda, Salme]	2.0	0.100000	0.500000
33	[Linda, Salme]	2.0	0.475000	0.900000
50	[Kalev, Linda]	2.0	0.106250	0.500000
51	[Linda, Salme, Esthonia]	3.0	0.237500	0.377778
54	[Linda, Salme]	2.0	0.650000	0.350000
55	[Kalev, Linda, Salme]	3.0	0.050000	0.600000
57	[Kalev, Linda]	2.0	0.000000	0.000000
62	[Esthonian, Finnish]	2.0	0.000000	0.000000
69	[Esthonia, Finland]	2.0	0.000000	0.000000
70	[Linda, Salme]	2.0	0.500000	0.500000
77	[Esthonian, Finnish]	2.0	0.000000	0.100000
95	[Kalev, Linda]	2.0	0.208333	0.733333
98	[Kalev, Linda]	2.0	-0.238095	0.407937
113	[Kalev, Linda]	2.0	0.166667	0.533333
126	[Kalev, Linda]	2.0	0.031250	0.537500
127	[Kalev, Finland]	2.0	0.200000	0.483333
131	[Kalev, Linda]	2.0	0.000000	0.500000
142	[Esthonia, Finland]	2.0	-0.125000	0.375000
158	[Kalev, Finnish]	2.0	0.003125	0.475000
204	[Finland, Kalevide]	2.0	0.250000	0.750000
211	[Finnish, Kalevide]	2.0	-0.333333	0.533333
213	[Kalev, Linda]	2.0	-0.100000	0.100000
277	[Esthonian, Finnish]	2.0	0.800000	0.750000
284	[Finland, Kalevide]	2.0	0.100000	0.450000
321	[Kalev, Old woman]	2.0	0.050000	0.350000
325	[Esthonia, Finland]	2.0	0.337500	0.562500
368	[Esthonian, Finnish]	2.0	0.500000	0.500000
430	[Esthonia, Kalevide]	2.0	0.000000	0.000000
471	[Kalev, Finland, Alevide]	3.0	0.000000	0.000000
...	...	...	...	...
697	[Old woman, Sarvik]	2.0	0.100000	0.200000
699	[Finland, Kalevide]	2.0	0.300000	0.533333
741	[Alevide, Sarvik]	2.0	0.000000	0.125000
742	[Tühi, Kalevide]	2.0	0.100000	1.000000
744	[Tühi, Kalevide]	2.0	0.000000	0.000000
776	[Olev, Sarvik, Kalevide]	3.0	0.387500	0.375000
798	[Finland, Kalevide]	2.0	0.580000	0.810000
826	[Alevide, Kalevide]	2.0	-0.155556	0.288889
843	[Põrgu, Kalevide]	2.0	0.700000	0.900000
844	[Olev, Kalevide]	2.0	0.000000	0.000000
866	[Esthonia, Kalevide]	2.0	0.000000	0.500000
909	[Kalev, Sarvik]	2.0	-0.100000	0.250000
923	[Sarvik, Kalevide]	2.0	0.000000	0.000000
927	[Sarvik, Kalevide]	2.0	0.600000	1.000000
928	[Sarvik, Kalevide]	2.0	-0.198750	0.340000
929	[Sarvik, Kalevide]	2.0	0.100000	0.355556
936	[Sarvik, Kalevide]	2.0	0.000000	0.000000
952	[Taara, Kalevide]	2.0	0.000000	0.100000
964	[Olev, Kalevide]	2.0	0.450000	0.850000
973	[Esthonia, Kalevide]	2.0	0.100000	0.100000
976	[Alevide, Kalevide]	2.0	-0.200000	0.500000
977	[Taara, Kalevide]	2.0	-0.057143	0.360000
979	[Kalev, Alevide]	2.0	0.000000	0.000000
983	[Taara, Kalevide]	2.0	0.000000	0.000000
994	[Olev, Kalevide]	2.0	0.000000	0.000000
997	[Alevide, Kalevide]	2.0	0.000000	0.000000
1000	[Olev, Alevide, Kalevide]	3.0	0.080000	0.303333
1003	[Olev, Kalevide]	2.0	-0.400000	0.133333
1026	[Põrgu, Sarvik]	2.0	0.300000	0.497222
1027	[Taara, Kalevide]	2.0	-0.155556	0.288889

	Nodes	Degree_Centrality	Closeness_Centrality	Betweenness_Centrality	Eigenvector_Centrality	Katz_Centrality	PageRank
14	Kalevide	0.714286	0.777778	0.388278	0.590116	-0.099263	0.209528
9	Kalev	0.642857	0.736842	0.304029	0.317680	-0.189511	0.111678
4	Finnish	0.214286	0.560000	0.142857	0.066242	-0.145640	0.059073
8	Sarvik	0.500000	0.636364	0.113553	0.343092	-0.131465	0.084322
10	Esthonia	0.357143	0.583333	0.053114	0.226763	0.313355	0.055613
3	Alevide	0.357143	0.583333	0.018315	0.257093	-0.301661	0.056390
0	Põrgu	0.214286	0.500000	0.009158	0.066157	0.529898	0.028616
6	Finland	0.285714	0.560000	0.003663	0.333458	0.136168	0.071606
1	Olev	0.214286	0.500000	0.000000	0.270035	-0.284899	0.051370
7	Old woman	0.142857	0.500000	0.000000	0.021599	0.145424	0.020509
11	Taara	0.142857	0.482759	0.000000	0.176304	0.019232	0.035868
2	Salme	0.214286	0.466667	0.000000	0.134895	0.116163	0.049501
12	Tühi	0.142857	0.466667	0.000000	0.081280	0.542074	0.028285
13	Linda	0.214286	0.466667	0.000000	0.254018	-0.082157	0.091775
5	Esthonian	0.071429	0.368421	0.000000	0.043090	-0.102180	0.045865