The Network among Sentential Co-Occurrences

of Noun Phrases in

Kirby's "The Hero of Esthonia"

By the DH in Estonia 2016 Conference Team

In [1]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[1]:

Importing Python Modules

In [1]:
import random
import nltk
import codecs
from textblob import TextBlob
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import imp
from ipywidgets import widgets
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline 
%load_ext autoreload

I. Importing the Text of Kirby's The Hero of Esthonia

In [2]:
filename = 'Kirby1.txt'
titlename = "Kirby's The Hero of Esthonia"

f = codecs.open(filename, "r", encoding="utf-8").readlines()

num_lines = 0
num_words = 0
num_chars = 0
for line in f:
    words = line.split()
    num_lines += 1
    num_words += len(words)
    num_chars += len(line)
print "%s has number of words = %i (and number of characters/symbols = %i)" %(titlename,num_words,num_chars)
# print "%s has number of words = %i" %(titlename,num_words)
blob = TextBlob("\n".join(f))
Kirby's The Hero of Esthonia has number of words = 27692 (and number of characters/symbols = 152768)

II. Extracting the Most Frequent Noun Phrases in Kirby's Kalevipoeg

In [3]:
# nltk.download()
In [4]:
all_sents=blob.sentences
occurdic=Counter()
for sen in all_sents:
    dd=sen.dict
    for np in dd['noun_phrases']:
        occurdic[np]+=1

df = pd.DataFrame(columns=["%s Noun Phrases" %titlename, "Frequencies"])
u=1
for l,v in occurdic.items(): 
    df.loc[u]=[l,v]
    u+=1

print "The total number of noun phrases in %s is %i." %(titlename,len(df))#len(npA))
df.sort(["Frequencies"], ascending=[0])

cut = 10
df = df[df['Frequencies']>cut].sort(["Frequencies"], ascending=[0])
print "The total number of noun phrases in %s with frequencies > %i is %i." %(titlename,cut,len(df))#len(npA))
df.sort(["Frequencies"], ascending=[0])
The total number of noun phrases in Kirby's The Hero of Esthonia is 1191.
The total number of noun phrases in Kirby's The Hero of Esthonia with frequencies > 10 is 17.
Out[4]:
Kirby's The Hero of Esthonia Noun Phrases Frequencies
570 kalevide 162.0
539 footnote 95.0
629 kalev 50.0
592 sarvik 44.0
567 linda 39.0
439 finland 21.0
865 alevide 19.0
1035 taara 19.0
851 esthonia 18.0
821 salme 17.0
676 runo 14.0
162 esthonian 14.0
1026 põrgu 13.0
208 olev 11.0
540 old woman 11.0
270 tühi 11.0
957 finnish 11.0
In [5]:
excluded = ['runo','footnote']
In [6]:
%autoreload 2

selectedTerms={}
# excluded = ['who','will','exactly','enough','shall','suppose','well']
for k in df["Kirby's The Hero of Esthonia Noun Phrases"].tolist(): #df["Plato's Phaedrus Noun Phrases"].tolist():
    if k not in excluded:
        selectedTerms[k] = k.capitalize()

# tool= imp.load_source('tools', utilsdir+'/tools.py')
import tools as tool
create_pandas_dataframe_from_text=tool.create_pandas_dataframe_from_text  
dfst,sec_prot,coccurlist,occurlist,dflines=create_pandas_dataframe_from_text(blob,selectedTerms,selectedTerms,titlename)
# print len(sec_prot.nodes()), sec_prot.nodes()
# dfst.sort_values(by='Frequencies').sort(["Frequencies"], ascending=[0])

prot_pol_sub=dflines[['protagonists','#_of_protagonists','polarity','subjectivity']].reset_index()
prot_pol_sub['sentence_id']=prot_pol_sub.index
prot_pol_sub=prot_pol_sub[['sentence_id','protagonists','#_of_protagonists','polarity','subjectivity']]

cuts = 1
prot_pol_sub = prot_pol_sub[prot_pol_sub['#_of_protagonists']>cuts]
# lp = prot_pol_sub['protagonists'].tolist()
# lpn = []
# control_dic={}
# for i in lp:
#     for j in i:
#         lpn.append(j)
#         if j not in control_dic:
#             print j,'0, 1, 2'
#             control_dic[j]=int(raw_input())
# # print lpn
# # print control_dic
# # len(set(lpn))
# for nd in sec_prot.nodes():
#     if nd not in control_dic:
#         continue
#     dici=sec_prot.node[nd]
#     sec_prot.add_node(nd,attr_dic=dici,type=control_dic[nd])
print "The total number of sentences in %s with at least %i selected noun phrases in each one of them is %i." %(titlename,cuts+1,len(prot_pol_sub))
prot_pol_sub.rename(columns={'protagonists':'list_of_selected_noun_phrases','#_of_protagonists':'#_of_selected_noun_phrases'},inplace=True)
prot_pol_sub.sort(["#_of_selected_noun_phrases"], ascending=[0]) #.drop('sentence_id', 1)
ddff = prot_pol_sub.drop('sentence_id', 1)
ddff.index.name = 'sentence_id'
ddff
The total number of sentences in Kirby's The Hero of Esthonia with at least 2 selected noun phrases in each one of them is 67.
Out[6]:
list_of_selected_noun_phrases #_of_selected_noun_phrases polarity subjectivity
sentence_id
2 [Kalev, Esthonia] 2.0 0.325000 0.437500
11 [Linda, Salme] 2.0 0.100000 0.500000
33 [Linda, Salme] 2.0 0.475000 0.900000
50 [Kalev, Linda] 2.0 0.106250 0.500000
51 [Linda, Salme, Esthonia] 3.0 0.237500 0.377778
54 [Linda, Salme] 2.0 0.650000 0.350000
55 [Kalev, Linda, Salme] 3.0 0.050000 0.600000
57 [Kalev, Linda] 2.0 0.000000 0.000000
62 [Esthonian, Finnish] 2.0 0.000000 0.000000
69 [Esthonia, Finland] 2.0 0.000000 0.000000
70 [Linda, Salme] 2.0 0.500000 0.500000
77 [Esthonian, Finnish] 2.0 0.000000 0.100000
95 [Kalev, Linda] 2.0 0.208333 0.733333
98 [Kalev, Linda] 2.0 -0.238095 0.407937
113 [Kalev, Linda] 2.0 0.166667 0.533333
126 [Kalev, Linda] 2.0 0.031250 0.537500
127 [Kalev, Finland] 2.0 0.200000 0.483333
131 [Kalev, Linda] 2.0 0.000000 0.500000
142 [Esthonia, Finland] 2.0 -0.125000 0.375000
158 [Kalev, Finnish] 2.0 0.003125 0.475000
204 [Finland, Kalevide] 2.0 0.250000 0.750000
211 [Finnish, Kalevide] 2.0 -0.333333 0.533333
213 [Kalev, Linda] 2.0 -0.100000 0.100000
277 [Esthonian, Finnish] 2.0 0.800000 0.750000
284 [Finland, Kalevide] 2.0 0.100000 0.450000
321 [Kalev, Old woman] 2.0 0.050000 0.350000
325 [Esthonia, Finland] 2.0 0.337500 0.562500
368 [Esthonian, Finnish] 2.0 0.500000 0.500000
430 [Esthonia, Kalevide] 2.0 0.000000 0.000000
471 [Kalev, Finland, Alevide] 3.0 0.000000 0.000000
... ... ... ... ...
697 [Old woman, Sarvik] 2.0 0.100000 0.200000
699 [Finland, Kalevide] 2.0 0.300000 0.533333
741 [Alevide, Sarvik] 2.0 0.000000 0.125000
742 [Tühi, Kalevide] 2.0 0.100000 1.000000
744 [Tühi, Kalevide] 2.0 0.000000 0.000000
776 [Olev, Sarvik, Kalevide] 3.0 0.387500 0.375000
798 [Finland, Kalevide] 2.0 0.580000 0.810000
826 [Alevide, Kalevide] 2.0 -0.155556 0.288889
843 [Põrgu, Kalevide] 2.0 0.700000 0.900000
844 [Olev, Kalevide] 2.0 0.000000 0.000000
866 [Esthonia, Kalevide] 2.0 0.000000 0.500000
909 [Kalev, Sarvik] 2.0 -0.100000 0.250000
923 [Sarvik, Kalevide] 2.0 0.000000 0.000000
927 [Sarvik, Kalevide] 2.0 0.600000 1.000000
928 [Sarvik, Kalevide] 2.0 -0.198750 0.340000
929 [Sarvik, Kalevide] 2.0 0.100000 0.355556
936 [Sarvik, Kalevide] 2.0 0.000000 0.000000
952 [Taara, Kalevide] 2.0 0.000000 0.100000
964 [Olev, Kalevide] 2.0 0.450000 0.850000
973 [Esthonia, Kalevide] 2.0 0.100000 0.100000
976 [Alevide, Kalevide] 2.0 -0.200000 0.500000
977 [Taara, Kalevide] 2.0 -0.057143 0.360000
979 [Kalev, Alevide] 2.0 0.000000 0.000000
983 [Taara, Kalevide] 2.0 0.000000 0.000000
994 [Olev, Kalevide] 2.0 0.000000 0.000000
997 [Alevide, Kalevide] 2.0 0.000000 0.000000
1000 [Olev, Alevide, Kalevide] 3.0 0.080000 0.303333
1003 [Olev, Kalevide] 2.0 -0.400000 0.133333
1026 [Põrgu, Sarvik] 2.0 0.300000 0.497222
1027 [Taara, Kalevide] 2.0 -0.155556 0.288889

67 rows × 4 columns

In [7]:
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset

ndfl=dflines[dflines['#_of_protagonists']>0  ]

fig, ax = plt.subplots(figsize=[12, 10])
axes2 = zoomed_inset_axes(ax, 6, loc=5)  # zoom = 6

dflines['#_of_protagonists'].plot.hist(ax=ax)

ax.set_xlabel('#_of_Characters')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of # of noun phrases')

x1, x2, y1, y2 = 2.9, 3., 0, 25
axes2.set_xlim(x1, x2)
axes2.set_ylim(y1, y2)
ndfl['#_of_protagonists'].plot.hist(ax=axes2)
axes2.set_ylabel('Frequency')

mark_inset(ax, axes2, loc1=2, loc2=4, fc="none", ec="0.5")
axes3 = zoomed_inset_axes(ax, 6, loc=10)

x1, x2, y1, y2 = 2, 2.05, 0, 50
axes3.set_xlim(x1, x2)
axes3.set_ylim(y1, y2)
ndfl['#_of_protagonists'].plot.hist(ax=axes3)
axes3.set_ylabel('Frequency')

mark_inset(ax, axes3, loc1=2, loc2=4, fc="none", ec="0.5")

plt.show()
In [8]:
%autoreload 2

draw_network_node_color=tool.draw_network_node_color
sstt="%s Two-Mode Network of Sentences and Selected Noun Phrases" %titlename
pos=nx.spring_layout(sec_prot)
# pos=DefaultDict
nds=[nd for nd in sec_prot.nodes() if isinstance(nd,int)]
prot=[nd for nd in sec_prot.nodes() if nd not in nds]
# ncont=dict(control_dic)
# print ncont
# ncont[u'Midas']=0
# prot.append('Midas')
# protag=[nd for nd in prot if int(ncont[nd])==0 ]
# pos={nd:{0:0,1:0} for nd in sec_prot}
# nprotag=[nd for nd in prot if  int(ncont[nd])==1 or int(ncont[nd])==2]
# print protag
# print nprotag
for en,nd in enumerate(nds):
    if en<len(nds)/2.:
        pos[nd][0]=-1
        pos[nd][1]=en*2./len(nds)
    else:
        pos[nd][0]=1
        pos[nd][1]=(en-len(nds)/2.)*2./len(nds)
for en ,nd in enumerate(prot):
    pos[nd][0]=0
    pos[nd][1]=en*1./len(prot)
# for en ,nd in enumerate(protag):
#     pos[nd][0]=.5
#     pos[nd][1]=en*1./len(protag)
# for en ,nd in enumerate(nprotag):
#     pos[nd][0]=-.5
#     pos[nd][1]=en*1./len(nprotag)
# print pos    
possit=draw_network_node_color(sec_prot,sstt,pos=pos,with_edgewidth=False,withLabels=True,labfs=12,valpha=0.2,
                               ealpha=0.4,labelfont=15,with_node_weight=False,node_size_fixer=300.,node_col='polarity')
In [9]:
possit=draw_network_node_color(sec_prot,sstt,pos=pos,with_edgewidth=False,withLabels=True,labfs=12,valpha=0.2,
                               ealpha=0.4,labelfont=15,with_node_weight=False,node_size_fixer=300.,
                               node_col='subjectivity',colormat='Greens')

III. Constructing the Network of Sententially Co-Occurring Noun Phrases in Kirby's Kalevipoeg

In [10]:
%autoreload 2

plist = prot_pol_sub['list_of_selected_noun_phrases'].tolist()
pplist=prot_pol_sub['polarity'].tolist()
nplist=prot_pol_sub['#_of_selected_noun_phrases'].tolist()
splist=prot_pol_sub['subjectivity'].tolist()

G = tool.make_graph_from_lists(plist,pplist,nplist,splist)
# print G.nodes(data=True)
# for nd in G.nodes():
#     G.add_node(nd,type=control_dic[nd])
posg=nx.spring_layout(G,scale=50)#,k=0.55)#,iterations=20)
# nodescolor={}
# npos={}
# d=40
# for nd in G.nodes():
#     opos=posg[nd]

#     if nd in protag:
#         nodescolor[nd]='r'
#         npos[nd]=[opos[0],opos[1]-d]
#     else:
#         nodescolor[nd]='g'
#         npos[nd]=[opos[0],opos[1]+d]
# npos={v:k for v,k in pos.items() if v in G}        

# sstt="%s Network of Selected Noun Phrases \n(Assortativity coefficient of Persons - Entities = %.4f)" %(titlename,nx.attribute_assortativity_coefficient(G,'type'))
sstt="%s Network of Selected Noun Phrases \n(Sentences colored in polarity)" %titlename
possit=tool.draw_network(G,sstt,pos=posg,with_edgewidth=True,withLabels=True,labfs=15,valpha=0.2,ealpha=0.7,labelfont=15,
                   with_edgecolor=True,edgecolor='polarity',colormat='Blues') #npos ,node_col=nodescolor False
In [11]:
sstt="%s Network of Selected Noun Phrases \n(Sentences colored in subjectivity)" %titlename
possit=tool.draw_network(G,sstt,pos=posg,with_edgewidth=True,withLabels=True,labfs=15,valpha=0.2,ealpha=0.7,labelfont=15,
                   with_edgecolor=True,edgecolor='subjectivity',colormat='Greys')