from IPython.display import HTML
# Notebook cosmetics: inject a jQuery-based button that toggles visibility of
# all input (code) cells.  code_show starts True and code_toggle() runs on
# document-ready, so the page loads with the raw code hidden.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
# Stdlib / third-party imports used throughout this notebook.
import random
import nltk
import codecs
from textblob import TextBlob
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import imp  # NOTE(review): `imp` is deprecated (use importlib); only referenced in a comment below
from ipywidgets import widgets
import warnings
warnings.filterwarnings('ignore')  # silence library warnings in rendered output
# IPython magics: inline matplotlib figures + auto-reload of edited modules.
%matplotlib inline
%load_ext autoreload
# --- Load the text and report basic corpus statistics ----------------------
filename = 'Kalevala.txt'
titlename = "Kalevala"

# Read the whole file as a list of unicode lines; the context manager
# guarantees the handle is closed (the original left it open).
with codecs.open(filename, "r", encoding="utf-8") as source_file:
    f = source_file.readlines()

# Count lines, whitespace-separated words, and characters (incl. newlines).
num_lines = 0
num_words = 0
num_chars = 0
for line in f:
    words = line.split()
    num_lines += 1
    num_words += len(words)
    num_chars += len(line)

# Parenthesized single-argument print behaves identically on Python 2 and 3,
# unlike the original Python-2-only print statement.
print("%s has number of words = %i (and number of characters/symbols = %i)" % (titlename, num_words, num_chars))
# print("%s has number of words = %i" % (titlename, num_words))

# NOTE(review): readlines() keeps each trailing '\n', so joining with '\n'
# doubles the line breaks; kept as-is to preserve downstream sentence splits.
blob = TextBlob("\n".join(f))
# nltk.download()  # one-off: fetch the NLTK corpora TextBlob relies on
# --- Count noun-phrase frequencies over all sentences -----------------------
all_sents = blob.sentences
occurdic = Counter()
for sen in all_sents:
    dd = sen.dict
    for np in dd['noun_phrases']:
        occurdic[np] += 1

# Build the frequency table in one shot instead of appending row-by-row via
# df.loc[u] (which is quadratic in pandas); keep the original 1-based index.
df = pd.DataFrame(list(occurdic.items()),
                  columns=["%s Noun Phrases" % titlename, "Frequencies"])
df.index = range(1, len(df) + 1)

print("The total number of noun phrases in %s is %i." % (titlename, len(df)))
# df.sort() was removed in pandas >= 0.20; sort_values is the replacement.
df.sort_values(by="Frequencies", ascending=False)

# Keep only the frequently occurring noun phrases.
cut = 90
df = df[df['Frequencies'] > cut].sort_values(by="Frequencies", ascending=False)
print("The total number of noun phrases in %s with frequencies > %i is %i." % (titlename, cut, len(df)))
df.sort_values(by="Frequencies", ascending=False)
excluded = ['thou','spake','never','thereupon','quick','till','who']
%autoreload 2
selectedTerms={}
# excluded = ['who','will','exactly','enough','shall','suppose','well']
for k in df["Kalevala Noun Phrases"].tolist(): #df["Plato's Phaedrus Noun Phrases"].tolist():
if k not in excluded:
selectedTerms[k] = k.capitalize()
# tool= imp.load_source('tools', utilsdir+'/tools.py')
import tools as tool
create_pandas_dataframe_from_text=tool.create_pandas_dataframe_from_text
dfst,sec_prot,coccurlist,occurlist,dflines=create_pandas_dataframe_from_text(blob,selectedTerms,selectedTerms,titlename)
# print len(sec_prot.nodes()), sec_prot.nodes()
# dfst.sort_values(by='Frequencies').sort(["Frequencies"], ascending=[0])
# --- Table of sentences mentioning more than `cuts` selected noun phrases ---
prot_pol_sub = dflines[['protagonists','#_of_protagonists','polarity','subjectivity']].reset_index()
prot_pol_sub['sentence_id'] = prot_pol_sub.index
prot_pol_sub = prot_pol_sub[['sentence_id','protagonists','#_of_protagonists','polarity','subjectivity']]
cuts = 1
prot_pol_sub = prot_pol_sub[prot_pol_sub['#_of_protagonists'] > cuts]
# (Removed a dead commented-out block that interactively labelled
# protagonists via raw_input(); recover it from version control if needed.)
print("The total number of sentences in %s with at least %i selected noun phrases in each one of them is %i." % (titlename, cuts + 1, len(prot_pol_sub)))
prot_pol_sub.rename(columns={'protagonists': 'list_of_selected_noun_phrases',
                             '#_of_protagonists': '#_of_selected_noun_phrases'}, inplace=True)
# df.sort() was removed in pandas >= 0.20; sort_values is the replacement.
prot_pol_sub.sort_values(by="#_of_selected_noun_phrases", ascending=False)
# Name the axis keyword explicitly (positional axis is deprecated in pandas).
ddff = prot_pol_sub.drop('sentence_id', axis=1)
ddff.index.name = 'sentence_id'
ddff
# from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
# from mpl_toolkits.axes_grid1.inset_locator import mark_inset
# ndfl=dflines[dflines['#_of_protagonists']>0 ]
# fig, ax = plt.subplots(figsize=[12, 10])
# axes2 = zoomed_inset_axes(ax, 6, loc=5) # zoom = 6
# dflines['#_of_protagonists'].plot.hist(ax=ax)
# ax.set_xlabel('#_of_Characters')
# ax.set_ylabel('Frequency')
# ax.set_title('Histogram of # of noun phrases')
# x1, x2, y1, y2 = 2.9, 3., 0, 25
# axes2.set_xlim(x1, x2)
# axes2.set_ylim(y1, y2)
# ndfl['#_of_protagonists'].plot.hist(ax=axes2)
# axes2.set_ylabel('Frequency')
# mark_inset(ax, axes2, loc1=2, loc2=4, fc="none", ec="0.5")
# axes3 = zoomed_inset_axes(ax, 6, loc=10)
# x1, x2, y1, y2 = 2, 2.05, 0, 50
# axes3.set_xlim(x1, x2)
# axes3.set_ylim(y1, y2)
# ndfl['#_of_protagonists'].plot.hist(ax=axes3)
# axes3.set_ylabel('Frequency')
# mark_inset(ax, axes3, loc1=2, loc2=4, fc="none", ec="0.5")
# plt.show()
%autoreload 2
draw_network_node_color=tool.draw_network_node_color
sstt="%s Two-Mode Network of Sentences and Selected Noun Phrases" %titlename
pos=nx.spring_layout(sec_prot)
# pos=DefaultDict
nds=[nd for nd in sec_prot.nodes() if isinstance(nd,int)]
prot=[nd for nd in sec_prot.nodes() if nd not in nds]
# ncont=dict(control_dic)
# print ncont
# ncont[u'Midas']=0
# prot.append('Midas')
# protag=[nd for nd in prot if int(ncont[nd])==0 ]
# pos={nd:{0:0,1:0} for nd in sec_prot}
# nprotag=[nd for nd in prot if int(ncont[nd])==1 or int(ncont[nd])==2]
# print protag
# print nprotag
for en,nd in enumerate(nds):
if en<len(nds)/2.:
pos[nd][0]=-1
pos[nd][1]=en*2./len(nds)
else:
pos[nd][0]=1
pos[nd][1]=(en-len(nds)/2.)*2./len(nds)
for en ,nd in enumerate(prot):
pos[nd][0]=0
pos[nd][1]=en*1./len(prot)
# for en ,nd in enumerate(protag):
# pos[nd][0]=.5
# pos[nd][1]=en*1./len(protag)
# for en ,nd in enumerate(nprotag):
# pos[nd][0]=-.5
# pos[nd][1]=en*1./len(nprotag)
# print pos
possit=draw_network_node_color(sec_prot,sstt,pos=pos,with_edgewidth=False,withLabels=True,labfs=12,valpha=0.2,
ealpha=0.4,labelfont=15,with_node_weight=False,node_size_fixer=300.,node_col='polarity')
possit=draw_network_node_color(sec_prot,sstt,pos=pos,with_edgewidth=False,withLabels=True,labfs=12,valpha=0.2,
ealpha=0.4,labelfont=15,with_node_weight=False,node_size_fixer=300.,
node_col='subjectivity',colormat='Greens')
%autoreload 2
plist = prot_pol_sub['list_of_selected_noun_phrases'].tolist()
pplist=prot_pol_sub['polarity'].tolist()
nplist=prot_pol_sub['#_of_selected_noun_phrases'].tolist()
splist=prot_pol_sub['subjectivity'].tolist()
# G = tool.make_graph_from_lists(plist,pplist,nplist,splist)
# an thes to palio to apo pano
G = tool.make_graph_from_lists_log(plist,pplist,nplist,splist)
# print G.nodes(data=True)
# for nd in G.nodes():
# G.add_node(nd,type=control_dic[nd])
posg=nx.spring_layout(G,scale=50)#,k=0.55)#,iterations=20)
# nodescolor={}
# npos={}
# d=40
# for nd in G.nodes():
# opos=posg[nd]
# if nd in protag:
# nodescolor[nd]='r'
# npos[nd]=[opos[0],opos[1]-d]
# else:
# nodescolor[nd]='g'
# npos[nd]=[opos[0],opos[1]+d]
# npos={v:k for v,k in pos.items() if v in G}
# sstt="%s Network of Selected Noun Phrases \n(Assortativity coefficient of Persons - Entities = %.4f)" %(titlename,nx.attribute_assortativity_coefficient(G,'type'))
sstt="%s Network of Selected Noun Phrases \n(Sentences colored in polarity)" %titlename
possit=tool.draw_network(G,sstt,pos=posg,with_edgewidth=True,withLabels=True,labfs=15,valpha=0.2,ealpha=0.7,labelfont=15,
with_edgecolor=True,edgecolor='polarity',colormat='Blues') #npos ,node_col=nodescolor False
# possit=tool.draw_network(G,sstt,pos=posg,with_edgewidth=False,withLabels=True,labfs=15,valpha=0.2,ealpha=0.7,labelfont=15,
# with_edgecolor=True,edgecolor='polarity',colormat='Blues') #npos ,node_col=nodescolor False
sstt="%s Network of Selected Noun Phrases \n(Sentences colored in subjectivity)" %titlename
possit=tool.draw_network(G,sstt,pos=posg,with_edgewidth=True,withLabels=True,labfs=15,valpha=0.2,ealpha=0.7,labelfont=15,
with_edgecolor=True,edgecolor='subjectivity',colormat='Greys')
centrali=tool.draw_centralities_subplots(G,pos=posg,withLabels=False,labfs=5,figsi=(15,22),ealpha=1,vals=True)
# --- Tabulate the node centralities -----------------------------------------
# centrali maps measure-name -> {node: value}; one column per measure.
dfc = pd.DataFrame()
col_idx = 0
for measure, node_values in centrali.items():
    # list(...) works on Python 2 and 3 (on 3, .values() is a lazy view).
    dfc.insert(col_idx, measure, list(node_values.values()))
    col_idx += 1
# Prepend the node labels.  list(centrali.keys())[0] replaces the
# Python-2-only centrali.keys()[0] subscripting.
# NOTE(review): assumes every measure dict lists nodes in the same order.
dfc.insert(0, 'Nodes', list(centrali[list(centrali.keys())[0]].keys()))
dfc.columns = ['Nodes','Closeness_Centrality', 'Katz_Centrality','Betweenness_Centrality', 'PageRank', 'Eigenvector_Centrality','Degree_Centrality']
dfc = dfc[['Nodes','Degree_Centrality','Closeness_Centrality','Betweenness_Centrality','Eigenvector_Centrality','Katz_Centrality','PageRank']]
# df.sort() was removed in pandas >= 0.20; sort_values is the replacement.
dfc.sort_values(by=['Betweenness_Centrality','Closeness_Centrality'], ascending=[False, False])
%autoreload 2
part,nodper=tool.print_communities(G,sstt)
d=0.8
dd=0.8
c=1.2
cc=1.4
alpha=0.2
ealpha=0.2
vcc={}
sstta="The %s Communities of %s Network of Selected Noun Phrases" %(max(part.values())+1,titlename)#sstt)
tool.draw_comms(G,G.nodes(),[],[],[] ,part,part,d,dd,c,cc,alpha,ealpha,nodper,sstta,titlefont=20,labelfont=17,valpha=0.5)