IMPORTANT: To use this notebook, you'll need to run
ipython notebook
in the same directory where the notebook and scripts were put.
This work is licensed under a Creative Commons Attribution 4.0 International License.
import random
import nltk
import codecs
from textblob import TextBlob
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import os
import imp
# Directory holding the helper modules; 'tools.py' and 'trajectories_t.py' are
# loaded from it further down with imp.load_source.
# NOTE(review): this is an absolute path on the author's machine -- it has to
# be adjusted before the notebook can run anywhere else.
# utilsdir='/home/sergios-len/Dropbox/Python Projects (1)/utils/tools.py'
utilsdir='/home/mab/Dropbox/Python Projects/utils/'#tools.py'
# IPython magics: draw matplotlib figures inline and load the autoreload
# extension (activated by the '%autoreload 2' lines later on).
%matplotlib inline
%load_ext autoreload
# Text to analyse and the title used in printed messages and plot titles.
filename = 'Texts/AStudyInScarlet.txt'
titlename = "Arthur Conan Doyle's A Study in Scarlet"

# Read the whole text as unicode; the context manager closes the file handle.
with codecs.open(filename, "r", encoding="utf-8") as text_file:
    f = text_file.read()

# `f` is a single string, so looping over it directly walks characters rather
# than lines (which made the "word" count the number of non-whitespace
# characters). Derive each figure from the appropriate split of the full text.
num_lines = len(f.splitlines())
num_words = len(f.split())
num_chars = len(f)
print("%s has number of words = %i and number of characters = %i" % (titlename, num_words, num_chars))

# TextBlob wrapper around the full text, handed to the analysis helpers below.
blob = TextBlob(f)
# Alias -> canonical character name: every spelling listed here is mapped to
# the single name under which the character is reported.
dici = {
    'Sherlock Holmes': 'Sherlock Holmes',
    'Mr. Sherlock Holmes': 'Sherlock Holmes',
    'Sherlock': 'Sherlock Holmes',
    'Holmes': 'Sherlock Holmes',
    'Dr. Watson': 'Dr. Watson',
    'Watson': 'Dr. Watson',
    'Lestrade': 'Lestrade',
    'Lucy Ferrier': 'Lucy Ferrier',
    'Lucy': 'Lucy Ferrier',
    'John Ferrier': 'John Ferrier',
    'John Rance': 'John Rance',
    'Rance': 'John Rance',
    'Arthur Charpentier': 'Arthur Charpentier',
    'Lieutenant Charpentier': 'Arthur Charpentier',
    'Mrs. Charpentier': 'Mrs. Charpentier',
    'Madame Charpentier': 'Mrs. Charpentier',
    'Enoch Drebber': 'Enoch Drebber',
    'Enoch': 'Enoch Drebber',
    'Drebber': 'Enoch Drebber',
    'Jefferson Hope': 'Jefferson Hope',
    'Jefferson': 'Jefferson Hope',
    'Hope': 'Jefferson Hope',
    'Brigham Young': 'Brigham Young',
    'Brigham': 'Brigham Young',
    'Young': 'Brigham Young',
    'Joseph Stangerson': 'Joseph Stangerson',
    'Joseph': 'Joseph Stangerson',
    'Stangerson': 'Joseph Stangerson',
    'Tobias Gregson': 'Tobias Gregson',
    'Gregson': 'Tobias Gregson',
    'Stamford': 'Stamford',
}

# The same mapping keyed by the lower-cased alias.
ndici = {alias.lower(): canonical for alias, canonical in dici.items()}

# (first token, second token) of every lower-cased alias that has more than
# one token; tokens beyond the second are dropped ('mr. sherlock holmes'
# contributes ('mr.', 'sherlock')).
dnici = [
    tuple(tokens[:2])
    for tokens in (alias.split() for alias in ndici.keys())
    if len(tokens) > 1
]

# Terms to search for in the text: all lower-cased aliases.
selectedTerms = ndici.keys()
%autoreload 2
tool= imp.load_source('tools', utilsdir+'tools.py')
create_pandas_dataframe_from_text=tool.create_pandas_dataframe_from_text
create_coo_graph=tool.create_coo_graph
dfst,sec_prot,coccurlist,occurlist,dflines=create_pandas_dataframe_from_text(blob,selectedTerms,ndici,titlename)
co_graph=create_coo_graph(coccurlist)
dfst.rename(columns={"Arthur Conan Doyle's A Study in Scarlet selected terms":"Arthur Conan Doyle's A Study in Scarlet Characters"},inplace=True)
dfst.sort_values(by='Frequencies').sort(["Frequencies"], ascending=[0])
# Per-sentence table: character list, number of characters, polarity and
# subjectivity, with the row position stored as an explicit 'sentence_id'.
prot_pol_sub = dflines[['protagonists', '#_of_protagonists', 'polarity', 'subjectivity']].reset_index()
prot_pol_sub['sentence_id'] = prot_pol_sub.index
prot_pol_sub = prot_pol_sub[['sentence_id', 'protagonists', '#_of_protagonists', 'polarity', 'subjectivity']]

# Keep only sentences that mention more than `cuts` characters. The explicit
# copy makes the in-place rename below act on an independent frame instead of
# a slice of the unfiltered table.
cuts = 1
prot_pol_sub = prot_pol_sub[prot_pol_sub['#_of_protagonists'] > cuts].copy()

# Flat list of every character mention in the retained sentences.
lp = prot_pol_sub['protagonists'].tolist()
lpn = [name for names in lp for name in names]
# len(set(lpn))
print("The total number of sentences in %s with at least %i characters in each one of them is %i." % (titlename, cuts + 1, len(prot_pol_sub)))

prot_pol_sub.rename(columns={'protagonists':'Lists_of_Characters','#_of_protagonists':'#_of_Characters','polarity':'Polarity','subjectivity':'Subjectivity'},inplace=True)
# DataFrame.sort was removed from pandas; sort_values is the replacement.
prot_pol_sub.sort_values(by="#_of_Characters", ascending=False)

# Display copy without the helper column; `axis` is passed by keyword because
# newer pandas no longer accepts it positionally.
ddff = prot_pol_sub.drop('sentence_id', axis=1)
ddff.index.name = 'Sentence_ID'
ddff
prot_pol_sub[['#_of_Characters', 'Polarity', 'Subjectivity']].describe()
from mpl_toolkits.axes_grid1.inset_locator import mark_inset, zoomed_inset_axes

# Histogram of the number of characters per sentence, with two magnified
# insets drawn from the sentences that mention at least one character.
all_counts = dflines['#_of_protagonists']
ndfl = dflines[all_counts > 0]
nonzero_counts = ndfl['#_of_protagonists']

fig, ax = plt.subplots(figsize=[12, 10])

# First inset axes: zoom factor 16, placed at location code 7.
axes2 = zoomed_inset_axes(ax, 16, loc=7)

# Main histogram over every sentence.
all_counts.plot.hist(ax=ax)
ax.set_xlabel('#_of_Characters')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of # of characters')

# Inset 1 shows the window x in [2.95, 3.0], y in [0, 30].
x1, x2, y1, y2 = 2.95, 3., 0, 30
axes2.set_xlim(x1, x2)
axes2.set_ylim(y1, y2)
nonzero_counts.plot.hist(ax=axes2)
axes2.set_ylabel('Frequency')
mark_inset(ax, axes2, loc1=2, loc2=4, fc="none", ec="0.5")

# Inset 2: zoom factor 10 at location code 10, window x in [2, 2.1], y in [0, 60].
axes3 = zoomed_inset_axes(ax, 10, loc=10)
x1, x2, y1, y2 = 2, 2.1, 0, 60
axes3.set_xlim(x1, x2)
axes3.set_ylim(y1, y2)
nonzero_counts.plot.hist(ax=axes3)
axes3.set_ylabel('Frequency')
mark_inset(ax, axes3, loc1=2, loc2=4, fc="none", ec="0.5")

plt.show()
%autoreload 2
from tools import draw_network_node_color
sstt="%s Two-Mode Network of Sentences and Characters" %titlename
pos=nx.spring_layout(sec_prot)
nds=[nd for nd in sec_prot.nodes() if isinstance(nd,int)]
prot=[nd for nd in sec_prot.nodes() if nd not in nds]
for en,nd in enumerate(nds):
if en<len(nds)/2.:
pos[nd][0]=-1
pos[nd][1]=en*2./len(nds)
else:
pos[nd][0]=1
pos[nd][1]=(en-len(nds)/2.)*2./len(nds)
for en ,nd in enumerate(prot):
pos[nd][0]=0
pos[nd][1]=en*1./len(prot)
possit=draw_network_node_color(sec_prot,sstt,pos=pos,with_edgewidth=False,withLabels=True,labfs=12,valpha=0.2,
ealpha=0.4,labelfont=15,with_node_weight=False,node_size_fixer=300.,node_col='polarity')
possit=draw_network_node_color(sec_prot,sstt,pos=pos,with_edgewidth=False,withLabels=True,labfs=12,valpha=0.2,
ealpha=0.4,labelfont=15,with_node_weight=False,node_size_fixer=300.,
node_col='subjectivity',colormat='Greens')
%autoreload 2
from tools import draw_network, make_graph_from_lists
plist = prot_pol_sub['Lists_of_Characters'].tolist()
pplist=prot_pol_sub['Polarity'].tolist()
nplist=prot_pol_sub['#_of_Characters'].tolist()
splist=prot_pol_sub['Subjectivity'].tolist()
G = make_graph_from_lists(plist,pplist,nplist,splist)
posg=nx.spring_layout(G,scale=50,k=0.55,iterations=20)
# posg=nx.spring_layout(G,scale=50)#,k=0.55)#,iterations=20)
sstt="%s Network of Selected Characters \n(Sentences colored in polarity)" %titlename
possit=draw_network(G,sstt,pos=posg,with_edgewidth=True,withLabels=True,labfs=15,valpha=0.2,ealpha=0.7,labelfont=15,
with_edgecolor=True,edgecolor='polarity',colormat='Blues')
sstt="%s Network of Selected Characters \n(Sentences Colored in Subjectivity)" %titlename
possit=draw_network(G,sstt,pos=posg,with_edgewidth=True,withLabels=True,labfs=15,valpha=0.2,ealpha=0.7,labelfont=15,
with_edgecolor=True,edgecolor='subjectivity',colormat='Greys')
from tools import draw_centralities_subplots

centrali = draw_centralities_subplots(G, pos=posg, withLabels=False, labfs=5, figsi=(15, 22), ealpha=1, vals=True)

# Tabulate the centralities with one row per node and one column per measure.
# `centrali` is used as {measure name: {node: score}}. Scores are looked up
# per node so every row is aligned on the same node, independent of the
# iteration order of the individual score dictionaries.
measure_names = list(centrali)
nodes = list(centrali[measure_names[0]]) if measure_names else []

dfc = pd.DataFrame()
dfc.insert(0, 'Nodes', nodes)
for u, measure in enumerate(measure_names, 1):
    scores = centrali[measure]
    dfc.insert(u, measure, [scores[node] for node in nodes])
dfc
%autoreload 2
from tools import draw_comms, modul_arity, print_communities
part,nodper=print_communities(G,sstt)
ndfl=dflines[dflines['#_of_protagonists']>0 ]
# ndfl['#_of_protagonists'].plot.hist()
d=0.8
dd=0.8
c=1.2
cc=1.4
alpha=0.2
ealpha=0.2
vcc={}
sstta="The %s Communities of %s Network of Characters" %(max(part.values())+1,titlename)#sstt)
draw_comms(G,G.nodes(),[],[],[] ,part,part,d,dd,c,cc,alpha,ealpha,nodper,sstta,titlefont=20,labelfont=17,valpha=0.5)
# %autoreload 2
# Package installs left commented out.
# NOTE(review): presumably requirements of the trajectory / igraph code
# below -- confirm before relying on this list.
# !pip install --user pygexf
# !pip install --user pyinterval
# !pip install --user pyinter
# !pip install --user python-igraph
# import trajectories as trj
# Load 'trajectories_t.py' from the utilities directory and bind it to `trj`
# (the module itself is registered under the name 'trajectories').
trj= imp.load_source('trajectories', utilsdir+'trajectories_t.py')
# import create_gexf_year as cgy
def search_in_list(x, name='Sherlock Holmes'):
    """Tell whether a character is listed in a sentence row.

    x    -- row-like object; ``x['protagonists']`` must support ``in``.
    name -- character to look for. Defaults to 'Sherlock Holmes', so the
            function can still be passed to ``DataFrame.apply(..., axis=1)``
            without extra arguments.

    Returns True when ``name`` is contained in ``x['protagonists']``.
    """
    return name in x['protagonists']
# Sentences with more than one character, narrowed down to those for which
# search_in_list is true.
ndfl = dflines[dflines['#_of_protagonists'] > 1]
dialogs = ndfl[ndfl.apply(search_in_list, axis=1)]

# Per-sentence inputs for the trajectory graph.
protagonists = dialogs['protagonists'].tolist()
polarities = dialogs['polarity'].tolist()
subj = dialogs['subjectivity'].tolist()

# Sentence k (1-based) is given the interval [k, k + 1].
n_dialogs = len(protagonists)
start = range(1, n_dialogs + 1)
end = range(2, n_dialogs + 2)

qq = 0
figi = None
search_name = 'Sherlock Holmes'

# Note: `G` is rebound here and no longer refers to the character network.
G, ndls, pold, subjd = trj.creatTestGraph_pandas_bip(
    start, end, protagonists, search_name, polarities, subj)
trajpdfs = trj.main_work_search_name(
    G, ndls, qq, figi, search_name, verb=False, plot_first_mode=False)
trajpdfs["['Sherlock Holmes']"]
%autoreload 2
import igraph as ig
from tools import igraph_draw_traj
filname='S_out_graphs/Sherlock Holmes_graph.graphml'
g,visual_style,layout=igraph_draw_traj(filname,pold)
ig.plot(g, **visual_style)
g,visual_style,layout=igraph_draw_traj(filname,subjd,polar=False,layout=layout)
ig.plot(g, **visual_style)
def search_in_list(x, name='Lestrade'):
    """Tell whether a character is listed in a sentence row.

    x    -- row-like object; ``x['protagonists']`` must support ``in``.
    name -- character to look for. Defaults to 'Lestrade', so the function
            can still be passed to ``DataFrame.apply(..., axis=1)`` without
            extra arguments.

    Returns True when ``name`` is contained in ``x['protagonists']``.
    """
    return name in x['protagonists']
# Sentences with more than one character, narrowed down to those for which
# search_in_list is true.
ndfl = dflines[dflines['#_of_protagonists'] > 1]
dialogs = ndfl[ndfl.apply(search_in_list, axis=1)]

# Per-sentence inputs for the trajectory graph.
protagonists = dialogs['protagonists'].tolist()
polarities = dialogs['polarity'].tolist()
subj = dialogs['subjectivity'].tolist()

# Sentence k (1-based) is given the interval [k, k + 1].
n_dialogs = len(protagonists)
start = range(1, n_dialogs + 1)
end = range(2, n_dialogs + 2)

qq = 0
figi = None
search_name = 'Lestrade'

# Note: `G`, `pold` and `subjd` are rebound here and replace the earlier values.
G, ndls, pold, subjd = trj.creatTestGraph_pandas_bip(
    start, end, protagonists, search_name, polarities, subj)
trajpdfs = trj.main_work_search_name(
    G, ndls, qq, figi, search_name, verb=False, plot_first_mode=False)
trajpdfs["['Lestrade']"]
%autoreload 2
import igraph as ig
from tools import igraph_draw_traj
filname='S_out_graphs/%s_graph.graphml' %search_name
g,visual_style,layout=igraph_draw_traj(filname,pold)
ig.plot(g, **visual_style)
g,visual_style,layout=igraph_draw_traj(filname,subjd,polar=False,layout=layout)
ig.plot(g, **visual_style)