# --- Notebook setup ----------------------------------------------------------
# %matplotlib inline is an IPython magic: render figures inside the notebook.
%matplotlib inline
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib.pyplot as plt
import matplotlib as mpl
from lightning import Lightning
import pandas as pd
import random
import warnings
import seaborn as sns
# Seaborn theme: white background with axis ticks.
sns.set_style("white")
sns.set_style("ticks")
# Project-local plotting/analysis helpers (tools.py, not a public package).
from tools import draw_centralities, draw_centralities_subplots, draw_centralities_subplots_dir, create_centralities_list
from tools import lgp, plot_light, plot_light_online, draw_comms,print_communities
# %autoreload 2
warnings.filterwarnings("ignore")
from IPython.display import HTML
# Inject a JS button that hides/shows all code cells in the rendered notebook.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
# Load the pickled hashtag co-occurrence graph and print basic statistics.
# (Python 2 print statements: this notebook targets Python 2 / networkx 1.x.)
G=nx.read_gpickle("chgp1.pic")
lv = len(G.nodes(data=True))
print 'The number of nodes (hashtags) of G is', lv
print
le = len(G.edges(data=True))
print 'The number of edges (hashtag co-occurrences in the same tweets) of G is', le
print
print 'The density of G is', nx.density(G)
print 'G is of type', type(G)
print
print 'Is G directed?', nx.is_directed(G)
print
lv = len(G.nodes(data=True))  # NOTE(review): redundant recomputation, lv is unchanged
print 'Is G connected?', nx.is_connected(G)
print
print 'The number of connected components of G is', nx.number_connected_components(G)
print
print 'The number of maximal cliques of G is', nx.graph_number_of_cliques(G)
print
print 'The average clustering coefficient of G is', nx.average_clustering(G)
# Flatten the weighted edge list into records for a pandas DataFrame.
ws=[{'from_node':ed[0],'to_node':ed[1],'weight':ed[2]['weight']} for ed in G.edges(data=True)]
edf=pd.DataFrame(ws)
edf.head(20)
# Find edges/weights from a node
node='sığınmacı'
# All edges incident to `node`, whichever endpoint column it appears in.
rr=pd.concat([edf[edf['from_node'] == node],edf[edf['to_node'] == node]])
# Find the weight of an edge
nodes=['sığınmacı','mülteci']
# nodes=['sığınmacı','sergios']
# Two-step filter: keep rows whose BOTH endpoints lie in `nodes`.
nn=edf[edf.from_node.isin(nodes)]
nn[nn.to_node.isin(nodes)]
edf.describe()
# Histogram of edge weights (hashtag co-occurrence counts).
plt.figure(figsize=(10,10))
bins=10
ax=sns.distplot(edf['weight'], bins=bins, kde=False, rug=False)
# Fixed: the axis labels were swapped/mislabeled -- the x axis carries the
# weight values, the y axis carries the bar heights (edges per weight bin).
plt.ylabel('Number of Edges')
plt.xlabel('Edge Weight')
tt='The Histogram of Edge Weights'
total = float(len(edf))  # kept for the commented-out normalized annotation
wws=[i['weight'] for i in ws]
# Annotate each bar with its height (count), roughly centered on the bar.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))#/total))
plt.title(tt)
warnings.filterwarnings("ignore")
# Degree of every hashtag; node labels are byte strings, decoded for display.
degrees=[{'hashtag':i.decode('utf-8'),'degree':nx.degree(G,i)} for i in G.nodes()]
ddf=pd.DataFrame(degrees)
k=0
# Keep hashtags with degree above threshold k, sorted by degree (descending).
ddf0=ddf[ddf.degree > k]
ddf0=ddf0[['hashtag','degree']].sort_values(by="degree",ascending=0)
# writer = pd.ExcelWriter('/home/mosesboudourides/Dropbox/Python Projects/DublinNovemer2016 Conf/outs/hashtagsP1.xlsx', engine='xlsxwriter')
# ddegst.to_excel(writer)
# writer.save()
ddf0.head(20)
# Find the degree of some node/hashtag
node_to_search='muslim'
ddf0[ddf0['hashtag'] == node_to_search]
# Searching for an inexistent node/hashtag
print G.has_node('Dublin')
ddf0[ddf0['hashtag'] == 'Dublin']
ddf0.describe()
# Degree histogram of the filtered hashtag table.
plt.figure(figsize=(10,10))
bins=10
ax=sns.distplot(ddf0['degree'], bins=bins, kde=False, rug=False)
# Fixed: labels were swapped -- the x axis carries degree values, the y axis
# carries the bar heights (number of nodes per degree bin).
plt.ylabel('Number of Nodes')
plt.xlabel('Degree')
tt='Degree Histogram'
total = float(len(ddf0))  # kept for the commented-out normalized annotation
wws=ddf0.degree.tolist()
# wws=[i['weight'] for i in ws]
# Annotate each bar with its height (count).
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))#/total))
plt.title(tt)
warnings.filterwarnings("ignore")
# Handshaking Theorem
# Sum of degrees equals twice the number of edges (evaluates to True).
sum(ddf0['degree']) == 2*len(G.edges())
# Connected components of G, largest first.
ccl=sorted(nx.connected_components(G), key = len, reverse=True)
# Check whether nodes/hashtags are in the same connected component
def find_common_comp(ccl,nodes):
    """Locate the connected component containing every node in ``nodes``.

    Parameters
    ----------
    ccl : list of node collections (e.g. sets), one per connected component.
    nodes : iterable of node labels to look up.

    Returns
    -------
    ``(index, component, 'are in')`` for the first component that contains
    all of ``nodes``; ``(' ', None, 'are not in the same')`` when no single
    component does.  The third element slots into the report strings
    printed by the callers below.
    """
    for i,gg in enumerate(ccl):
        if all(nd in gg for nd in nodes):
            return i,gg ,'are in'
    # Bug fix: the original `else` was attached to the in-loop `if`, so the
    # failure value was returned after inspecting only the FIRST component.
    # The failure return now happens only after every component was checked
    # (same pattern as find_common_comm below).
    return ' ', None, 'are not in the same'
# node=['yunanistan']
nodes1=['yunanistan','syria']
nodes1=['yunanistan','byy']  # NOTE(review): overwrites the previous pair; only this one is used
# print 'yunanistan' in G.neighbors('miracle')
nodes2=['yunanistan','usa']
print
# finding=find_common_comp(ccl,node)
finding1=find_common_comp(ccl,nodes1)
finding2=find_common_comp(ccl,nodes2)
print 'Nodes/hashtags', nodes1, '%s connected component %s' %(finding1[2],finding1[0])
print 'Nodes/hashtags', nodes2, '%s connected component %s' %(finding2[2],finding2[0])
# Component sizes as a DataFrame (one row per connected component).
ppf=pd.DataFrame([{'graph':i,'size':len(g)} for i,g in enumerate(ccl)])
ppf.describe()
# Histogram of connected-component sizes.
plt.figure(figsize=(10,10))
bins=4
ax=sns.distplot(ppf['size'], bins=bins, kde=False, rug=False)
plt.ylabel('Number of Connected Components')
plt.xlabel('Number of Nodes in Connected Components')
tt='The Histogram of Connected Components'
# ax = sns.distplot(x="class", hue="who", data=titanic)
total = float(len(ppf))
wws=ppf['size'].tolist()
# prin4t wws
# wws=[i['weight'] for i in ws]
# Annotate each bar with its height (count).
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))#/total))
plt.title(tt)
warnings.filterwarnings("ignore")
# Louvain community detection (python-louvain package).
import community as comms
from collections import Counter
# part maps node -> community id.
part=comms.best_partition(G)
npart=Counter()  # community id -> member count
nnpart={}#v:k for k,v in part.items()}
# nnpart inverts part: community id -> list of member nodes.
for pp,vv in part.items():
    npart[vv]+=1
    if vv not in nnpart:
        nnpart[vv]=[]
    nnpart[vv].append(pp)
# Community sizes, largest community first.
ppcom=pd.DataFrame([{'community':i,'size':k} for i,k in npart.items()]).sort_values(by="size",ascending=0)
print 'The number of communities is', max(part.values())+1
print 'The graph modularity coefficient is', comms.modularity(part,G)
print
print 'The size of the top 20 communities:'
ppcom.drop('community', axis=1).head(20)
print 'The statistics of community membership:'
ppcom.drop('community', axis=1).describe()
def find_common_comm(part,nodes):
    """Locate the community whose member list contains every node in ``nodes``.

    Parameters
    ----------
    part : dict mapping community id -> list of member nodes (i.e. ``nnpart``
        above), NOT the node -> community-id mapping returned by
        ``community.best_partition``.
    nodes : iterable of node labels.

    Returns
    -------
    ``(community_id, 'are in')`` for the first community holding all of
    ``nodes``, or ``(' ', 'are not in the same')`` when no single community
    does.  The second element slots into the callers' report strings.
    """
    for k,v in part.items():
        # Every requested node must be a member of this community.
        if all(nd in v for nd in nodes):
            return k,'are in'
    # (Removed the redundant `else: continue` branch; behavior unchanged.)
    return ' ','are not in the same'
nodes=['yunanistan','syria']
nodes=['yunanistan']  # NOTE(review): overwrites the previous list; only this one is used
finding=find_common_comm(nnpart,nodes)
print 'Nodes/hashtags ',nodes, '%s community %s ' %(finding[1],finding[0])
# Find nodes/hashtags in a community
commn=40
ll=[]
# Collect all nodes assigned to community `commn`.
for k,v in part.items():
    if v == commn:
        ll.append(k)
print 'The members of community', commn, 'are:'
print ll
# Histogram of community sizes.
plt.figure(figsize=(10,10))
bins=4
ax=sns.distplot(ppcom['size'], bins=bins, kde=False, rug=False)
plt.ylabel('Number of Communities')
plt.xlabel('Number of Nodes in Communities')
tt='The Histogram of Communities'
# ax = sns.distplot(x="class", hue="who", data=titanic)
total = float(len(ppcom))
wws=ppcom['size'].tolist()
# prin4t wws
# wws=[i['weight'] for i in ws]
# Annotate each bar with its height (count).
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))#/total))
plt.title(tt)
warnings.filterwarnings("ignore")
# Subgraph induced by the hashtags with degree above 800.
k=800
ddf800=ddf[ddf.degree > k]
ddf800=ddf800[['hashtag','degree']].sort_values(by="degree",ascending=0)
ddf800#.head(20)
ddf800.describe()
# Re-encode to byte strings to match the node labels stored in G.
hashtags=[i.encode('utf-8') for i in ddf800.hashtag.unique()]
Gh=nx.subgraph(G,hashtags)
print 'The network of hashtags with the top %i degrees has %i nodes (hashtags), %i edges and average clustering coefficient %.3f' %(k,len(Gh.nodes()),len(Gh.edges()),nx.average_clustering(Gh))
print
print 'The %i nodes (hashtags) of the network of hashtags with the top %i degrees are:' %(len(Gh.nodes()),k)
print Gh.nodes()
# Edge list of the top-degree subgraph, heaviest co-occurrences first.
cws=[{'from_node':ed[0],'to_node':ed[1],'weight':ed[2]['weight']} for ed in Gh.edges(data=True)]
cedf=pd.DataFrame(cws).sort_values(by="weight",ascending=0)
cedf.head(20)
nodes=['usa','germany']
nn=edf[edf.from_node.isin(nodes)]
nn[nn.to_node.isin(nodes)]
# Rebuild the top-degree subgraph with unicode node labels and an explicit
# weight on every edge (0 when the source edge carried no 'weight' attribute).
Ghh = nx.Graph()
for src, dst, attrs in Gh.edges(data=True):
    wei = attrs.get('weight', 0)
    Ghh.add_edge(src.decode('utf-8'), dst.decode('utf-8'), weight=wei)
# Circular layout; edge width proportional to the co-occurrence count.
pos = nx.circular_layout(Ghh)
edgewidth = [d['weight'] / 500. for _, _, d in Ghh.edges(data=True)]
plt.figure(figsize=(12,8))
# Highlight Paris-attack hashtags in red, everything else in green.
paris_at=['parisattacks','paris']
cols=['r' if nd in paris_at else 'g' for nd in Ghh.nodes() ]
# print cols
nn1=nx.draw_networkx_nodes(Ghh,pos, node_size=1000,node_color =cols,alpha=0.35) #with_labels=True,
nn2=nx.draw_networkx_edges(Ghh,pos,edge_color='b',width=edgewidth,alpha=0.35)
nn3=nx.draw_networkx_labels(Ghh,pos,font_size=15,font_color="k")
naxis=plt.axis('off')
# # print len(Ghh.nodes())
# group=[1 if nd in paris_at else 0 for nd in Ghh.nodes()]
# vis=plot_light(Ghh,label=2,size=10,group=group)
# vis
# Plot centrality subplots; with vals=True the helper also returns a dict,
# presumably {measure_name: {node: value}} -- see tools.draw_centralities_subplots.
centrali=draw_centralities_subplots(Ghh,pos,withLabels=True,labfs=15,figsi=(15,22),ealpha=0.25,vals=True)
dfc=pd.DataFrame()
u=0
# One DataFrame column per centrality measure; measures that failed are None.
# NOTE(review): `k.values()` / `centrali.keys()[0]` rely on Python 2 dicts
# (list-returning views); row alignment assumes a consistent key order.
for i,k in centrali.items():
    # print i,k
    if k is None:
        continue
    dfc.insert(u,i,k.values())
    u+=1
dfc.insert(0,'nodes',centrali[centrali.keys()[0]].keys())
# dfc.sort_values(by="betweenness_centrality",ascending=0)
dfc#.head(10)
# dfc[["nodes","degree_centrality"]].sort_values(by="degree_centrality",ascending=0)#.head(10)
# dfc[["nodes","closeness_centrality"]].sort_values(by="closeness_centrality",ascending=0)#.head(10)
dfc[["nodes","betweenness_centrality"]].sort_values(by="betweenness_centrality",ascending=0)#.head(10)
# dfc[["nodes","eigenvector_centrality"]].sort_values(by="eigenvector_centrality",ascending=0)#.head(10)
# dfc[["nodes","katz_centrality"]].sort_values(by="katz_centrality",ascending=0)#.head(10)
# dfc[["nodes","page_rank"]].sort_values(by="page_rank",ascending=0)#.head(10)
# Detect, report and draw the communities of Ghh (helpers from tools.py).
part,nodper=print_communities(Ghh,'the graph Ghh')
# d/dd/c/cc and the alphas are tuning knobs passed straight to draw_comms;
# presumably layout spread and transparency -- see tools.draw_comms for semantics.
d=0.8
dd=0.8
c=1.2
cc=1.4
alpha=0.25
ealpha=0.25
vcc={}
# sstta="The %s Communities of %s Network of Selected Noun Phrases" %(max(part.values())+1,titlename)#sstt)
draw_comms(Ghh,Ghh.nodes(),[],[],[] ,part,part,d,dd,c,cc,alpha,ealpha,nodper,'',titlefont=20,labelfont=17,valpha=0.5)
# Ego network of 'yunanistan': the ego node plus all of its neighbors.
ego='yunanistan'
alters=nx.neighbors(G,ego)  # networkx 1.x: returns a mutable list
alters.append(ego)
Ge=nx.subgraph(G,alters)
print 'The %s-egonetwork of hashtags has %i nodes and %i edges' %(ego,len(Ge.nodes()),len(Ge.edges()))
print
print 'The %i nodes of the %s-egonetwork of hashtags are:' %(len(Ge.nodes()),ego)
print Ge.nodes()
import math
# Rebuild the ego network with unicode node labels and an explicit weight on
# every edge (0 when the source edge carried no 'weight' attribute).
Gee=nx.Graph()
for nd in Ge.edges(data=True):
    ed=nd[0]
    de=nd[1]
    att_dici=nd[2]
    if 'weight' in att_dici:
        wei=att_dici['weight']
    else:
        wei=0
    Gee.add_edge(ed.decode('utf-8'),de.decode('utf-8'),weight=wei)
# print nx.is_connected(Gee)
paris_att=[u'parisunderattack','parisattacks','paris']
# pos=nx.circular_layout(Gee)
pos=nx.nx_agraph.graphviz_layout(Gee)
# nx.spring_layout(Gee)
# Edge widths on a log scale.  Fixed: math.log(0) raised ValueError for the
# zero-weight edges inserted by the fallback above; they now get minimal width.
edgewidth=[]
for (u,v,d) in Gee.edges(data=True):
    # edgewidth.append(d['weight']/200.)
    w=d['weight']
    if w > 0:
        edgewidth.append(1+math.log(w))
    else:
        edgewidth.append(1.)
plt.figure(figsize=(20,18))
# paris_att=[u'parisattacks',u'paris']
# Color code: red = Paris-attack hashtags, magenta = the ego node, green = others.
cols=[]
ggroups=[]
for nd in Gee.nodes():
    if nd in paris_att:
        cols.append('r')
        ggroups.append(0)
    elif nd == ego:
        cols.append('m')
        ggroups.append(1)
    else:
        cols.append('g')
        ggroups.append(2)
    # print nd,cols
# cols=['r' if nd in paris_att else 'g' for nd in Gee.nodes() ]
# ggroups=[0 if nd in paris_att else 1 for nd in Gee.nodes()]
# print cols
# print cols
nn1=nx.draw_networkx_nodes(Gee,pos, node_size=500,node_color =cols,alpha=0.35) #with_labels=True,
nn2=nx.draw_networkx_edges(Gee,pos,edge_color='b',width=edgewidth,alpha=0.35)
nn3=nx.draw_networkx_labels(Gee,pos,font_size=18,font_color="k")
naxis=plt.axis('off')
# ggroups=[0 if nd in paris_att else 1 for nd in Gee.nodes()]
# print ggroups
# vis=plot_light(Gee,label=2,group=ggroups,size=5)
# vis
# pos=nx.spring_layout(Gee)
# Centrality subplots for the ego network; vals=True also returns the values,
# presumably {measure_name: {node: value}} -- see tools.draw_centralities_subplots.
centrali=draw_centralities_subplots(Gee,pos,withLabels=True,labfs=10,figsi=(15,22),ealpha=0.25,vals=True)
dfc=pd.DataFrame()
u=0
# One DataFrame column per centrality measure; measures that failed are None.
# NOTE(review): `k.values()` / `centrali.keys()[0]` rely on Python 2 dicts
# (list-returning views); row alignment assumes a consistent key order.
for i,k in centrali.items():
    # print i,k
    if k is None:
        continue
    dfc.insert(u,i,k.values())
    u+=1
dfc.insert(0,'nodes',centrali[centrali.keys()[0]].keys())
# dfc.sort_values(by="betweenness_centrality",ascending=0)
dfc#.head(10)
# dfc[["nodes","degree_centrality"]].sort_values(by="degree_centrality",ascending=0)#.head(10)
# dfc[["nodes","closeness_centrality"]].sort_values(by="closeness_centrality",ascending=0)#.head(10)
dfc[["nodes","betweenness_centrality"]].sort_values(by="betweenness_centrality",ascending=0)#.head(10)
# dfc[["nodes","eigenvector_centrality"]].sort_values(by="eigenvector_centrality",ascending=0)#.head(10)
# dfc[["nodes","katz_centrality"]].sort_values(by="katz_centrality",ascending=0)#.head(10)
# dfc[["nodes","page_rank"]].sort_values(by="page_rank",ascending=0)#.head(10)
# Detect, report and draw the communities of Gee (helpers from tools.py).
part,nodper=print_communities(Gee,'the graph Gee')
# d/dd/c/cc and the alphas are tuning knobs passed straight to draw_comms;
# presumably layout spread and transparency -- see tools.draw_comms for semantics.
d=0.8
dd=0.8
c=1.2
cc=1.4
alpha=0.25
ealpha=0.25
vcc={}
# sstta="The %s Communities of %s Network of Selected Noun Phrases" %(max(part.values())+1,titlename)#sstt)
draw_comms(Gee,Gee.nodes(),[],[],[] ,part,part,d,dd,c,cc,alpha,ealpha,nodper,'',titlefont=20,labelfont=17,valpha=0.5)
# Combined ego network of several seed hashtags.
egos=['bodrum','parisunderattack']
sst='('  # builds a display label like "(bodrum,parisunderattack)"
alters=[]
for ego in egos:
    sst+=ego+','
    for alt in nx.neighbors(G,ego):
        alters.append(alt)
    alters.append(ego)
sst=sst[:-1]+')'  # swap the trailing comma for the closing parenthesis
Ge=nx.subgraph(G,alters)
print 'The %s-egonetwork of hashtags has %i nodes and %i edges' %(sst,len(Ge.nodes()),len(Ge.edges()))
print
# NOTE(review): `ego` below is the loop leftover (last seed), not the full set.
print 'The %i nodes of the %s-egonetwork of hashtags are:' %(len(Ge.nodes()),ego)
print Ge.nodes()
import math
# Rebuild the combined ego network with unicode node labels and an explicit
# weight on every edge (0 when the source edge carried no 'weight' attribute).
GeE=nx.Graph()
for nd in Ge.edges(data=True):
    ed=nd[0]
    de=nd[1]
    att_dici=nd[2]
    if 'weight' in att_dici:
        wei=att_dici['weight']
    else:
        wei=0
    GeE.add_edge(ed.decode('utf-8'),de.decode('utf-8'),weight=wei)
# print nx.is_connected(Gee)
paris_att=['parisattacks','paris']
# pos=nx.circular_layout(Gee)
# pos=nx.graphviz_layout(Gee)
pos=nx.nx_agraph.graphviz_layout(GeE)
nx.spring_layout(GeE)  # NOTE(review): result discarded; `pos` above is what is used
# Edge widths on a log scale.  Fixed: math.log(0) raised ValueError for the
# zero-weight edges inserted by the fallback above; they now get minimal width.
edgewidth=[]
for (u,v,d) in GeE.edges(data=True):
    # edgewidth.append(d['weight']/200.)
    w=d['weight']
    if w > 0:
        edgewidth.append(1+math.log(w))
    else:
        edgewidth.append(1.)
plt.figure(figsize=(20,18))
# paris_att=[u'parisattacks',u'paris']
# Color code: red = Paris-attack hashtags, magenta = seed/ego nodes, green = others.
cols=[]
ggroups=[]
for nd in GeE.nodes():
    if nd in paris_att:
        cols.append('r')
        ggroups.append(0)
    elif nd in egos:
        cols.append('m')
        ggroups.append(1)
    else:
        cols.append('g')
        ggroups.append(2)
    # print nd,cols
# cols=['r' if nd in paris_att else 'g' for nd in Gee.nodes() ]
# ggroups=[0 if nd in paris_att else 1 for nd in Gee.nodes()]
# print cols
# print cols
nn1=nx.draw_networkx_nodes(GeE,pos, node_size=500,node_color =cols,alpha=0.35) #with_labels=True,
nn2=nx.draw_networkx_edges(GeE,pos,edge_color='b',width=edgewidth,alpha=0.35)
nn3=nx.draw_networkx_labels(GeE,pos,font_size=18,font_color="k")
naxis=plt.axis('off')
# vis=plot_light(Gee,label=2,group=ggroups,size=5)
# vis
centrali=draw_centralities_subplots(GeE,pos,withLabels=True,labfs=12,figsi=(15,22),ealpha=0.25,vals=True)
dfc=pd.DataFrame()
u=0
for i,k in centrali.items():
# print i,k
if k is None:
continue
dfc.insert(u,i,k.values())
u+=1
dfc.insert(0,'nodes',centrali[centrali.keys()[0]].keys())
# dfc.sort_values(by="betweenness_centrality",ascending=0)
dfc#.head(10)
# nodename='refugees'
# dfc[dfc['Nodes'] == nodename]
# dfc[["nodes","degree_centrality"]].sort_values(by="degree_centrality",ascending=0)#.head(10)
# dfc[["nodes","closeness_centrality"]].sort_values(by="closeness_centrality",ascending=0)#.head(10)
dfc[["nodes","betweenness_centrality"]].sort_values(by="betweenness_centrality",ascending=0)#.head(10)
# dfc[["nodes","eigenvector_centrality"]].sort_values(by="eigenvector_centrality",ascending=0)#.head(10)
# dfc[["nodes","katz_centrality"]].sort_values(by="katz_centrality",ascending=0)#.head(10)
# dfc[["nodes","page_rank"]].sort_values(by="page_rank",ascending=0)#.head(10)
# Detect, report and draw the communities of GeE (helpers from tools.py).
part,nodper=print_communities(GeE,'the graph GeE')
# d/dd/c/cc and the alphas are tuning knobs passed straight to draw_comms;
# presumably layout spread and transparency -- see tools.draw_comms for semantics.
d=0.8
dd=0.8
c=1.2
cc=1.4
alpha=0.25
ealpha=0.25
vcc={}
# sstta="The %s Communities of %s Network of Selected Noun Phrases" %(max(part.values())+1,titlename)#sstt)
draw_comms(GeE,GeE.nodes(),[],[],[] ,part,part,d,dd,c,cc,alpha,ealpha,nodper,'',titlefont=20,labelfont=17,valpha=0.5)