EXPLORATORY TWITTER NETWORK ANALYSIS WITH PYTHON

I. CO-OCCURRENT HASHTAG NETWORKS

EXTRACTED FROM TWITTER DATA ON REFUGEES IN OCTOBER-DECEMBER 2015

By Moses A. Boudourides & Sergios T. Lenis

In [1]:
%matplotlib inline

import networkx as nx 
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib.pyplot as plt
import matplotlib as mpl
from lightning import Lightning
import pandas as pd
import random
import warnings
import seaborn as sns
sns.set_style("white")
sns.set_style("ticks") 
from tools import draw_centralities, draw_centralities_subplots, draw_centralities_subplots_dir, create_centralities_list
from tools import lgp, plot_light, plot_light_online, draw_comms,print_communities
# %autoreload 2
warnings.filterwarnings("ignore")
In [2]:
from IPython.display import HTML

# Inject a small JavaScript snippet so readers of the rendered notebook can
# hide/show all raw code cells via a toggle button (code starts hidden).
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[2]:

1. The Graph

In [3]:
# Load the pre-built hashtag co-occurrence graph (a pickled networkx Graph).
# NOTE(review): read_gpickle uses pickle internally — only load trusted files.
G=nx.read_gpickle("chgp1.pic")
In [4]:
# Basic size statistics of the co-occurrence graph G.
lv = len(G.nodes(data=True))
print 'The number of nodes (hashtags) of G is', lv
print
le = len(G.edges(data=True))
print 'The number of edges (hashtag co-occurrences in the same tweets) of G is', le
print
# Density = fraction of possible node pairs actually connected.
print 'The density of G is', nx.density(G)
The number of nodes (hashtags) of G is 31589

The number of edges (hashtag co-occurrences in the same tweets) of G is 142052

The density of G is 0.000284720895654

General Structure of the Graph

In [5]:
# Global structural properties: directedness, connectivity, cliques, clustering.
print 'G is of type', type(G)
print
print 'Is G directed?', nx.is_directed(G)
print
lv = len(G.nodes(data=True))
print 'Is G connected?', nx.is_connected(G)
print
print 'The number of connected components of G is', nx.number_connected_components(G)
print 
print 'The number of maximal cliques of G is', nx.graph_number_of_cliques(G)
print 
print 'The average clustering coefficient of G is', nx.average_clustering(G)
G is of type <class 'networkx.classes.graph.Graph'>

Is G directed? False

Is G connected? False

The number of connected components of G is 1512

The number of maximal cliques of G is 104120

The average clustering coefficient of G is 0.593698188721

1.1. Statistics of Edge Weights

In [6]:
# Flatten G's weighted edge list into a DataFrame, one row per edge.
ws = [
    {'from_node': u, 'to_node': v, 'weight': attrs['weight']}
    for u, v, attrs in G.edges(data=True)
]
edf = pd.DataFrame(ws)
edf.head(20)
Out[6]:
from_node to_node weight
0 europeancommission refugee 1
1 europeancommission refugeecrisis 1
2 europeancommission westernbalkans 1
3 adopteerights fox23 1
4 givingjustice refugee 1
5 givingjustice plannedparenthood 1
6 givingjustice givingtuesday 1
7 givingjustice blacklivesmatter 1
8 woods bahrain 2
9 woods countryhouse 1
10 woods jual 2
11 woods country 1
12 woods tigers 2
13 woods imagine 2
14 woods tree 2
15 woods foodpics 2
16 woods ap 2
17 woods losers 2
18 woods house 1
19 woods refugee 1
In [7]:
# Find edges/weights from a node

# All edges incident to `node`: rows where it appears as either endpoint.
node='sığınmacı' 
rr=pd.concat([edf[edf['from_node'] == node],edf[edf['to_node'] == node]])
In [8]:
# Look up the weight of the single edge joining two given hashtags.

nodes = ['sığınmacı', 'mülteci']
# nodes = ['sığınmacı', 'sergios']

nn = edf.loc[edf['from_node'].isin(nodes)]
nn.loc[nn['to_node'].isin(nodes)]
Out[8]:
from_node to_node weight
113672 sığınmacı mülteci 45
In [9]:
# Summary statistics of the edge-weight distribution.
edf.describe()
Out[9]:
weight
count 142052.000000
mean 6.731021
std 62.685006
min 1.000000
25% 1.000000
50% 1.000000
75% 3.000000
max 7552.000000
In [10]:
# Histogram of edge weights. Fix: the original axis labels were swapped —
# the x-axis carries the weight values and the y-axis the per-bin edge counts
# (compare the correctly-labeled connected-components histogram below).
plt.figure(figsize=(10,10))
bins=10
ax=sns.distplot(edf['weight'], bins=bins, kde=False, rug=False)
plt.ylabel('Number of Edges')
plt.xlabel('Edge Weight')
tt='The Histogram of Edge Weights' 
total = float(len(edf))
wws=[i['weight'] for i in ws]
# Annotate each bar with its raw count, roughly centered above the bar.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))
plt.title(tt)

warnings.filterwarnings("ignore")

1.2. Statistics of Node Degrees

In [11]:
# Per-node degree table (Python 2: node labels are byte strings, so decode
# to unicode for display).
degrees=[{'hashtag':i.decode('utf-8'),'degree':nx.degree(G,i)} for i in G.nodes()]
ddf=pd.DataFrame(degrees)

# Keep nodes with degree > k (k=0 keeps every node) and sort descending.
k=0
ddf0=ddf[ddf.degree > k]
ddf0=ddf0[['hashtag','degree']].sort_values(by="degree",ascending=0)

# Optional Excel export (disabled):
# writer = pd.ExcelWriter('.../hashtagsP1.xlsx', engine='xlsxwriter')
# ddegst.to_excel(writer)
# writer.save()

ddf0.head(20)
Out[11]:
hashtag degree
2060 refugee 9842
19557 refugees 3042
18869 syria 2313
27276 refugeecrisis 1927
20622 syrian 1866
19117 refugeeswelcome 1759
16035 syrianrefugees 1743
6054 news 1637
25704 tcot 1242
29570 paris 1222
9981 parisattacks 1160
2325 isis 1157
10340 eu 1094
14169 np 1081
24845 europe 1067
5281 flüchtling 959
2101 germany 952
15250 usa 885
8869 obama 858
6306 crisis 814
In [12]:
# Find the degree of some node/hashtag

node_to_search='muslim'
ddf0[ddf0['hashtag'] == node_to_search]
Out[12]:
hashtag degree
18266 muslim 536
In [13]:
# Searching for a nonexistent node/hashtag — 'Dublin' is absent (the hashtags
# in G appear to be lowercased; TODO confirm against the extraction pipeline).
print G.has_node('Dublin')
ddf0[ddf0['hashtag'] == 'Dublin']
False
Out[13]:
hashtag degree
In [14]:
# Summary statistics of the degree distribution.
ddf0.describe()
Out[14]:
degree
count 31589.000000
mean 8.993764
std 70.910260
min 1.000000
25% 2.000000
50% 3.000000
75% 6.000000
max 9842.000000
In [15]:
# Degree histogram. Fix: the original axis labels were swapped — the x-axis
# carries the degree values and the y-axis the per-bin node counts.
plt.figure(figsize=(10,10))
bins=10
ax=sns.distplot(ddf0['degree'], bins=bins, kde=False, rug=False)
plt.ylabel('Number of Nodes')
plt.xlabel('Degree')
tt='Degree Histogram' 
total = float(len(ddf0))
wws=ddf0.degree.tolist()
# Annotate each bar with its raw count.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))
plt.title(tt)

warnings.filterwarnings("ignore")
In [16]:
# Handshaking Theorem: the sum of all node degrees equals twice the number of
# edges (holds here because the k=0 cut above kept every node in ddf0).
sum(ddf0['degree']) == 2*len(G.edges())
Out[16]:
True

1.3. Connected Components

In [17]:
# Connected components of G as node sets, sorted largest-first.
ccl=sorted(nx.connected_components(G), key = len, reverse=True)
In [18]:
# Check whether nodes/hashtags are in the same connected component

def find_common_comp(ccl,nodes):
    """Find the connected component (if any) containing every node in `nodes`.

    Parameters:
        ccl   -- list of components (each a collection of node labels),
                 e.g. the output of nx.connected_components sorted by size.
        nodes -- iterable of node labels to look up together.

    Returns (index, component, 'are in') for the first component containing
    all of `nodes`, otherwise (' ', None, 'are not in the same').

    Bug fix: the original returned the not-found triple as soon as the FIRST
    component failed the membership test, so components beyond ccl[0] were
    never examined and co-membership in smaller components was misreported.
    """
    for i,gg in enumerate(ccl):
        if all(nd in gg for nd in nodes):
            return i,gg ,'are in'
    # Only after every component has been checked can we conclude "not found".
    return ' ', None, 'are not in the same'
        
# node=['yunanistan']
nodes1=['yunanistan','syria']

# NOTE(review): nodes1 is immediately overwritten, so the
# ['yunanistan','syria'] pair above is never actually tested.
nodes1=['yunanistan','byy']
# print 'yunanistan' in G.neighbors('miracle')
nodes2=['yunanistan','usa']
print 
# finding=find_common_comp(ccl,node)
finding1=find_common_comp(ccl,nodes1)
finding2=find_common_comp(ccl,nodes2)
print 'Nodes/hashtags', nodes1, '%s connected component %s' %(finding1[2],finding1[0])
print 'Nodes/hashtags', nodes2, '%s connected component %s' %(finding2[2],finding2[0])
Nodes/hashtags ['yunanistan', 'byy'] are not in the same connected component  
Nodes/hashtags ['yunanistan', 'usa'] are in connected component 0
In [19]:
# One row per connected component with its index and node count.
ppf=pd.DataFrame([{'graph':i,'size':len(g)} for i,g in enumerate(ccl)])
In [20]:
# Summary statistics of component sizes (skewed: one giant component, many pairs).
ppf.describe()
Out[20]:
graph size
count 1512.000000 1512.000000
mean 755.500000 20.892196
std 436.621117 721.745801
min 0.000000 2.000000
25% 377.750000 2.000000
50% 755.500000 2.000000
75% 1133.250000 2.000000
max 1511.000000 28067.000000
In [21]:
# Histogram of connected-component sizes (x = nodes per component, y = count).
plt.figure(figsize=(10,10))
bins=4
ax=sns.distplot(ppf['size'], bins=bins, kde=False, rug=False)
plt.ylabel('Number of Connected Components')
plt.xlabel('Number of Nodes in Connected Components')
tt='The Histogram of Connected Components' 
total = float(len(ppf))
wws=ppf['size'].tolist()
# Annotate each bar with its raw count.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))#/total))
plt.title(tt)

warnings.filterwarnings("ignore")

1.4. Communities

In [22]:
import community as comms
from collections import Counter
# Louvain community detection (python-louvain): `part` maps node -> community id.
part=comms.best_partition(G)
npart=Counter()  # community id -> member count
nnpart={}        # community id -> list of member nodes
for pp,vv in part.items():
    npart[vv]+=1
    if vv not in nnpart:
        nnpart[vv]=[]
    nnpart[vv].append(pp)
# Community sizes, largest first.
ppcom=pd.DataFrame([{'community':i,'size':k} for i,k in npart.items()]).sort_values(by="size",ascending=0)

print 'The number of communities is', max(part.values())+1
print 'The graph modularity coefficient is', comms.modularity(part,G)
print
print 'The size of the top 20 communities:'
ppcom.drop('community', axis=1).head(20)
The number of communities is 1668
The graph modularity coefficient is 0.518913510348

The size of the top 20 communities:
Out[22]:
size
0 5545
2 3182
8 2966
11 2698
6 2668
5 2479
25 1343
12 1334
14 958
21 748
15 733
20 538
41 397
16 350
19 225
30 196
72 193
56 175
17 112
158 90
In [23]:
print 'The statistics of community membership:'
# Distribution of community sizes (dominated by many tiny communities).
ppcom.drop('community', axis=1).describe()
The statistics of community membership:
Out[23]:
size
count 1668.000000
mean 18.938249
std 213.512854
min 2.000000
25% 2.000000
50% 2.000000
75% 3.000000
max 5545.000000
In [24]:
def find_common_comm(part,nodes):
    """Locate the first community whose member list contains every node.

    Parameters:
        part  -- mapping of community id -> list of member nodes
                 (e.g. the inverted Louvain partition `nnpart`).
        nodes -- iterable of node labels to look up together.

    Returns (community_id, 'are in') on success, otherwise
    (' ', 'are not in the same').
    """
    for comm_id, members in part.items():
        if all(nd in members for nd in nodes):
            return comm_id, 'are in'
    return ' ', 'are not in the same'
nodes=['yunanistan','syria']
# NOTE(review): `nodes` is immediately overwritten, so only ['yunanistan']
# is actually tested below.
nodes=['yunanistan']       
finding=find_common_comm(nnpart,nodes)
print 'Nodes/hashtags ',nodes, '%s community %s ' %(finding[1],finding[0])
Nodes/hashtags  ['yunanistan'] are in community 14 
In [25]:
# Find nodes/hashtags in a community

# Collect every node assigned to community `commn` by the Louvain partition.
commn=40
ll=[]
for k,v in part.items():
    if v == commn:
        ll.append(k)
print 'The members of community', commn, 'are:'
print ll
The members of community 40 are:
['sportsdiplomacy', 'pdnews', 'sportsmedia', 'premiership', 'internationaldevelopment', 'bpl', 'premierleague', 'epl']
In [26]:
# Histogram of community sizes (x = nodes per community, y = count).
plt.figure(figsize=(10,10))
bins=4
ax=sns.distplot(ppcom['size'], bins=bins, kde=False, rug=False)
plt.ylabel('Number of Communities')
plt.xlabel('Number of Nodes in Communities')
tt='The Histogram of Communities' 
total = float(len(ppcom))
wws=ppcom['size'].tolist()
# Annotate each bar with its raw count.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))#/total))
plt.title(tt)

warnings.filterwarnings("ignore")

2. Graph Cuts by Degree

In [27]:
# Cut the degree table at degree > 800, keeping only the top-degree hashtags,
# sorted from most- to least-connected.
k = 800
ddf800 = (
    ddf.loc[ddf['degree'] > k, ['hashtag', 'degree']]
       .sort_values(by="degree", ascending=False)
)
ddf800
Out[27]:
hashtag degree
2060 refugee 9842
19557 refugees 3042
18869 syria 2313
27276 refugeecrisis 1927
20622 syrian 1866
19117 refugeeswelcome 1759
16035 syrianrefugees 1743
6054 news 1637
25704 tcot 1242
29570 paris 1222
9981 parisattacks 1160
2325 isis 1157
10340 eu 1094
14169 np 1081
24845 europe 1067
5281 flüchtling 959
2101 germany 952
15250 usa 885
8869 obama 858
6306 crisis 814
In [28]:
# Summary statistics of the top-degree cut.
ddf800.describe()
Out[28]:
degree
count 20.000000
mean 1831.000000
std 1968.924553
min 814.000000
25% 1040.000000
50% 1191.000000
75% 1785.750000
max 9842.000000
In [29]:
# Induced subgraph on the top-degree hashtags. (Python 2: re-encode the
# unicode labels back to UTF-8 byte strings so they match G's node keys.)
hashtags=[i.encode('utf-8') for i in ddf800.hashtag.unique()] 
Gh=nx.subgraph(G,hashtags)

print 'The network of hashtags with the top %i degrees has %i nodes (hashtags), %i edges and average clustering coefficient %.3f' %(k,len(Gh.nodes()),len(Gh.edges()),nx.average_clustering(Gh))
print
print 'The %i nodes (hashtags) of the network of hashtags with the top %i degrees are:' %(len(Gh.nodes()),k)
print Gh.nodes()
The network of hashtags with the top 800 degrees has 20 nodes (hashtags), 171 edges and average clustering coefficient 0.959

The 20 nodes (hashtags) of the network of hashtags with the top 800 degrees are:
['europe', 'parisattacks', 'usa', 'isis', 'fl\xc3\xbcchtling', 'tcot', 'paris', 'syrian', 'refugee', 'syrianrefugees', 'np', 'germany', 'refugeeswelcome', 'eu', 'syria', 'news', 'refugeecrisis', 'refugees', 'crisis', 'obama']
In [30]:
# Edge list of the top-degree subgraph, heaviest co-occurrences first.
cws = [
    {'from_node': u, 'to_node': v, 'weight': attrs['weight']}
    for u, v, attrs in Gh.edges(data=True)
]
cedf = pd.DataFrame(cws).sort_values(by="weight", ascending=False)
cedf.head(20)
Out[30]:
from_node to_node weight
109 refugee refugeecrisis 5625
110 refugee syrian 3455
107 refugee syria 3223
112 refugee obama 3171
77 paris refugee 2620
38 isis refugee 2473
7 europe refugee 1750
105 refugee refugeeswelcome 1727
103 refugee eu 1508
94 refugees syria 1462
20 syrianrefugees tcot 1370
64 usa refugee 1346
75 paris tcot 1343
97 refugees refugeecrisis 1342
101 refugee tcot 1331
78 paris parisattacks 1232
86 paris syrian 1167
104 refugee germany 1139
102 refugee parisattacks 1112
49 isis obama 1032
In [31]:
# Weight of the edge between 'usa' and 'germany' in the full edge table.
nodes=['usa','germany']
nn=edf[edf.from_node.isin(nodes)] 
nn[nn.to_node.isin(nodes)]
Out[31]:
from_node to_node weight
25401 germany usa 27

2.1. Visualization

In [32]:
# Rebuild the subgraph with unicode node labels for plotting (Python 2:
# Gh's node keys are byte strings; decode them for matplotlib text).
Ghh=nx.Graph()
for nd in Gh.edges(data=True):
    ed=nd[0]
    de=nd[1]
    att_dici=nd[2]
    # Default to weight 0 if an edge somehow lacks the attribute.
    if 'weight' in att_dici:
        wei=att_dici['weight']
    else:
        wei=0
    
    Ghh.add_edge(ed.decode('utf-8'),de.decode('utf-8'),weight=wei)

pos=nx.circular_layout(Ghh)

# Edge width proportional to co-occurrence weight (scaled down by 500).
edgewidth=[]
for (u,v,d) in Ghh.edges(data=True):
    edgewidth.append(d['weight']/500.)
plt.figure(figsize=(12,8))
# Highlight the Paris-attacks hashtags in red; all others in green.
paris_at=['parisattacks','paris']
cols=['r' if nd in paris_at else 'g' for nd in Ghh.nodes() ]
nn1=nx.draw_networkx_nodes(Ghh,pos, node_size=1000,node_color =cols,alpha=0.35) #with_labels=True,
nn2=nx.draw_networkx_edges(Ghh,pos,edge_color='b',width=edgewidth,alpha=0.35)
nn3=nx.draw_networkx_labels(Ghh,pos,font_size=15,font_color="k")
# Assigning the return values suppresses the drawing functions' noisy reprs.
naxis=plt.axis('off')
In [33]:
# # print len(Ghh.nodes())
# group=[1 if nd in paris_at else 0 for nd in Ghh.nodes()]
# vis=plot_light(Ghh,label=2,size=10,group=group)
# vis

2.2. Centralities

In [34]:
# Plot several centrality measures for the top-degree subgraph side by side
# (helper imported from the local `tools` module).
centrali=draw_centralities_subplots(Ghh,pos,withLabels=True,labfs=15,figsi=(15,22),ealpha=0.25,vals=True)