EXPLORATORY TWITTER NETWORK ANALYSIS WITH PYTHON

II. USERS' MENTIONS NETWORKS

EXTRACTED FROM TWITTER DATA ON REFUGEES IN OCTOBER-DECEMBER 2015

By Moses A. Boudourides & Sergios T. Lenis

In [1]:
%matplotlib inline

import networkx as nx 
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib.pyplot as plt
import matplotlib as mpl
from lightning import Lightning
import pandas as pd
import random
import warnings
import seaborn as sns
sns.set_style("white")
sns.set_style("ticks") 
from tools import draw_centralities, draw_centralities_subplots,draw_centralities_subplots_dir,  create_centralities_list
from tools import lgp, plot_light, plot_light_online,draw_comms,print_communities
# %autoreload 2
warnings.filterwarnings("ignore")
In [2]:
from IPython.display import HTML

# Presentation aid: inject a jQuery snippet that hides all input cells by
# default and adds a button to toggle the raw code on/off in the rendered
# notebook. Affects display only, not execution.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[2]:

1. The Graph

In [3]:
G=nx.read_gpickle("umgp1.pic")
In [4]:
# Map Twitter user id -> screen-name (node attribute 'name'), plus the
# inverse map screen-name -> id used later for the degree-based graph cuts.
names={i:G.node[i]['name'] for i in G.nodes()}
invnames = {v:k for k, v in names.items()}
# NOTE(review): invnames collapses duplicate screen-names (e.g. the many
# 'Not found' entries) onto a single arbitrary id.

# fop=open('/home/mosesboudourides/Dropbox/Python Projects/DublinNovemer2016 Conf/outs/names_dict.txt','w')
# for k,v in names.items():
#     fop.write('%s , %s\n' %(k,v))
# fop.close()
In [5]:
print 'The total number of users is', len(names)
# nf=[]
# for k,v in names.items():
#     if v == 'Not found'
nf = [k for k,v in names.items() if v == 'Not found']
print 'Te number of users without screen-names is', len(nf)
The total number of users is 107630
The number of users without screen-names is 21944
In [6]:
# Basic size statistics of the mentions digraph: node count, edge count
# and density.
lv = len(G.nodes(data=True))
print 'The number of nodes (users) of G is', lv
print
le = len(G.edges(data=True))
print 'The number of edges (mentions) of G is', le
print
print 'The density of G is', nx.density(G)
The number of nodes (users) of G is 107630

The number of edges (mentions) of G is 107240

The density of G is 9.25750936049e-06

General Structure of the Graph

In [7]:
# Structural overview of G: type, directedness, and strong/weak
# connectivity with the corresponding component counts.
print 'G is of type', type(G)
print
print 'Is G directed?', nx.is_directed(G)
print
lv = len(G.nodes(data=True))
print 'Is G strongly connected?', nx.is_strongly_connected(G)
print
print 'The number of strongly connected components of G is', nx.number_strongly_connected_components(G)
print 
print 'Is G weakly connected?', nx.is_weakly_connected(G)
print
print 'The number of weakly connected components of G is', nx.number_weakly_connected_components(G)
G is of type <class 'networkx.classes.digraph.DiGraph'>

Is G directed? True

Is G strongly connected? False

The number of strongly connected components of G is 104869

Is G weakly connected? False

The number of weakly connected components of G is 25483

1.1. Statistics of Edge Weights

In [8]:
# Flatten the weighted edge list into one record per directed mention
# edge, keyed by screen-name; edf is the edge table used below.
ws=[{'from_node':names[ed[0]],'to_node':names[ed[1]],'weight':ed[2]['weight']} for ed in G.edges(data=True)]
edf=pd.DataFrame(ws)
edf.head(20)

# Optional export of the edge table to Excel:
# writer = pd.ExcelWriter('/home/mosesboudourides/Dropbox/Python Projects/DublinNovemer2016 Conf/outs/n_mentions_P1.xlsx', engine='xlsxwriter')
# edf.to_excel(writer)
# writer.save()
Out[8]:
from_node to_node weight
0 alec_dh mcclure111 1
1 joshbeall Not found 2
2 Tierrah46 KLSouth 1
3 Tierrah46 BillPeriman 1
4 Tierrah46 AsaHutchinson 1
5 Tierrah46 ljcambria 1
6 Tierrah46 Not found 1
7 Tierrah46 tradethecycles 1
8 Tierrah46 Not found 1
9 Tierrah46 SenTomCotton 1
10 Tierrah46 blove65 3
11 Tierrah46 JohnBoozman 1
12 IlhamSalim2 Not found 1
13 HotLiberalGuy Morning_Joe 1
14 okanzmen5 fatihportakal 1
15 00JStafford00 Not found 1
16 OhioCoastie KurtSchlichter 1
17 OhioCoastie OhioCoastie 1
18 OhioCoastie stjohnswoods 1
19 bmiller3504 Not found 1
In [9]:
# Find edges/weights from a user

# All rows where the given user is either the source or the target of a
# mention; a self-loop (user mentioning itself) appears in both halves.
node='refugee_archive' 
pd.concat([edf[edf['from_node'] == node],edf[edf['to_node'] == node]])
Out[9]:
from_node to_node weight
40059 refugee_archive JuliaBlocher 2
40060 refugee_archive mishmadsen 1
40061 refugee_archive EleanorDavey 1
40062 refugee_archive Lwam_Tesfay 1
40063 refugee_archive livuniLUCAS 1
40064 refugee_archive refugee_archive 2
40065 refugee_archive PaulDudman 1
40066 refugee_archive Not found 1
40067 refugee_archive IHRday_UEL 1
11880 Cop21Direct refugee_archive 1
24200 kirrisriviere refugee_archive 1
24821 PosNegOrg refugee_archive 1
38983 EleanorDavey refugee_archive 1
40064 refugee_archive refugee_archive 2
50313 JuliaBlocher refugee_archive 1
50773 drbrainspatula refugee_archive 1
55227 1_bunty refugee_archive 1
55343 A_Human_Crisis refugee_archive 1
56633 worldinlondon refugee_archive 1
60040 aseelsaaddawood refugee_archive 1
61425 CDB_77 refugee_archive 1
62922 StrongCitizens refugee_archive 4
64631 livuniLUCAS refugee_archive 1
72187 swamibami refugee_archive 1
77683 IHRday_UEL refugee_archive 1
83279 adimendoza8 refugee_archive 1
86386 CivicLeicester refugee_archive 1
102203 BalkanNewsbeat refugee_archive 1
In [10]:
# Find the weight of an edge

# Find the weight of an edge

nodes=['TheresaMechele','realDonaldTrump']
# nodes=['dennyreedart','sergios']

# Keep rows whose source AND target both belong to `nodes`.
nn=edf[edf.from_node.isin(nodes)] 
nn[nn.to_node.isin(nodes)]
Out[10]:
from_node to_node weight
81692 TheresaMechele realDonaldTrump 32
In [11]:
edf.describe()
Out[11]:
weight
count 107240.000000
mean 1.157301
std 1.109615
min 1.000000
25% 1.000000
50% 1.000000
75% 1.000000
max 103.000000
In [12]:
# Histogram of edge weights (number of mentions per directed edge).
plt.figure(figsize=(10,10))
bins=10
ax=sns.distplot(edf['weight'], bins=bins, kde=False, rug=False)
# The x-axis carries the weight value and the y-axis counts edges;
# the original labels had these the wrong way around.
plt.ylabel('Number of Edges')
plt.xlabel('Edge Weight')
tt='The Histogram of Edge Weights' 
total = float(len(edf))
wws=[i['weight'] for i in ws]
# Annotate each bar with its absolute count.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))#/total))
plt.title(tt)

warnings.filterwarnings("ignore")

1.2. Statistics of Node Degrees

In [13]:
# Per-user in-, out- and total degree, tabulated and sorted by total
# degree; k is a degree threshold (k=0 keeps every listed user).
degrees=[{'user':names[i],'total_degree':nx.degree(G,i),'in_degree':G.in_degree(i),'out_degree':G.out_degree(i)} for i in G.nodes()]
ddf=pd.DataFrame(degrees)

k=0
ddf0=ddf[ddf.total_degree > k]
ddf0=ddf0[['user','in_degree','out_degree','total_degree']].sort_values(by="total_degree",ascending=0)

# Optional export of the degree table to Excel:
# writer = pd.ExcelWriter('/home/mosesboudourides/Dropbox/Python Projects/DublinNovemer2016 Conf/outs/n_mentions_degrees_P1.xlsx', engine='xlsxwriter')
# ddf0.to_excel(writer)
# writer.save()

ddf0.head(20)
Out[13]:
user in_degree out_degree total_degree
87104 realDonaldTrump 1046 0 1046
9192 Not found 595 0 595
77185 FoxNews 545 0 545
68607 EbolaOutbreakUS 2 539 541
36855 SpeakerRyan 536 0 536
41069 BlueLotusDC 2 494 496
42162 CNN 435 0 435
49167 Not found 417 0 417
91441 feru012 1 401 402
31357 ShareTheMealorg 4 374 378
20095 tedcruz 359 1 360
8259 Lateenohs4Trump 10 288 298
8282 seanhannity 270 0 270
104682 GOP_Refugee_16 1 269 270
104573 Not found 250 0 250
33165 mgtmag13 2 236 238
56846 WhiteHouse 235 0 235
9711 HillaryClinton 234 0 234
2444 migrantslives 2 224 226
44906 thehill 198 0 198
In [14]:
# Find the degrees of some node/user

# Look up one user's row in the degree table.
user_to_search='U2_REFUGEE'
ddf0[ddf0['user'] == user_to_search]
Out[14]:
user in_degree out_degree total_degree
45692 U2_REFUGEE 37 38 75
In [15]:
# Searching for an inexistent node/user

# A user absent from G: has_node is False and the lookup is empty.
print G.has_node('mosabou')
ddf0[ddf0['user'] == 'mosabou']
False
Out[15]:
user in_degree out_degree total_degree
In [16]:
ddf0.describe()
Out[16]:
in_degree out_degree total_degree
count 107630.000000 107630.000000 107630.000000
mean 0.996376 0.996376 1.992753
std 6.128845 4.000988 7.323669
min 0.000000 0.000000 1.000000
25% 0.000000 0.000000 1.000000
50% 1.000000 1.000000 1.000000
75% 1.000000 1.000000 2.000000
max 1046.000000 539.000000 1046.000000
In [17]:
# Histogram of total degree.
plt.figure(figsize=(10,10))
bins=10
ax=sns.distplot(ddf0['total_degree'], bins=bins, kde=False, rug=False)
# x carries the degree value, y counts nodes; the original labels
# were swapped.
plt.ylabel('Number of Nodes')
plt.xlabel('Total Degree')
tt='Total Degree Histogram' 
total = float(len(ddf0))
wws=ddf0.total_degree.tolist()
# Annotate each bar with its absolute count.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))#/total))
plt.title(tt)

warnings.filterwarnings("ignore")
In [18]:
# Histogram of in-degree (times a user was mentioned).
plt.figure(figsize=(10,10))
bins=10
ax=sns.distplot(ddf0['in_degree'], bins=bins, kde=False, rug=False)
# x carries the degree value, y counts nodes; the original labels
# were swapped.
plt.ylabel('Number of Nodes')
plt.xlabel('In-Degree')
tt='In-Degree Histogram' 
total = float(len(ddf0))
wws=ddf0.in_degree.tolist()
# Annotate each bar with its absolute count.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))#/total))
plt.title(tt)

warnings.filterwarnings("ignore")
In [19]:
# Histogram of out-degree (distinct users a user mentioned).
plt.figure(figsize=(10,10))
bins=10
ax=sns.distplot(ddf0['out_degree'], bins=bins, kde=False, rug=False)
# x carries the degree value, y counts nodes; the original labels
# were swapped.
plt.ylabel('Number of Nodes')
plt.xlabel('Out-Degree')
tt='Out-Degree Histogram' 
total = float(len(ddf0))
wws=ddf0.out_degree.tolist()
# Annotate each bar with its absolute count.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))#/total))
plt.title(tt)

warnings.filterwarnings("ignore")
In [20]:
# Handshaking Theorem
# In a digraph sum(in-degrees) == sum(out-degrees) == |E|, so the two
# sums together must equal 2*|E|.
sum(ddf0['out_degree']) + sum(ddf0['in_degree']) == 2*len(G.edges())
Out[20]:
True

1.3. Connected Components

1.3.1. Strongly Connected Components

In [21]:
sccl=sorted(nx.strongly_connected_components(G), key = len, reverse=True)
In [22]:
print sccl[100]
set(['3042923567', '2857471925', '573952577'])
In [23]:
# Check whether nodes/users are in the same strongly connected component

def find_common_comp(sccl,nodes):
    for i,gg in enumerate(sccl):
        if all([nd in gg for nd in nodes]):
            return i,gg ,'are in'
        else:
            continue           
    return ' ', None, 'are not in the same'
        
nodes1=['2857471925','573952577']
nnodes1=[names['2857471925'],names['573952577']]
# print sccl[scc]
nodes2=['5739525778', '262993520']
nnodes2=[names['573952577'], names['262993520']]
# nodes2=['540238413', '2764646734']
# nnodes2=[names['540238413'], names['2764646734']]
print 
finding1=find_common_comp(sccl,nodes1)
finding2=find_common_comp(sccl,nodes2)
print 'Nodes/users', nnodes1, '%s strongly connected component %s' %(finding1[2],finding1[0])
print 'Nodes/users', nnodes2, '%s strongly connected component %s' %(finding2[2],finding2[0])
Nodes/users ['IngermarTurner', 'fliss59'] are in strongly connected component 100
Nodes/users ['fliss59', 'Tankmanbrad'] are not in the same strongly connected component  
In [24]:
ppf=pd.DataFrame([{'graph':i,'size':len(g)} for i,g in enumerate(sccl)])
In [25]:
ppf.describe()
Out[25]:
graph size
count 104869.000000 104869.000000
mean 52434.000000 1.026328
std 30273.217027 1.497918
min 0.000000 1.000000
25% 26217.000000 1.000000
50% 52434.000000 1.000000
75% 78651.000000 1.000000
max 104868.000000 480.000000
In [26]:
# Histogram of strongly connected component sizes.
plt.figure(figsize=(10,10))
bins=6
ax=sns.distplot(ppf['size'], bins=bins, kde=False, rug=False)
plt.ylabel('Number of Strongly Connected Components')
plt.xlabel('Number of Nodes in Strongly Connected Components')
tt='The Histogram of Strongly Connected Components' 
# ax = sns.distplot(x="class", hue="who", data=titanic)
total = float(len(ppf))
wws=ppf['size'].tolist()
# Annotate each bar with its absolute count.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))#/total))
plt.title(tt)

warnings.filterwarnings("ignore")

1.3.2. Weakly Connected Components

In [27]:
wccl=sorted(nx.weakly_connected_components(G), key = len, reverse=True)
In [28]:
# Check whether nodes/users are in the same weakly connected component

def find_common_comp(wccl,nodes):
    for i,gg in enumerate(wccl):
        if all([nd in gg for nd in nodes]):
            return i,gg ,'are in'
        else:
            continue         
    return ' ', None, 'are not in the same'
        
nodes1=['213642461', '3364815491']
nnodes1=[names['213642461'], names['3364815491']]
# print sccl[scc]
nodes2=['101633298', '262993520']
nnodes2=[names['101633298'], names['262993520']]
nodes2=['540238413', '3364815491']
nnodes2=[names['540238413'], names['3364815491']]
print 
finding1=find_common_comp(wccl,nodes1)
finding2=find_common_comp(wccl,nodes2)
print 'Nodes/users', nnodes1, '%s weakly connected component %s' %(finding1[2],finding1[0])
print 'Nodes/users', nnodes2, '%s weakly connected component %s' %(finding2[2],finding2[0])
Nodes/users ['jodafone_1', '5EurSchlaucher'] are in weakly connected component 100
Nodes/users ['calLOreo', '5EurSchlaucher'] are not in the same weakly connected component  
In [29]:
ppf=pd.DataFrame([{'graph':i,'size':len(g)} for i,g in enumerate(wccl)])
In [30]:
ppf.describe()
Out[30]:
graph size
count 25483.000000 25483.000000
mean 12741.000000 4.223600
std 7356.452791 338.018897
min 0.000000 1.000000
25% 6370.500000 2.000000
50% 12741.000000 2.000000
75% 19111.500000 2.000000
max 25482.000000 53961.000000
In [31]:
# Histogram of weakly connected component sizes.
plt.figure(figsize=(10,10))
bins=6
ax=sns.distplot(ppf['size'], bins=bins, kde=False, rug=False)
plt.ylabel('Number of Weakly Connected Components')
plt.xlabel('Number of Nodes in Weakly Connected Components')
tt='The Histogram of Weakly Connected Components' 
# ax = sns.distplot(x="class", hue="who", data=titanic)
total = float(len(ppf))
wws=ppf['size'].tolist()
# Annotate each bar with its absolute count.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))#/total))
plt.title(tt)

warnings.filterwarnings("ignore")

1.4. Communities

In [32]:
# Louvain community detection (python-louvain) on the undirected
# projection of G, then tabulate the community sizes.
import community as comms
from collections import Counter
Gg=nx.Graph(G)
part=comms.best_partition(Gg)   # node id -> community id
npart=Counter()                 # community id -> size
nnpart={}                       # community id -> list of member node ids
for pp,vv in part.items():
    npart[vv]+=1
    if vv not in nnpart:
        nnpart[vv]=[]
    nnpart[vv].append(pp)
ppcom=pd.DataFrame([{'community':i,'size':k} for i,k in npart.items()]).sort_values(by="size",ascending=0)

print 'The number of communities is', max(part.values())+1
print 'The graph modularity coefficient is', comms.modularity(part,Gg)
print
print 'The size of the top 20 communities:'
ppcom.drop('community', axis=1).head(20)
The number of communities is 25608
The graph modularity coefficient is 0.887183792201

The size of the top 20 communities:
Out[32]:
size
2 5649
24 4112
5 3756
1 3286
93 3130
81 2027
87 1788
68 1718
88 1715
19 1568
50 1353
29 967
71 762
22 711
27 631
25 628
69 624
30 594
39 590
80 543
In [33]:
# Summary statistics of community sizes.
print 'The statistics of community membership:'
ppcom.drop('community', axis=1).describe()
The statistics of community membership:
Out[33]:
size
count 25608.000000
mean 4.202983
std 65.792478
min 1.000000
25% 2.000000
50% 2.000000
75% 2.000000
max 5649.000000
In [34]:
def find_common_comm(part,nodes):
    # Return (community id, 'are in') if every node id in `nodes` belongs
    # to one community of the mapping `part` (id -> member list), else
    # (' ', 'are not in the same') for the report below.
    for k,v in part.items():
#         print  all([nd in v for nd in nodes]),  [nd in v for nd in nodes]
        if all([nd in v for nd in nodes]):
            return k,'are in'
        else:
            continue
    return ' ','are not in the same'
nodes=['1046136978', '1653958645']
nnodes=[names['1046136978'], names['1653958645']]
# nodes=['2764646734']   
finding=find_common_comm(nnpart,nodes)
print 'Nodes/users ',nnodes, '%s community %s ' %(finding[1],finding[0])
Nodes/users  ['cleanpulse', 'djkozerow'] are in community 40 
In [35]:
# Find nodes/users in a community

# List the screen-names of every member of community `commn`.
commn=65
ll=[]
for k,v in part.items():
    if v == commn:
        ll.append(names[k])
print 'The members of community', commn, 'are:'
print ll
The members of community 65 are:
['PrepperAgenda', 'dcoulter45', 'WilliamHelmut', 'onecatseye', 'briandahlen', 'ShoopdasWoop1', 'TheFloatingDuck', 'Kar_hart', 'CowboyYak3', 'Not found', 'ronco_59', 'medicsunny', 'DennyBurk', 'jordielife', 'ARFueller89', 'andynagel', 'steve_kneale', 'RustyBucketUSA', 'stevenbwagner', 'snaggletoothred', 'pmalmanac', 'theratzpack', '6InchesofFury', 'marinobuzz13', 'RickRWells', 'AO_in_PA', 'Not found', 'ComicDaveEvans', 'CrosseyedM', 'iliveamongyou', 'SustainablDylan', 'SarahWallace4NY', 'High_n_Right', 'GirlDayTrader', 'irena_ocasio', 'suzzie38', 'gdsmack267', 'CafeConLecheGOP', 'BrewCitriot', 'tlewis417', 'Dennisdevoe', 'TrevinWax', 'Not found', 'Trace_Me14', 'Not found', 'jonesyregu', 'CougarNav', 'RBRYANPARKER', 'holmesymrs', 'Not found', 'ShalashMuh', 'jmelone277', 'Constitution_al', 'robstaintonboss', 'jimcitak', 'Not found', 'TechHelp', 'Richzeoli', 'JoeThebabe2015', 'jaymielynnie', 'toorsdenote', 'N_ath_aniel', 'Not found', 'jcvolt', 'javiermgonzales', 'trudeaumistress', 'Not found', 'Not found', 'Not found', 'jramic5', 'Not found', 'joe46and2', 'Andimgrant', 'MsDeemer', 'unclechunka', 'def_rec', 'ATC_Sleepers', 'Not found', 'ChrisStigall', 'billpeduto', 'mschmidt74', 'Not found', 'KinsellaJohnP', 'AmyKTT', 'asdindiana', 'ProudInfidel33', 'elliemurphie', 'LorraineMollura', 'georgesoros', 'RevElmerFudd', 'hingdpotter', 'MikeDarnay', 'BiHiRiverOfLife', 'lesortiz610', 'Independent_ie', 'legalatina', 'bull_shite', 'a1985mtr', 'Bwana86', 'Not found', 'Not found', 'Dudeman1999', 'WoodenLucy', 'jrcannonq1', 'Not found', 'RichardTBurnett', 'mlvince', 'BlackbirdCroon', 'duggan_paul', 'Ethan_Tinsman', 'ToneLocNV', 'Not found', 'JanePitt', 'n3fario', 'Pursuit222', 'hartman_nick', 'Not found', 'ericainidaho', 'Not found', 'treesher23', 'JohnPastirchak', 'GilaGal', 'Carolyn_Bennett', 'DomStampone', 'MGMTino', 'Anthony101011', 'grey_dubya', 'CStrohal', 'yieldright', 'PanCyred', 'MayorMarkK', 'onlineonlytoday', 'joegridingteeth', 'agent_it', 'adambeforevade', 'BFR9', 
'implantdocFDM', 'PGHomes', 'skorz42', 'rockinrobyn59', 'Leppycole', 'LyndaMick', 'mghjmh', 'tnlawgirl', 'Not found', 'BullyUzi', 'Not found', 'DrDaveOrts', 'memereto4', 'James96Kyle', 'masoudkr', 'wino75', 'beyaself1', 'HenryMcRandall1', 'hebrewhammer412', '100monkeys', 'JimLaPorta', 'Smoe4Bucks', 'ajodom60', 'Not found', 'TheDr21', 'kasokie', 'Not found', 'DigginDirty', 'Martouff', 'brianalmon', 'Not found', 'JShahadi', 'vijaygkg', 'kirbs71', 'Not found', 'BioshockLGP', 'Farting_Nudist', 'REPUBL_I_CAN', 'adamcurry']
In [36]:
# Histogram of community sizes.
plt.figure(figsize=(10,10))
bins=4
ax=sns.distplot(ppcom['size'], bins=bins, kde=False, rug=False)
plt.ylabel('Number of Communities')
plt.xlabel('Number of Nodes in Communities')
tt='The Histogram of Communities' 
# ax = sns.distplot(x="class", hue="who", data=titanic)
total = float(len(ppcom))
wws=ppcom['size'].tolist()
# Annotate each bar with its absolute count.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+(((max(wws)/float(bins))/2)-2), height+ 3, '%i'%(height))#/total))
plt.title(tt)

warnings.filterwarnings("ignore")

2. Graph Cuts by Total Degree

In [37]:
# Graph cut: keep only users whose total degree exceeds k=300.
k=300
ddf300=ddf[ddf.total_degree > k]
ddf300=ddf300[['user','total_degree']].sort_values(by="total_degree",ascending=0)

ddf300#.head(20)
Out[37]:
user total_degree
87104 realDonaldTrump 1046
9192 Not found 595
77185 FoxNews 545
68607 EbolaOutbreakUS 541
36855 SpeakerRyan 536
41069 BlueLotusDC 496
42162 CNN 435
49167 Not found 417
91441 feru012 402
31357 ShareTheMealorg 378
20095 tedcruz 360
In [38]:
ddf300.describe()
Out[38]:
total_degree
count 11.000000
mean 522.818182
std 190.166673
min 360.000000
25% 409.500000
50% 496.000000
75% 543.000000
max 1046.000000
In [39]:
# Induced subgraph Gh on the top-degree users; screen-names are mapped
# back to ids via invnames.
# NOTE(review): .unique() merges the repeated 'Not found' rows and
# invnames keeps only one id per screen-name, which is why Gh has fewer
# nodes than ddf300 has rows.
users=[i for i in ddf300['user'].unique()] 
nusers = [invnames[u] for u in users]
Gh=nx.subgraph(G,nusers)
# Ghh=nx.DiGraph()
# for eds in Gh.edges():
#     ed=G.node[eds[0]]['name']
#     de=G.node[eds[1]]['name']
#     if ed=='Not found':
#         ed=eds[0]
#     if de=='Not found':
#         de=eds[1]
#     Ghh.add_edge(ed,de,weight=G[eds[0]][eds[1]]['weight'])

print 'The network of users with the top %i total degrees has %i nodes and %i edges' %(k,len(Gh.nodes()),len(Gh.edges()))
print
print 'The %i nodes of the network of users with the top %i total degrees are:' %(len(Gh.nodes()),k)
print Gh.nodes()
print users #Gh.nodes()
The network of users with the top 300 total degrees has 10 nodes and 10 edges

The 10 nodes of the network of users with the top 300 total degrees are:
['759251', '2211070837', '391128732', '25073877', '1367531', '18916432', '20524629', '23022687', '89735084', '2842673216']
['realDonaldTrump', 'Not found', 'FoxNews', 'EbolaOutbreakUS', 'SpeakerRyan', 'BlueLotusDC', 'CNN', 'feru012', 'ShareTheMealorg', 'tedcruz']
In [40]:
names['2211070837']
Out[40]:
'ShareTheMealorg'
In [41]:
# Edge table of the cut graph, heaviest mentions first.
cws=[{'from_node':names[ed[0]], 'to_node':names[ed[1]],'weight':ed[2]['weight']} for ed in Gh.edges(data=True)]
cedf=pd.DataFrame(cws).sort_values(by="weight",ascending=0)
cedf #.head(20)
Out[41]:
from_node to_node weight
3 BlueLotusDC CNN 9
5 BlueLotusDC realDonaldTrump 5
8 feru012 realDonaldTrump 2
0 EbolaOutbreakUS FoxNews 1
1 EbolaOutbreakUS EbolaOutbreakUS 1
2 EbolaOutbreakUS realDonaldTrump 1
4 BlueLotusDC BlueLotusDC 1
6 feru012 SpeakerRyan 1
7 feru012 CNN 1
9 tedcruz tedcruz 1
In [42]:
# Users of the cut with no mention edges inside the cut.
print 'The following nodes are isolates:'
for i in list(nx.isolates(Gh)):
    print i, names[i]
The following nodes are isolates:
2211070837 ShareTheMealorg
89735084 Not found

2.1. Visualization

In [43]:
# Rebuild the cut graph with unicode node keys and draw it: edge width
# proportional to mention count, labels resolved to screen-names.
Ghh=nx.DiGraph()
for nd in Gh.edges(data=True):
    ed=nd[0]
    de=nd[1]
    att_dici=nd[2]
    # Default to 0 when an edge carries no 'weight' attribute.
    if 'weight' in att_dici:
        wei=att_dici['weight']
    else:
        wei=0
    # Python 2 str ids are decoded so labels render as unicode.
    Ghh.add_edge(ed.decode('utf-8'),de.decode('utf-8'),weight=wei)

# pos=nx.circular_layout(Ghh)
# Use the nx_agraph module directly; the original 'nx.nx.nx_agraph'
# spelling is normalized here for consistency with the import on top
# and the later ego-network cell.
pos=nx.nx_agraph.graphviz_layout(Ghh)


# Edge widths follow the raw mention counts.
edgewidth=[]
for (u,v,d) in Ghh.edges(data=True):
#     edgewidth.append(d['weight']/500.)

    edgewidth.append(d['weight'])
plt.figure(figsize=(12,8))
paris_at=['parisattacks','paris']

# Show screen-names where known, raw ids otherwise; nodes whose key is
# in paris_at would be colored red, all others green.
labels={i:names[i] if names[i]!='Not found' else i for i in Ghh.nodes()}
cols=['r' if nd in paris_at else 'g' for nd in Ghh.nodes() ]
nn1=nx.draw_networkx_nodes(Ghh,pos, node_size=1000,node_color =cols,alpha=0.35) #with_labels=True,
nn2=nx.draw_networkx_edges(Ghh,pos,edge_color='b',width=edgewidth,alpha=0.35)
nn3=nx.draw_networkx_labels(Ghh,pos,labels=labels,font_size=15,font_color="k")
naxis=plt.axis('off')
In [44]:
# # print len(Ghh.nodes())
# print nx.neighbors(G,'23022687')
# group=[1 if nd in paris_at else 0 for nd in Ghh.nodes()]
# vis=plot_light(Ghh,label=2,size=10,group=group)
# vis

2.2. Centralities

In [45]:
centrali=draw_centralities_subplots(Ghh,pos,withLabels=True,labfs=15,figsi=(15,22),ealpha=0.25,vals=True)
In [46]:
# Tabulate the centrality dictionaries into one DataFrame: one row per
# user, one column per centrality measure.
dfc=pd.DataFrame()
user_names=[names[nd] if names[nd]!='Not found' else nd for nd in centrali[centrali.keys()[0]].keys()]
u=0
for i,k in centrali.items():
    # Skip measures that did not compute (None).
    if k is None:
        continue
    dfc.insert(u,i,k.values())
    u+=1
dfc.insert(0,'users',user_names)#centrali[centrali.keys()[0]].keys())
dfc#.head(10)
Out[46]:
users closeness_centrality katz_centrality betweenness_centrality page_rank eigenvector_centrality degree_centrality
0 feru012 0.428571 0.049702 0.0 0.105589 0.090909 0.428571
1 EbolaOutbreakUS 0.285714 0.126802 0.0 0.084040 0.000000 0.571429
2 FoxNews 0.000000 0.126802 0.0 0.084040 0.090909 0.142857
3 SpeakerRyan 0.000000 0.079922 0.0 0.401502 0.090909 0.142857
4 BlueLotusDC 0.285714 0.126802 0.0 0.063846 0.818182 0.571429
5 tedcruz 0.000000 0.126802 0.0 0.073027 0.545455 0.285714
6 realDonaldTrump 0.000000 0.572741 0.0 0.127727 0.000000 0.428571
7 CNN 0.000000 0.773819 0.0 0.060228 0.090909 0.285714
In [47]:
# Rank users by one centrality at a time (uncomment the measure to view).
# dfc[["users","degree_centrality"]].sort_values(by="degree_centrality",ascending=0)#.head(10)
dfc[["users","closeness_centrality"]].sort_values(by="closeness_centrality",ascending=0)#.head(10)
# dfc[["users","betweenness_centrality"]].sort_values(by="betweenness_centrality",ascending=0)#.head(10)
# dfc[["users","eigenvector_centrality"]].sort_values(by="eigenvector_centrality",ascending=0)#.head(10)
# dfc[["users","katz_centrality"]].sort_values(by="katz_centrality",ascending=0)#.head(10)
# dfc[["users","page_rank"]].sort_values(by="page_rank",ascending=0)#.head(10)
Out[47]:
users closeness_centrality
0 feru012 0.428571
1 EbolaOutbreakUS 0.285714
4 BlueLotusDC 0.285714
2 FoxNews 0.000000
3 SpeakerRyan 0.000000
5 tedcruz 0.000000
6 realDonaldTrump 0.000000
7 CNN 0.000000

2.3. Communities

In [48]:
# Communities of the (undirected) cut graph, printed and drawn with the
# helpers from tools.py.
Ghh=nx.Graph(Ghh)
part,nodper=print_communities(Ghh,'the graph Ghh')
# 
nnodper={i:names[i] for i in nodper}

# Layout/appearance parameters passed to draw_comms.
d=0.8 
dd=0.8
c=1.2
cc=1.4
alpha=0.25
ealpha=0.25
vcc={}
# sstta="The %s Communities of %s Network of Selected Noun Phrases" %(max(part.values())+1,titlename)#sstt)

draw_comms(Ghh,Ghh.nodes(),[],[],[] ,part,part,d,dd,c,cc,alpha,ealpha,nnodper,'',titlefont=20,labelfont=17,valpha=0.5)
Number of communities of the graph Ghh = 4
Community partition of the graph Ghh:
[[u'391128732', u'18916432', u'25073877'], [u'2842673216', u'1367531'], [u'20524629', u'759251'], [u'23022687']]
Community modularity of the graph Ghh = 0.2826

3. Egocentric Graph Cuts

3.1. One Ego

In [49]:
# Egocentric cut: the ego together with the users it mentions
# (out-neighbors); networkx 1.x successors() returns a list here.
ego='463666759'
nego=names[ego]
alters=G.successors(ego)
# alters=G.predecessors(ego)

# Include the ego itself in the induced subgraph.
alters.append(ego)
Ge=nx.subgraph(G,alters)

print 'The ego is user', nego
print
print 'The %s-egonetwork of mentions has %i nodes and %i edges' %(nego,len(Ge.nodes()),len(Ge.edges()))
print
print 'The %i nodes of the %s-egonetwork of mentions are:' %(len(Ge.nodes()),nego)
print [names[i] for i in Ge.nodes()]
The ego is user G_refugee

The G_refugee-egonetwork of mentions has 15 nodes and 32 edges

The 15 nodes of the G_refugee-egonetwork of mentions are:
['bisso_4', 'shathafadelasha', 'bataleh', 'Not found', 'd_alkhateeb', 'fedaatweets', 'girl_brainy', 'hasan_alomari', 'Rayaaaa95', 'FarahTarifi91', 'G_refugee', 'bu_assaf1', 'BsahaRG', 'rawanhattab', 'HarethHamad']

3.1.1. Visualization

In [50]:
# Draw the ego network: edge width 1+log(weight), the ego colored
# magenta, any node keyed by a 'paris' hashtag red, everyone else green.
import math
Gee=nx.DiGraph()
for nd in Ge.edges(data=True):
    ed=nd[0]
    de=nd[1]
    att_dici=nd[2]
    # Default to 0 when an edge carries no 'weight' attribute.
    if 'weight' in att_dici:
        wei=att_dici['weight']
    else:
        wei=0
    Gee.add_edge(ed.decode('utf-8'),de.decode('utf-8'),weight=wei)
# print nx.is_connected(Gee)   

paris_att=[u'parisunderattack','parisattacks','paris']
# pos=nx.circular_layout(Gee)
# pos=nx.graphviz_layout(Gee)
pos=nx.nx_agraph.graphviz_layout(Gee)
# nx.spring_layout(Gee)


# Log scale compresses the spread of edge widths.
edgewidth=[]
for (u,v,d) in Gee.edges(data=True):
#     edgewidth.append(d['weight']/200.)
    edgewidth.append(1+math.log(d['weight']))

plt.figure(figsize=(20,18))
# paris_att=[u'parisattacks',u'paris']
# Color per node, plus a parallel group id (0=paris, 1=ego, 2=other)
# for the optional interactive plot in the next cell.
cols=[]
ggroups=[]
for nd in Gee.nodes():
    
    if nd in paris_att:
        cols.append('r')
        ggroups.append(0)
    elif nd == ego:
        
        cols.append('m')
        ggroups.append(1)

    else:
        cols.append('g')
        ggroups.append(2)
#     print nd,cols
# cols=['r' if nd in paris_att else 'g' for nd in Gee.nodes() ]
# ggroups=[0 if nd in paris_att else 1 for nd in Gee.nodes()]
# print cols
# print cols
labels={i:names[i] if names[i]!='Not found' else i for i in Ge.nodes()}
nn1=nx.draw_networkx_nodes(Gee,pos, node_size=500,node_color =cols,alpha=0.35) #with_labels=True,
nn2=nx.draw_networkx_edges(Gee,pos,edge_color='b',width=edgewidth,alpha=0.35)
nn3=nx.draw_networkx_labels(Gee,pos,labels=labels,font_size=18,font_color="k")
naxis=plt.axis('off')
In [51]:
# ggroups=[0 if nd in paris_att else 1 for nd in Gee.nodes()]
# print ggroups
# vis=plot_light(Gee,label=2,group=ggroups,size=5)
# vis

3.1.2. Centralities

In [52]:
# pos=nx.spring_layout(Gee)
# Directed-graph centralities of the ego network (helper from tools.py).
centrali=draw_centralities_subplots_dir(Gee,pos,withLabels=True,labfs=10,figsi=(15,22),ealpha=0.25,vals=True)