Network Analysis in R

Michael Taylor

2018/06/06

library(downloader)
library(igraph)
library(igraphdata)
knitr::opts_chunk$set(cache=TRUE)

Creating an igraph object

Here you will learn how to create an igraph ‘object’ from data stored in an edgelist. The data are friendships in a group of students. You will also learn how to make a basic visualization of the network.

Each row of the friends dataframe represents an edge in the network.

url <- "https://assets.datacamp.com/production/course_4474/datasets/friends.csv"
filename <- basename(url)
if (!file.exists(filename)) download(url,destfile=filename)
friends <- read.csv(filename)
# Inspect the first few rows of the dataframe 'friends'
head(friends)
##    name1  name2
## 1 Jessie Sidney
## 2 Jessie  Britt
## 3 Sidney  Britt
## 4 Sidney Donnie
## 5   Karl  Berry
## 6 Sidney   Rene
# Convert friends dataframe to a matrix
friends.mat <- as.matrix(friends)

# Convert friends matrix to an igraph object
g <- graph.edgelist(friends.mat, directed = FALSE)


# Make a very basic plot of the network
plot(g)

Counting vertices and edges

A lot of basic information about a network can be extracted from an igraph object. In this exercise you will learn how to count the vertices and edges from a network by applying several functions to the graph object g.

Each row of the friends dataframe represents an edge in the network.

# Subset vertices and edges
V(g)
## + 16/16 vertices, named, from ebadeed:
##  [1] Jessie  Sidney  Britt   Donnie  Karl    Berry   Rene    Shayne 
##  [9] Elisha  Whitney Odell   Lacy    Eugene  Jude    Rickie  Tommy
cat("\n\n")
E(g)
## + 27/27 edges from ebadeed (vertex names):
##  [1] Jessie --Sidney  Jessie --Britt   Sidney --Britt   Sidney --Donnie 
##  [5] Karl   --Berry   Sidney --Rene    Britt  --Rene    Sidney --Shayne 
##  [9] Sidney --Elisha  Sidney --Whitney Jessie --Whitney Donnie --Odell  
## [13] Sidney --Odell   Rene   --Whitney Donnie --Shayne  Jessie --Lacy   
## [17] Rene   --Lacy    Elisha --Eugene  Eugene --Jude    Berry  --Odell  
## [21] Odell  --Rickie  Karl   --Odell   Britt  --Lacy    Elisha --Jude   
## [25] Whitney--Lacy    Britt  --Whitney Karl   --Tommy
# Count number of edges
gsize(g)
## [1] 27
# Count number of vertices
gorder(g)
## [1] 16

Node attributes and subsetting

In this exercise you will learn how to add attributes to vertices in the network and view them.

genders <- c("M", "F", "F", "M", "M", "M", "F", "M", "M", "F", "M", "F", "M", "F", "M", "M")
ages <- c(18, 19, 21, 20, 22, 18, 23, 21, 22, 20, 20, 22, 21, 18, 19, 20)
# Create new vertex attribute called 'gender'
g <- set_vertex_attr(g, "gender", value = genders)
# Create new vertex attribute called 'age'
g <- set_vertex_attr(g, "age", value = ages)
# View all vertex attributes in a list
vertex_attr(g)
## $name
##  [1] "Jessie"  "Sidney"  "Britt"   "Donnie"  "Karl"    "Berry"   "Rene"   
##  [8] "Shayne"  "Elisha"  "Whitney" "Odell"   "Lacy"    "Eugene"  "Jude"   
## [15] "Rickie"  "Tommy"  
## 
## $gender
##  [1] "M" "F" "F" "M" "M" "M" "F" "M" "M" "F" "M" "F" "M" "F" "M" "M"
## 
## $age
##  [1] 18 19 21 20 22 18 23 21 22 20 20 22 21 18 19 20
# View attributes of first five vertices in a dataframe
V(g)[[1:5]] 
## + 5/16 vertices, named, from ebadeed:
##     name gender age
## 1 Jessie      M  18
## 2 Sidney      F  19
## 3  Britt      F  21
## 4 Donnie      M  20
## 5   Karl      M  22

Edge attributes and subsetting

In this exercise you will learn how to add attributes to edges in the network and view them. For instance, we will add the attribute ‘hours’ that represents how many hours per week each pair of friends spend with each other.

hours <- c(1, 2, 2, 1, 2, 5, 5, 1, 1, 3, 2, 1, 1, 5, 1, 2, 4, 1, 3, 1, 1, 1, 4, 1, 3, 3, 4)
g <- set.edge.attribute(g, 'hours', value = hours)
edge_attr(g)
## $hours
##  [1] 1 2 2 1 2 5 5 1 1 3 2 1 1 5 1 2 4 1 3 1 1 1 4 1 3 3 4
# Find all edges that include "Britt"
E(g)[[inc('Britt')]] 
## + 5/27 edges from ebadeed (vertex names):
##      tail    head tid hid hours
## 2  Jessie   Britt   1   3     2
## 3  Sidney   Britt   2   3     2
## 7   Britt    Rene   3   7     5
## 23  Britt    Lacy   3  12     4
## 26  Britt Whitney   3  10     3

View all edges where the attribute hours is greater than or equal to 4 hours.

# Find all pairs that spend 4 or more hours together per week
E(g)[[hours>=4]]  
## + 6/27 edges from ebadeed (vertex names):
##      tail    head tid hid hours
## 6  Sidney    Rene   2   7     5
## 7   Britt    Rene   3   7     5
## 14   Rene Whitney   7  10     5
## 17   Rene    Lacy   7  12     4
## 23  Britt    Lacy   3  12     4
## 27   Karl   Tommy   5  16     4

Visualizing attributes

In this exercise we will learn how to create igraph objects with attributes directly from dataframes and how to visualize attributes in plots. We will use a second network of friendship connections between students.

 url <- "https://assets.datacamp.com/production/course_4474/datasets/friends1_edges.csv"
filename <- basename(url)
if (!file.exists(filename)) download(url,destfile=filename)
friends1_edges <- read.csv(filename)
url <- "https://assets.datacamp.com/production/course_4474/datasets/friends1_nodes.csv"
filename <- basename(url)
if (!file.exists(filename)) download(url,destfile=filename)
friends1_nodes <- read.csv(filename)
# Create an igraph object with attributes directly from dataframes
g1 <- graph_from_data_frame(d = friends1_edges, 
                            vertices = friends1_nodes, 
                            directed = FALSE)
# Subset edges greater than or equal to 5 hours
E(g1)[[hours>=5]] 
## + 4/25 edges from 26b57dc (vertex names):
##         tail      head tid hid hours
## 5     Kelley Valentine   3   6     5
## 8     Ronald   Jasmine   4   8     5
## 12 Valentine     Perry   6  15     5
## 15   Jasmine      Juan   8   9     6
# Plot network and color vertices by gender
V(g1)$color <- ifelse(V(g1)$gender == 'F', "orange", "dodgerblue")

Plot the network with vertices colored by gender and make label names black

plot(g1, vertex.label.color = "black")

igraph network layouts

The igraph package provides several built in layout algorithms for network visualization. Depending upon the size of a given network different layouts may be more effective in communicating the structure of the network. Ideally the best layout is the one that minimizes the number of edges that cross each other in the network. In this exercise you will explore just a few of the many default layout algorithms. Re-executing the code for each plot will lead to a slightly different version of the same layout type. Doing this a few times can help to find the best looking visualization for your network.

# Plot the graph object g1 in a circle layout
plot(g1, vertex.label.color = "black", layout = layout_in_circle(g1))

# Plot the graph object g1 in a Fruchterman-Reingold layout 
plot(g1, vertex.label.color = "black", layout = layout_with_fr(g1))

# Plot the graph object g1 in a Tree layout 
m <- layout_as_tree(g1)
plot(g1, vertex.label.color = "black", layout = m)

Choosing a correct layout can be bewildering. Fortunately igraph has a function layout_nicely() that tries to choose the most appropriate layout function for a given graph object. Use this function to produce the matrix m1 and plot the network using these coordinates.

# Plot the graph object g1 using igraph's chosen layout 
m1 <- layout_nicely(g1)
plot(g1, vertex.label.color = "black", layout = m1)

Visualizing edges

In this exercise you will learn how to change the size of edges in a network based on their weight, as well as how to remove edges from a network which can sometimes be helpful in more effectively visualizing large and highly clustered networks. In this introductory chapter, we have just scratched the surface of what’s possible in visualizing igraph networks. You will continue to develop these skills in future chapters.

*Create a vector w1 of edge weights based on the number of hours friends spend together.

# Create a vector of weights based on the number of hours each pair spend together
w1 <- E(g1)$hours

Plot the network ensuring that the edge.width is set to the vector of weights you just created. Using edge.color = 'black' ensures that all edges will be black.

# Plot the network varying edges by weights
m1 <- layout_nicely(g1)
plot(g1, 
        vertex.label.color = "black", 
        edge.color = 'black',
        edge.width = w1,
        layout = m1)

Next, create a new graph object g2 that is the g1 network but with all edges of that are of weight less than two hours removed. This is done by using delete_edges() which takes two arguments. The first is the graph object and the second is the subset of edges to be removed. In this case, you will remove any edges that have a value of less than two hours.

# Create a new igraph object only including edges from the original graph that are greater than 2 hours long 
g2 <- delete_edges(g1, E(g1)[hours < 2])

Finally, plot the new network g2 using the appropriate vector of edge widths and layout.

# Plot the new graph 
w2 <- E(g2)$hours
m2 <- layout_nicely(g2)

plot(g2, 
     vertex.label.color = "black", 
     edge.color = 'black',
     edge.width = w2,
     layout = m2)

Identifying important vertices in a network

Directed igraph objects

url <- "https://assets.datacamp.com/production/course_4474/datasets/measles.csv"
filename <- basename(url)
if (!file.exists(filename)) download(url,destfile=filename)
measles <- read.csv(filename)

In this exercise you will learn how to create a directed graph from a dataframe, how to inspect whether a graph object is directed and/or weighted and how to extract those vertices at the beginnning and end of directed edges.

  • Convert the dataframe measles into an igraph graph object using the function graph_from_data_frame() and ensure that it will be a directed graph by setting the second argument to TRUE.
# Get the graph object
g <- graph_from_data_frame(measles, directed = TRUE)

Check if the graph object is directed by using is.directed().

# is the graph directed?
is.directed(g)
## [1] TRUE

Examine whether the edges of the graph object are already weighted by using is.weighted().

# Is the graph weighted?
is.weighted(g)
## [1] FALSE

Subset each vertex from which each edge originates by using head_of(). This function takes two arguments, the first being the graph object and the second the edges to examine. For all edges you can use E(g).

# Where does each edge originate from?
table(
  head_of(g, E(g))
  )
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  17  18  19 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  56  57  58  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 184 185 186 187 
##   1   1   1   1

Identifying edges for each vertex

In this exercise you will learn how to identify particular edges. You will learn how to determine if an edge exists between two vertices as well as finding all vertices connected in either direction to a given vertex.

  • First make a visualization of this network using plot(). You will improve this visualization later. It can be useful to visualize the network before analysis. To improve visibility of the plot of this network, you should make the vertex size equal to 0 and the edge arrow size equal to 0.1.
# Make a basic plot
plot(g, 
     vertex.label.color = "black", 
     edge.color = 'gray77',
     vertex.size = 0,
     edge.arrow.size = 0.1,
     layout = layout_nicely(g))

  • Check if there is an edge going in each direction between vertex 184 to vertex 178 using single brackets subsetting of the graph object. If a 1 is returned that indicates TRUE there is an edge. If a 0 is returned that indicates FALSE there is not an edge.
# Is there an edge going from vertex 184 to vertex 178?
g['184', '178']
## [1] 1
# Is there an edge going from vertex 178 to vertex 184?
g['178', '184']
## [1] 0
  • Using the function incident() identify those edges that go in either direction from vertex 184 or only those going out from vertex 184. The first argument should be the graph object, the second should be the vertex to examine and the third argument the mode indicating the direction. Choose from 'all', 'in' and 'out'.
# Show all edges going to or from vertex 184
incident(g, '184', mode = c("all"))
## + 6/184 edges from 3005261 (vertex names):
## [1] 184->45  184->182 184->181 184->178 184->183 184->177
# Show all edges going out from vertex 184
incident(g, '184', mode = c("out"))
## + 6/184 edges from 3005261 (vertex names):
## [1] 184->45  184->182 184->181 184->178 184->183 184->177

Neighbors

Often in network analysis it is important to explore the patterning of connections that exist between vertices. One way is to identify neighboring vertices of each vertex. You can then determine which neighboring vertices are shared even by unconnected vertices indicating how two vertices may have an indirect relationship through others. In this exercise you will learn how to identify neighbors and shared neighbors between pairs of vertices.

  • Using the function neighbors() identify the vertices that are connected in any manner to vertex 12, those vertices that direct an edge to vertex 12 and those vertices that receive a directed edge from vertex 12. This can be achieved by choosing the correct value in the argument mode. Choose from 'all', 'in’ and 'out'.
# Identify all neighbors of vertex 12 regardless of direction
neighbors(g, '12', mode = c('all'))
## + 5/187 vertices, named, from 3005261:
## [1] 45  13  72  89  109
# Identify other vertices that direct edges towards vertex 12
neighbors(g, '12', mode = c('in'))
## + 1/187 vertex, named, from 3005261:
## [1] 45

Determine if vertices 42 and 124 have a neighbor in common. Create a vector n1 of those vertices that receive an edge from vertex 42 and a vector n2 of those vertices that direct an edge to vertex 124 using neighbors(). Next use intersection() to identify if there are any vertices that exist in both n1 and `n2.

# Identify any vertices that receive an edge from vertex 42 and direct an edge to vertex 124
n1 <- neighbors(g, '42', mode = c('out'))
n2 <- neighbors(g, '124', mode = c('in'))
intersection(n1, n2)
## + 1/187 vertex, named, from 3005261:
## [1] 7

Distances between vertices

The inter-connectivity of a network can be assessed by examining the number and length of paths between vertices. A path is simply the chain of connections between vertices. The number of intervening edges between two vertices represents the geodesic distance between vertices. Vertices that are connected to each other have a geodesic distance of 1. Those that share a neighbor in common but are not connected to each other have a geodesic distance of 2 and so on. In directed networks, the direction of edges can be taken into account. If two vertices cannot be reached via following directed edges they are given a geodesic distance of infinity. In this exercise you will learn how to find the longest paths between vertices in a network and how to discern those vertices that are within \(n\) connections of a given vertex. For disease transmission networks such as the measles dataset this helps you to identify how quickly the disease spreads through the network.

  • Find the length of the longest path in the network using farthest_vertices().
# Which two vertices are the furthest apart in the graph ?
farthest_vertices(g)
## $vertices
## + 2/187 vertices, named, from 3005261:
## [1] 184 162
## 
## $distance
## [1] 5

Identify the sequence of the path using get_diameter(). This demonstrates the individual children that passed the disease the furthest through the network.

# Shows the path sequence between two furthest apart vertices.
get_diameter(g)
## + 6/187 vertices, named, from 3005261:
## [1] 184 178 42  7   123 162

Use ego() to find all vertices that are reachable within 2 connections of vertex 42 and then those that can reach vertex 42 within two connections. The first argument of ego() is the graph object, the second argument is the maximum number of connections between the vertices, the third argument is the vertex of interest, and the fourth argument determines if you are considering connections going out or into the vertex of interest.

# Identify vertices that are reachable within two connections from vertex 42
ego(g, 2, '42', mode = c('out'))
## [[1]]
## + 13/187 vertices, named, from 3005261:
##  [1] 42  7   106 43  123 101 120 124 125 128 129 108 127
# Identify vertices that can reach vertex 42 within two connections
ego(g, 2, '42', mode = c('in'))
## [[1]]
## + 3/187 vertices, named, from 3005261:
## [1] 42  178 184

Identifying key vertices

Perhaps the most straightforward measure of vertex importance is the degree of a vertex. The out-degree of a vertex is the number of other individuals to which a vertex has an outgoing edge directed to. The in-degree is the number of edges received from other individuals. In the measles network, individuals that infect many other individuals will have a high out-degree. In this exercise you will identify whether indviduals infect equivalent amount of other children or if there are key children who have high out-degrees and infect many other children.

  • Calculate the out-degree of each vertex using the function degree(). The first argument is the network graph object and the second argument is the mode which should be one of 'out', 'in' or 'all'. Assign the output of this function to the object g.outd.
# Calculate the out-degree of each vertex
g.outd <- degree(g, mode = c("out"))
  • View a summary of the out-degrees of all individuals using the function table() on the vector object g.outd.
# View a summary of out-degree
table(g.outd)
## g.outd
##   0   1   2   3   4   6   7   8  30 
## 125  21  16  12   6   2   3   1   1
  • Make a histogram of the out-degrees using the function hist() on the vector object g.outd.
# Make a histogram of out-degrees
hist(g.outd, breaks = 30)

Determine which vertex has the highest out-degree in the network using the function which.max() on the vector object g.outd.

# Find the vertex that has the maximum out-degree
which.max(g.outd)
## 45 
##  1

Betweenness

Another measure of the importance of a given vertex is its betweenness. This is an index of how frequently the vertex lies on shortest paths between any two vertices in the network. It can be thought of as how critical the vertex is to the flow of information through a network. Individuals with high betweenness are key bridges between different parts of a network. In our measles transmission network, vertices with high betweenness are those children who were central to passing on the disease to other parts of the network. In this exercise, you will identify the betweenness score for each vertex and then make a new plot of the network adjusting the vertex size by its betweenness score to highlight these key vertices.

  • Calculate the betweenness of each vertex using the function betweenness() on the graph object g. Ensure that the scores are calculated for a directed network. The results of this function will be assigned as g.b.
# Calculate betweenness of each vertex
g.b <- betweenness(g, directed = TRUE)
  • Visually examine the distribution of betweenness scores using the function hist().
# Show histogram of vertex betweenness
hist(g.b, breaks = 80)

  • Use plot() to make a plot of the network based on betweenness scores. The vertex labels should be made NA so that they do not appear. The vertex size attribute should be one plus the square-root of the betweenness scores that are in object g.b. Given the huge disparity in betweenness scores in this network, normalizing the scores in this manner ensures that all nodes can be viewed but their relative importance is still identifiable.
# Create plot with vertex size determined by betweenness score
plot(g, 
     vertex.label = NA,
     edge.color = 'black',
     vertex.size = sqrt(g.b)+1,
     edge.arrow.size = 0.05,
     layout = layout_nicely(g))

Visualizing important nodes and edges

One issue with the measles dataset is that there are three individuals for whom no information is known about who infected them. One of these individuals (vertex 184) appears ultimately responsible for spreading the disease to many other individuals even though they did not directly infect too many indviduals. However, because vertex 184 has no incoming edge in the network they appear to have low betweenness. One way it is possible to explore the importance of this vertex is by visualizing the geodesic distances of connections going out from this individual. In this exercise you shall create a plot of these distances from this patient zero.

  • Use make_ego_graph() to create a subset of our network comprised of vertices that are connected to vertex 184. The first argument is the original graph g. The second argument is the maximal number of connections that any vertex needs to be connected to our vertex of interest. In this case we can use diameter() to return the length of the longest path in the network. The third argument is our vertex of interest which should be 184. The final argument is the mode. In this instance you can include all connections regardless of direction.
# Make an ego graph
g184 <- make_ego_graph(g, diameter(g), nodes = '184', mode = c("all"))[[1]]

Create an object dists that contains the geodesic distance of every vertex from vertex 184. Use the function distances() to calculate this.

# Get a vector of geodesic distances of all vertices from vertex 184 
dists <- distances(g184, "184")

Assign the attribute color to each vertex. Each color will be selected based on its geodesic distance. The color palette colors is a length equal to the maximal geodesic distance plus one. This is so that vertices of the same distance are plotted in the same color and patient zero also has its own color.

# Create a color palette of length equal to the maximal geodesic distance plus one.
colors <- c("black", "red", "orange", "blue", "dodgerblue", "cyan")

# Set color attribute to vertices of network g184.
V(g184)$color <- colors[dists+1]

Use plot() to visualize the network g184. The vertex label should be the geodesic distances dists.

# Visualize the network based on geodesic distance from vertex 184 (patient zero).
plot(g184, 
     vertex.label = dists, 
     vertex.label.color = "white",
     vertex.label.cex = .6,
     edge.color = 'black',
     vertex.size = 7,
     edge.arrow.size = .05,
     main = "Geodesic Distances from Patient Zero"
     )

Characterizing network structures

Forrest Gump network

In this chapter you will use a social network based on the movie Forrest Gump. Each edge of the network indicates that those two characters were in at least one scene of the movie together. Therefore this network is undirected. To familiarize yourself with the network, you will first create the network object from the raw dataset. Then, you will identify key vertices using a measure called eigenvector centrality. Individuals with high eigenvector centrality are those that are highly connected to other highly connected individuals. You will then make an exploratory visualization of the network.

  • Inspect the first few rows of the dataframe gump using head().
# Inspect Forrest Gump Movie dataset
head(gump)
##              V1        V2
## 1 ABBIE HOFFMAN     JENNY
## 2 ABBIE HOFFMAN POLICEMAN
## 3     ANCHORMAN   FORREST
## 4     ANCHORMAN    LT DAN
## 5     ANCHORMAN     MARGO
## 6     ANCHORMAN  MRS GUMP
  • Make an undirected network using graph_from_data_frame().
# Make an undirected network
g <- graph_from_data_frame(gump, directed = FALSE)
  • Identify the key vertices using the function eigen_centrality() and assign the results of this to the object g.ec. Next identify which individual has the highest eigenvector centrality using which.max(). The values of the centrality scores are stored in g.ec$vector.
# Identify key nodes using eigenvector centrality
g.ec <- eigen_centrality(g)
which.max(g.ec$vector)
## FORREST 
##      36

Make a plot of the Forrest Gump Network using plot(). Make the size of the vertices equal to 25 times the eigenvector centrality values that are stored in g.ec$vector.

# Plot Forrest Gump Network
plot(g,
vertex.label.color = "black", 
vertex.label.cex = 0.6,
vertex.size = 25*(g.ec$vector),
edge.color = 'gray88',
main = "Forrest Gump Network"
)

Network density and average path length

The first graph level metric you will explore is the density of a graph. This is essentially the proportion of all potential edges between vertices that actually exist in the network graph. It is an indicator of how well connected the vertices of the graph are.

Another measure of how interconnected a network is average path length. This is calculated by determining the mean of the lengths of the shortest paths between all pairs of vertices in the network. The longest path length between any pair of vertices is called the diameter of the network graph. You will calculate the diameter and average path length of the original graph g.

  • Using the function edge_density() calculate the density of the graph gand assign this value to the vector gd.
# Get density of a graph
gd <- edge_density(g)
  • Use diameter() to calculate the diameter of the original graph g.
# Get the diameter of the graph g
diameter(g, directed = FALSE)
## [1] 4

Assign the average path length of g to g.apl with the function mean_distance().

# Get the average path length of the graph g
g.apl <- mean_distance(g, directed = FALSE)
g.apl
## [1] 1.994967

Graph density quiz

If a graph has 7 vertices there are 21 possible edges in the network. If 14 edges exist, what is the density of the network?

potential edges = 21 actual edges = 14

The density is the proportion of edges that actuall do exist in a network out of all those that potentially could exist between every pair of vertices

print(paste("This network has a density of", round(14 / 21, digits = 2)))
## [1] "This network has a density of 0.67"

Random graphs

Generating random graphs is an important method for investigating how likely or unlikely other network metrics are likely to occur given certain properties of the original graph. The simplest random graph is one that has the same number of vertices as your original graph and approximately the same density as the original graph. Here you will create one random graph that is based on the original Forrest Gump Network.

  • Generate a random graph using the function erdos.renyi.game(). The first argument n should be the number of nodes of the graph g which can be calculated using gorder(), the second argument p.or.m should be the density of the graph g which you previously stored as the object gd. The final argument is set as type='gnp' to tell the function that you are using the density of the graph to generate a random graph. Store this new graph as the vector g.random.
# Create one random graph with the same number of nodes and edges as g
g.random <- erdos.renyi.game(n = gorder(g), p.or.m = gd, type = "gnp")
g.random
## IGRAPH 2e92cb7 U--- 94 260 -- Erdos renyi (gnp) graph
## + attr: name (g/c), type (g/c), loops (g/l), p (g/n)
## + edges from 2e92cb7:
##  [1]  6-- 7  4-- 9  7--10  3--11  4--11 12--17  5--18  7--18 14--22  6--23
## [11] 16--23 21--23 11--25  8--28 11--28  4--29 23--29 13--30 26--30 23--31
## [21] 25--31  7--32 22--32  1--33 31--33 13--34 20--34 23--34  6--35 19--35
## [31] 22--35 27--36 12--37 23--37 26--37 10--38  2--39 15--39 18--39 24--39
## [41] 36--39  6--40 22--40 12--41 22--42 31--42  1--43 25--43 42--43 12--44
## [51] 34--45 18--46 43--46  8--47 21--47 25--47 46--47 30--48 42--48 25--49
## [61] 28--49  5--50 19--50 24--50 36--50 13--51 27--51 32--51 35--51 39--51
## [71] 49--51 21--52 22--52 25--52 35--52 28--53 37--53 13--54 19--55 52--55
## + ... omitted several edges
plot(g.random)

Get the density of the random graph g.random. You will notice if you generate a random graph a few times that this value will slightly vary but be approximately equal to the density of your original graph g from the previous exercise stored in the object gd.

# Get density of new random graph `g.random`
edge_density(g.random)
## [1] 0.05948296

Calculate the average path length of the random graph g.random.

#Get the average path length of the random graph g.random
mean_distance(g.random, directed = FALSE)
## [1] 2.809655

Network randomizations

In the previous exercise you may have noticed that the average path length of the Forrest Gump network was smaller than the average path length of the random network. If you ran the code a few times you will have noticed that it is nearly always lower in the Forrest Gump network than the random network. What this suggests is that the Forrest Gump network is more highly interconnected than each random network even though the random networks have the same number of vertices and approximately identical graph densities. Rather than re-running this code many times, you can more formally address this by creating 1000 random graphs based on the number of vertices and density of the original Forrest Gump graph. Then, you can see how many times the average path length of the random graphs is less than the original Forrest Gump network. This is called a randomization test.

  • Generate 1000 random graphs of the original graph g by executing the code that creates the list object gl and the for loop.
# Generate 1000 random graphs
gl <- vector('list', 1000)

for(i in 1:1000){
gl[[i]] <- erdos.renyi.game(n = gorder(g), p.or.m = gd, type = "gnp")
}

Calculate the average path length of the 1000 random graphs using lapply(). Create a vector gl.apls of these 1000 values by executing the code that uses unlist().

# Calculate average path length of 1000 random graphs
gl.apl <- lapply(gl, mean_distance, directed = FALSE)
gl.apls <- unlist(gl.apl)

Plot a histogram of the average path lengths of the 1000 random graphs using hist() on the vector gl.apls. Add a red dashed line to the plot using abline() with the x-intercept being the value of the average path length of the original graph g.apl. You calculated this value in the previous exercise.

# Plot the distribution of average path lengths
hist(gl.apls, xlim = range(c(1.5, 6)))
abline(v = mean_distance(g, directed = FALSE), col = "red", lty = 3, lwd=2)

Calculate the proportion of times that the values of the average path length of random graphs gl.apls are lower than the value of the original graph g.apl. This is essentially the probability that we would expect our observed average path length by chance given the original density and number of vertices of the original graph.

# Calculate the proportion of graphs with an average path length lower than our observed
sum(gl.apl < mean(gl.apls))/1000
## [1] 0.52

Randomization tests enable you to identify features of your original network that are particularly unusual.

Triangles and transitivity

Another important measure of local connectivity in a network graph involves investigating triangles (also known as triads). In this exercise you will find all closed triangles that exist in a network. This means that an edge exists between three given vertices. You can then calculate the transitivity of the network. This is equivalent to the proportion of all possible triangles in the network that are closed. You will also learn how to identify the number of closed triangles that any given vertex is a part of and its local transitivity - that is, the proportion of closed triangles that the vertex is a part of given the theoretical number of triangles it could be a part of.

  • Show a matrix of all possible triangles in the Forrest Gump network g using the function triangles().
# Show all triangles in the network.
matrix(triangles(g), nrow = 3)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,]   36   36   36   36   36   36   36   36   36    36    36    36    36
## [2,]    1    1    1    1    2    4    4    6    6     6     6     7     7
## [3,]   83   38   39   66   68   57   24   27   75    40    45     8    69
##      [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,]    36    36    36    36    36    36    36    36    36    36    36
## [2,]     8    11    11    11    12    12    13    14    14    14    14
## [3,]    69    12    13    70    70    13    70     4    19    24    71
##      [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35]
## [1,]    36    36    36    36    36    36    36    36    36    36    36
## [2,]    14    14    14    14    14    15    15    17    17    18    18
## [3,]    65    57    62    63    64    21    72    22    42     5    28
##      [,36] [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46]
## [1,]    36    36    36    36    36    36    36    36    36    36    36
## [2,]    19    19    21    22    24    26    26    26    26    26    26
## [3,]    71    63    72    42    57    73    52    47    48    49    50
##      [,47] [,48] [,49] [,50] [,51] [,52] [,53] [,54] [,55] [,56] [,57]
## [1,]    36    36    36    36    36    36    36    36    36    36    36
## [2,]    27    27    27    28    28    30    30    30    34    38    38
## [3,]    75    45    40     5    90    84    61    51    88    83    66
##      [,58] [,59] [,60] [,61] [,62] [,63] [,64] [,65] [,66] [,67] [,68]
## [1,]    36    36    36    36    36    36    36    36    36    36    36
## [2,]    38    39    39    40    40    41    41    41    41    41    41
## [3,]    39    83    66    75    45     1     3     6     7     8    11
##      [,69] [,70] [,71] [,72] [,73] [,74] [,75] [,76] [,77] [,78] [,79]
## [1,]    36    36    36    36    36    36    36    36    36    36    36
## [2,]    41    41    41    41    41    41    41    41    41    41    41
## [3,]    12    13    26    27    30    32    33    86    37    38    39
##      [,80] [,81] [,82] [,83] [,84] [,85] [,86] [,87] [,88] [,89] [,90]
## [1,]    36    36    36    36    36    36    36    36    36    36    36
## [2,]    41    41    41    41    41    41    41    41    41    41    41
## [3,]    40    43    44    45    47    48    49    50    51    52    53
##      [,91] [,92] [,93] [,94] [,95] [,96] [,97] [,98] [,99] [,100] [,101]
## [1,]    36    36    36    36    36    36    36    36    36     36     36
## [2,]    41    41    41    41    41    41    41    41    41     41     41
## [3,]    54    56    58    61    66    69    70    73    74     75     79
##      [,102] [,103] [,104] [,105] [,106] [,107] [,108] [,109] [,110] [,111]
## [1,]     36     36     36     36     36     36     36     36     36     36
## [2,]     41     41     41     43     43     43     44     44     44     44
## [3,]     82     83     84     82     54     53      2      3      9     14
##      [,112] [,113] [,114] [,115] [,116] [,117] [,118] [,119] [,120] [,121]
## [1,]     36     36     36     36     36     36     36     36     36     36
## [2,]     44     44     44     44     44     44     44     44     44     44
## [3,]     17     19     22     82     71     42     43     53     62     63
##      [,122] [,123] [,124] [,125] [,126] [,127] [,128] [,129] [,130] [,131]
## [1,]     36     36     36     36     36     36     36     36     36     36
## [2,]     44     44     45     47     47     47     47     47     48     48
## [3,]     64     65     75     73     52     50     48     49     73     52
##      [,132] [,133] [,134] [,135] [,136] [,137] [,138] [,139] [,140] [,141]
## [1,]     36     36     36     36     36     36     36     36     36     36
## [2,]     48     48     49     49     49     50     50     51     51     52
## [3,]     50     49     73     52     50     73     52     84     61     73
##      [,142] [,143] [,144] [,145] [,146] [,147] [,148] [,149] [,150] [,151]
## [1,]     36     36     36     36     36     36     36     36     36     36
## [2,]     53     54     54     56     58     59     60     60     60     60
## [3,]     82     87     56     89     79     92      2     20     23     25
##      [,152] [,153] [,154] [,155] [,156] [,157] [,158] [,159] [,160] [,161]
## [1,]     36     36     36     36     36     36     36     36     36     36
## [2,]     60     60     60     61     62     62     62     62     63     64
## [3,]     31     81     43     84     71     19     35     63     71      3
##      [,162] [,163] [,164] [,165] [,166] [,167] [,168] [,169] [,170] [,171]
## [1,]     36     36     36     36     36     36     36     36     36     36
## [2,]     64     64     64     64     64     65     65     65     65     65
## [3,]     71     19     63     62     46      4     71     19     24     64
##      [,172] [,173] [,174] [,175] [,176] [,177] [,178] [,179] [,180] [,181]
## [1,]     36     36     36     36     41     41     41     41     41     41
## [2,]     65     65     65     66      1      1      1      1      6      6
## [3,]     63     57     62     83     83     38     39     66     27     75
##      [,182] [,183] [,184] [,185] [,186] [,187] [,188] [,189] [,190] [,191]
## [1,]     41     41     41     41     41     41     41     41     41     41
## [2,]      6      6      7      7      8     11     11     11     12     12
## [3,]     40     45      8     69     69     12     13     70     70     13
##      [,192] [,193] [,194] [,195] [,196] [,197] [,198] [,199] [,200] [,201]
## [1,]     41     41     41     41     41     41     41     41     41     41
## [2,]     13     26     26     26     26     26     26     27     27     27
## [3,]     70     73     52     47     48     49     50     75     45     40
##      [,202] [,203] [,204] [,205] [,206] [,207] [,208] [,209] [,210] [,211]
## [1,]     41     41     41     41     41     41     41     41     41     41
## [2,]     30     30     30     38     38     38     39     39     40     40
## [3,]     84     61     51     83     66     39     83     66     75     45
##      [,212] [,213] [,214] [,215] [,216] [,217] [,218] [,219] [,220] [,221]
## [1,]     41     41     41     41     41     41     41     41     41     41
## [2,]     43     43     43     44     44     44     44     45     47     47
## [3,]     82     54     53      3     82     43     53     75     73     52
##      [,222] [,223] [,224] [,225] [,226] [,227] [,228] [,229] [,230] [,231]
## [1,]     41     41     41     41     41     41     41     41     41     41
## [2,]     47     47     47     48     48     48     48     49     49     49
## [3,]     50     48     49     73     52     50     49     73     52     50
##      [,232] [,233] [,234] [,235] [,236] [,237] [,238] [,239] [,240] [,241]
## [1,]     41     41     41     41     41     41     41     41     41     41
## [2,]     50     50     51     51     52     53     54     58     58     61
## [3,]     73     52     84     61     73     82     56     10     79     84
##      [,242] [,243] [,244] [,245] [,246] [,247] [,248] [,249] [,250] [,251]
## [1,]     41     44     44     44     44     44     44     44     44     44
## [2,]     66      2     14     14     14     14     14     14     17     17
## [3,]     83     67     19     71     65     62     63     64     22     42
##      [,252] [,253] [,254] [,255] [,256] [,257] [,258] [,259] [,260] [,261]
## [1,]     44     44     44     44     44     44     44     44     44     44
## [2,]     19     19     22     43     43     53     62     62     62     63
## [3,]     71     63     42     82     53     82     71     19     63     71
##      [,262] [,263] [,264] [,265] [,266] [,267] [,268] [,269] [,270] [,271]
## [1,]     44     44     44     44     44     44     44     44     44     44
## [2,]     64     64     64     64     64     65     65     65     65     65
## [3,]      3     71     19     63     62     71     19     64     63     62
##      [,272] [,273] [,274] [,275] [,276] [,277] [,278] [,279] [,280] [,281]
## [1,]     14     14     14     14     14     14     14     14     14     14
## [2,]      4      4     19     19     24     65     65     65     65     65
## [3,]     57     24     71     63     57      4     71     19     24     64
##      [,282] [,283] [,284] [,285] [,286] [,287] [,288] [,289] [,290] [,291]
## [1,]     14     14     14     14     14     14     14     14     14     14
## [2,]     65     65     65     62     62     62     63     64     64     64
## [3,]     63     57     62     71     19     63     71     71     19     63
##      [,292] [,293] [,294] [,295] [,296] [,297] [,298] [,299] [,300] [,301]
## [1,]     14     65     65     65     65     65     65     65     65     65
## [2,]     64      4      4     19     19     24     64     64     64     64
## [3,]     62     57     24     71     63     57     71     19     63     62
##      [,302] [,303] [,304] [,305] [,306] [,307] [,308] [,309] [,310] [,311]
## [1,]     65     65     65     65     64     64     64     64     64     64
## [2,]     63     62     62     62     19     19     63     62     62     62
## [3,]     71     71     19     63     71     63     71     71     19     63
##      [,312] [,313] [,314] [,315] [,316] [,317] [,318] [,319] [,320] [,321]
## [1,]     62     62     62     19     26     26     26     26     26     26
## [2,]     19     19     63     63     52     47     47     47     47     47
## [3,]     71     63     71     71     73     73     52     50     48     49
##      [,322] [,323] [,324] [,325] [,326] [,327] [,328] [,329] [,330] [,331]
## [1,]     26     26     26     26     26     26     26     26     26     47
## [2,]     48     48     48     48     49     49     49     50     50     52
## [3,]     73     52     50     49     73     52     50     73     52     73
##      [,332] [,333] [,334] [,335] [,336] [,337] [,338] [,339] [,340] [,341]
## [1,]     47     47     47     47     47     47     47     47     47     48
## [2,]     50     50     48     48     48     48     49     49     49     52
## [3,]     73     52     73     52     50     49     73     52     50     73
##      [,342] [,343] [,344] [,345] [,346] [,347] [,348] [,349] [,350] [,351]
## [1,]     48     48     48     48     48     49     49     49     50     43
## [2,]     50     50     49     49     49     52     50     50     52     53
## [3,]     73     52     73     52     50     73     73     52     73     82
##      [,352] [,353] [,354] [,355] [,356] [,357] [,358] [,359] [,360] [,361]
## [1,]      1      1      1      1      1      1      6      6      6      6
## [2,]     38     38     38     39     39     66     27     27     27     40
## [3,]     83     66     39     83     66     83     75     45     40     75
##      [,362] [,363] [,364] [,365] [,366] [,367] [,368] [,369] [,370] [,371]
## [1,]      6      6     27     27     27     38     38     38     39     40
## [2,]     40     45     45     40     40     66     39     39     66     45
## [3,]     45     75     75     75     45     83     83     66     83     75
##      [,372] [,373] [,374] [,375] [,376] [,377] [,378] [,379] [,380] [,381]
## [1,]      4     11     11     11     12     30     30     30     51      7
## [2,]     24     12     12     13     13     61     51     51     61      8
## [3,]     57     70     13     70     70     84     84     61     84     69
##      [,382] [,383] [,384]
## [1,]     17     18     15
## [2,]     22     28     21
## [3,]     42      5     72

Using the function count_triangles(), find how many triangles that the vertex "BUBBA" is a part of. The vids argument refers to the id of the vertex.

# Count the number of triangles that vertex "BUBBA" is in.
count_triangles(g, vids='BUBBA')
## [1] 37

Calculate the global transitivity of the network g using transitivity().

# Calculate  the global transitivity of the network.
g.tr <- transitivity(g)
g.tr
## [1] 0.1918082

Find the local transitivity of vertex "BUBBA" also using the function transitivity(). The type is defined as local to indicate that you are calculating a local rather than global transitivity.

# Calculate the local transitivity for vertex BUBBA.
transitivity(g, vids='BUBBA', type = "local")
## [1] 0.6727273

Transitivity randomizations

As you did for the average path length, let’s investigate if the global transitivity of the Forrest Gump network is significantly higher than we would expect by chance for random networks of the same size and density. You can compare Forrest Gump’s global transitivity to 1000 other random networks.

  • One thousand random networks are stored in the list object gl. Using lapply() and transitivity() calculate the global transitivity of each of these networks. Assign these results to gl.tr.
# Calculate average transitivity of 1000 random graphs
gl.tr <- lapply(gl, transitivity)
  • Using unlist() convert gl.tr to a numeric vector gl.trs.
gl.trs <- unlist(gl.tr)
  • Investigate the summary statistics of the transitivities of the random networks using summary().
# Get summary statistics of transitivity scores
summary(gl.trs)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02992 0.05370 0.06078 0.06109 0.06827 0.09409

Calculate the proportion of random graphs that have a transitivity higher than the transitivity of Forrest Gump’s network, which you previously calculated and assigned to g.tr.

# Calculate the proportion of graphs with a transitivity score higher than Forrest Gump's network.
sum(gl.trs > g.tr)/1000
## [1] 0

Cliques

Identifying cliques is a common practice in undirected networks. In a clique every two unique nodes are adjacent - that means that every individual node is connected to every other individual node in the clique. In this exercise you will identify the largest cliques in the Forrest Gump network. You will also identify the number of maximal cliques of various sizes. A clique is maximal if it cannot be extended to a larger clique.

  • Identify the largest cliques in the network using the function largest_cliques().
# Identify the largest cliques in the network
largest_cliques(g)
## [[1]]
## + 9/94 vertices, named, from 2e1ab54:
## [1] FORREST   STRONGARM BUBBA     DALLAS    LT DAN    MAN       SGT SIMS 
## [8] SOLDIER   SONG     
## 
## [[2]]
## + 9/94 vertices, named, from 2e1ab54:
## [1] FORREST JENNY   EMCEE   MAN #   MAN #1  MAN #2  MAN #3  MAN #5  MEN
  • Determine all the maximal cliques in the network using the function max_cliques(). Assign the output of this function to the list object clq.
# Determine all maximal cliques in the network and assign to object 'clq'
clq <- max_cliques(g)
  • Calculate the length of each of the maximal cliques. Use lapply() to loop through the object clq determining the length() of each object in the list. Then unlist() and use table() to observe how large each of the maximal cliques are.
# Calculate the size of each maximal clique.
table(unlist(lapply(clq, length)))
## 
##  2  3  4  5  6  7  9 
## 12 24  7  2  4  2  2

Visualize largest cliques

Often in network visualization you will need to subset part of a network to inspect the inter-connections of particular vertices. Here, you will create a visualization of the largest cliques in the Forrest Gump network. In the last exercise you determined that there were two cliques of size 9. You will plot these side-by-side after creating two new igraph objects by subsetting out these cliques from the main network. The function induced_subgraph() enables you to choose which vertices to keep in a new network object.

  • Assign the list of the largest cliques in the network to the object lc.
# Assign largest cliques output to object 'lc'
lc <- largest.cliques(g)
  • Create two new undirected subgraphs using the function induced_subgraph(). The first, gs1, should contain only the vertices in the first largest clique. The second, gs2, should contain only the vertices in the second largest clique. This function is wrapped in as.undirected() to ensure that the subgraph is also undirected.
# Create two new undirected subgraphs, each containing only the vertices of each largest clique.
gs1 <- as.undirected(induced_subgraph(g, lc[[1]]))
gs2 <- as.undirected(induced_subgraph(g, lc[[2]]))
  • Visualize the two largest cliques side by side using plot(). First execute the code: par(mfrow=c(1,2)). This is to ensure that the two visualizations sit side-by-side. Make sure that the layout is set to layout.circle() to make the visualization easier to view.
# Plot the two largest cliques side-by-side

par(mfrow=c(1,2)) # To plot two plots side-by-side

plot(gs1,
     vertex.label.color = "black", 
     vertex.label.cex = 0.9,
     vertex.size = 0,
     edge.color = 'gray28',
     main = "Largest Clique 1",
     layout = layout.circle(gs1)
)

plot(gs2,
     vertex.label.color = "black", 
     vertex.label.cex = 0.9,
     vertex.size = 0,
     edge.color = 'gray28',
     main = "Largest Clique 2",
     layout = layout.circle(gs2)
)

Identifying special relationships

Assortativity

In this exercise you will determine the assorativity() of the second friendship network from the first chapter. This is a measure of how preferentially attached vertices are to other vertices with identical attributes. You will also determine the degree assortativity which determines how preferentially attached are vertices to other vertices of a similar degree.

  • Make an exploratory plot of the friendship network object g1 using plot().
# Plot the network
plot(g1)

  • Convert the gender attribute of each vertex to a vector of numbers called values by factorizing and then using as.numeric().
# Convert the gender attribute into a numeric value
values <- as.numeric(factor(V(g1)$gender))
  • Calculate the assortativity based on gender by using the function assortativity(). The first argument should be the graph object g1. The second argument are the values.
# Calculate the assortativity of the network based on gender
assortativity(g1, values)
## [1] 0.1319444
  • Calculate the degree assortativity of the network using assortativity.degree(). The first argument should be the graph object.
# Calculate the assortativity degree of the network
assortativity.degree(g1, directed = FALSE)
## [1] 0.4615385

Using randomizations to assess assortativity

In this exercise you will determine how likely the observed assortativity in the friendship network is given the genders of vertices by performing a randomization procedure. You will randomly permute the gender of vertices in the network 1000 times and recalculate the assortativity for each random network.

  • Use assortativity() to calculate the assortativity of the graph object g1 based on gender using the object values calculated in the previous exercise, and assign this to the object observed.assortativity.
# Calculate the observed assortativity
observed.assortativity <- assortativity(g1, values)
  • Inside the for loop calculate the assortativity of the network g1 using assortativity() while randomly permuting the object values each time with sample().
# Calculate the assortativity of the network randomizing the gender attribute 1000 times
results <- vector('list', 1000)
for(i in 1:1000){
  results[[i]] <- assortativity(g1, sample(values))
}
  • Plot the distribution of assortativity values from this permutation procedure using hist() and add a red vertical line for the original g1 network observed assortativity value that is stored in observed.assortativity.
# Plot the distribution of assortativity values and add a red vertical line at the original observed value
hist(unlist(results))
abline(v = observed.assortativity, col = "red", lty = 3, lwd=2)

Reciprocity

The reciprocity of a directed network reflects the proportion of edges that are symmetrical. That is, the proportion of outgoing edges that also have an incoming edge. It is commonly used to determine how inter-connected directed networks are. An example of a such a network may be grooming exchanges in chimpanzees. Certain chimps may groom another but do not get groomed by that individual, whereas other chimps may both groom each other and so would have a reciprocal tie.

  • In this example network of chimps grooming each other, make an exploratory plot of the network g using plot(). Make the arrow size 0.3 using the argument edge.arrow.size and the arrow width 0.5 using the argument edge.arrow.width.
numbersA="1  6  8  9 11 12  3  5  7 12  
2  4  5 10 13 14  3  6  9 10 13  3  8 
15  1 3  9  8  9 13  5 12 13 14  1  2  
3  4  7 11 12 14  3  6 14 15  4  6  9 
12 3  7  8 12 14  3  4 10 11"

numbersB ="15  1  1  1  1  1  2  2  2  
2  3  3  4  4  4  4  5  5  5  5  5  6  
6  6  7  7  7 15  8  8  9  9  9  9 10 
10 10 10 10 10 10 10 11 11 11 11 12 12
12 15 13 13 13 13 13 14 14 14 14"

A <- stringr::str_extract_all(numbersA, "[\\.0-9e-]+")
B <- stringr::str_extract_all(numbersB, "[\\.0-9e-]+")
el <- cbind(as.numeric(unlist(A)), as.numeric(unlist(B)))
g <- graph_from_edgelist(el, directed = TRUE)
# Make a plot of the chimp grooming network
plot(g,
     edge.color = "black",
     edge.arrow.size = 0.3,
     edge.arrow.width = 0.5)

  • Calculate the reciprocity of the graph using reciprocity().
# Calculate the reciprocity of the graph
reciprocity(g)
## [1] 0.2711864

Assortativity how likely vertices are to connect to others that share some attribute in common.

Fast-greedy community detection

The first community detection method you will try is fast-greedy community detection. You will use the Zachary Karate Club network. This social network contains 34 club members and 78 edges. Each edge indicates that those two club members interacted outside the karate club as well as at the club. Using this network you will determine how many sub-communities the network has and which club members belong to which subgroups. You will also plot the networks by community membership.

  • Use the function fastgreedy.community() to create a community object. Assign this to the object kc.
data(karate, package = "igraphdata")
# Perform fast-greedy community detection on network graph
kc = fastgreedy.community(karate)
  • Use the function sizes() on kc to determine how many communities were detected and how many club members are in each.
# Determine sizes of each community
sizes(kc)
## Community sizes
##  1  2  3 
## 18 11  5

Display which club members are in which community using the function membership().

# Determine which individuals belong to which community
membership(kc)
##    Mr Hi  Actor 2  Actor 3  Actor 4  Actor 5  Actor 6  Actor 7  Actor 8 
##        2        2        2        2        3        3        3        2 
##  Actor 9 Actor 10 Actor 11 Actor 12 Actor 13 Actor 14 Actor 15 Actor 16 
##        1        1        3        2        2        2        1        1 
## Actor 17 Actor 18 Actor 19 Actor 20 Actor 21 Actor 22 Actor 23 Actor 24 
##        3        2        1        2        1        2        1        1 
## Actor 25 Actor 26 Actor 27 Actor 28 Actor 29 Actor 30 Actor 31 Actor 32 
##        1        1        1        1        1        1        1        1 
## Actor 33   John A 
##        1        1

Make the default community plot by using the function plot(). The first argument should be the object kc and the second argument is the graph object g.

# Plot the community structure of the network
plot(kc, karate)

Edge-betweenness community detection

An alternative community detection method is edge-betweenness. In this exercise you will repeat the community detection of the karate club using this method and compare the results visually to the fast-greedy method

  • Use the function edge.betweenness.community() on the graph object g to create the community igraph object gc.
# Perform edge-betweenness community detection on network graph
gc = edge.betweenness.community(karate)
## Warning in edge.betweenness.community(karate): At community.c:
## 460 :Membership vector will be selected based on the lowest modularity
## score.
## Warning in edge.betweenness.community(karate): At community.c:
## 467 :Modularity calculation with weighted edge betweenness community
## detection might not make sense -- modularity treats edge weights as
## similarities while edge betwenness treats them as distances

Calculate the size and number of communities by using the function sizes on the community igraph object.

# Determine sizes of each community
sizes(gc)
## Community sizes
##  1  2  3  4  5  6 
##  9  4  5 10  4  2

Plot each community plot next to each other using par(). The first plot should include the community object kc from the previous exercise. The second plot should include the community object gc.

# Plot community networks determined by fast-greedy and edge-betweenness methods side-by-side
par(mfrow = c(1, 2)) 
plot(kc, karate)
plot(gc, karate)

igraph has many community detection algorithms. How many communities does the algorithm leading.eigenvector.community() believe that the karate club network object g can be assigned to?

4 communities.

plot(leading.eigenvector.community(karate), karate)

Interactive networks with threejs

In this course you have exclusively used igraph to make basic static network plots. There are many packages available to make network plots. One very useful one is threejs which allows you to make interactive network visualizations. This package also integrates seamlessly with igraph. In this exercise you will make a basic interactive network plot of the karate club network using the threejs package. Once you have produced the visualization be sure to move the network around with your mouse. You should be able to scroll in and out of the network as well as rotate the network.

  • First using set_vertex_attr() let’s make a new vertex attribute called color that is dodgerblue.
library(threejs)
# Set a vertex attribute called 'color' to 'dodgerblue' 
g <- set_vertex_attr(karate, "color", value = "dodgerblue")

Plot the network g using the threejs function graphjs(). The first argument should be the graph object g. Also make the vertex size equal to 1.

# Redraw the graph and make the vertex size 1
graphjs(g, vertex.size = 1)

Sizing vertices in threejs

As with all network visualizations it is often worth adjusting the size of vertices to illustrate their relative importance. This is also straightforward in threejs. In this exercise you will create an interactive threejs plot of the karate club network and size vertices based on their relative eigenvector centrality.

  • Calculate the eigenvector centrality of each vertex using eigen_centrality() and store the values in the object ec.
# Create numerical vector of vertex eigenvector centralities 
ec <- as.numeric(eigen_centrality(karate)$vector)
  • Using sqrt() adjust the values in ec to create a new vector of values v which is equal to five times the square root of the original eigenvector centrality.
# Create new vector 'v' that is equal to the square-root of 'ec' multiplied by 5
v <- 5*sqrt(ec)
  • Plot the network using the threejs function graphjs and making the argument vertex.size equal to the values in v.
# Plot threejs plot of graph setting vertex size to v
graphjs(karate, vertex.size = v)

3D community network graph

Finally in this exercise you will create an interactive threejs plot with the vertices based on their community membership as produced by the fast-greedy community detection method.

  • Use the function membership() on the community igraph object kc to generate a vector of community membership for each vertex.
# Create an object 'i' containin the memberships of the fast-greedy community detection
i <-  membership(kc)
  • Check how many communities there are using the function sizes() on the community igraph object kc.
# Check the number of different communities
sizes(kc)
## Community sizes
##  1  2  3 
## 18 11  5
  • Use set_vertex_attr() to add a vertex attribute called color to the graph object g. The values to add are the colors based on the membership assigned to object i.
# Add a color attribute to each vertex, setting the vertex color based on community membership
g <- set_vertex_attr(karate, "color", value = c("yellow", "blue", "red")[i])
  • Plot the three-dimensionsal graph by using the function graphjs() on the network object g.
# Plot the graph using threejs
graphjs(g)
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Canada.1252  LC_CTYPE=English_Canada.1252   
## [3] LC_MONETARY=English_Canada.1252 LC_NUMERIC=C                   
## [5] LC_TIME=English_Canada.1252    
## 
## attached base packages:
## [1] methods   stats     graphics  grDevices utils     datasets  base     
## 
## other attached packages:
## [1] threejs_0.3.1    igraphdata_1.0.1 igraph_1.2.1     downloader_0.4  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.17     knitr_1.20       magrittr_1.5     xtable_1.8-2    
##  [5] lattice_0.20-35  R6_2.2.2         stringr_1.3.1    tools_3.4.4     
##  [9] grid_3.4.4       xfun_0.1         htmltools_0.3.6  crosstalk_1.0.0 
## [13] yaml_2.1.19      rprojroot_1.3-2  digest_0.6.15    bookdown_0.7    
## [17] Matrix_1.2-14    shiny_1.0.5      later_0.7.2      htmlwidgets_1.2 
## [21] promises_1.0.1   base64enc_0.1-3  codetools_0.2-15 mime_0.5        
## [25] evaluate_0.10.1  rmarkdown_1.9    blogdown_0.6     stringi_1.1.7   
## [29] compiler_3.4.4   backports_1.1.2  jsonlite_1.5     httpuv_1.4.3    
## [33] pkgconfig_2.0.1
knitr::write_bib(.packages(), "packages.bib") 

References

Chang, Winston. 2015. Downloader: Download Files over Http and Https. https://CRAN.R-project.org/package=downloader.

R Core Team. 2018. R: A Language and Environment for Statistical Computing. Vienna, Austria: R Foundation for Statistical Computing. https://www.R-project.org/.