Predicting salary and new connections from network data

Note: data from the Coursera course Applied Social Network Analysis in Python

import networkx as nx
import pandas as pd
import numpy as np
import pickle
import random

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
Company Emails

Here I analyze a company's email network, where a node corresponds to a person and an edge indicates that at least one email has been sent between two people. The network also contains the node attributes Department and ManagementSalary. Department indicates the department of the company the person belongs to, and ManagementSalary indicates whether that person receives a management-position salary.
G = pickle.load(open('data/email_prediction.plk', 'rb'))
print(f"Graph with {len(nx.nodes(G))} nodes and {len(nx.edges(G))} edges")
Graph with 1005 nodes and 16706 edges
Salary Prediction

Here I aim to predict whether people without a ManagementSalary value will receive a management-position salary. To this end, I train a classifier on the people that have ManagementSalary and then predict the probability of receiving a management salary for the people where ManagementSalary is missing. Models are evaluated with three performance metrics: Area Under the ROC Curve (AUROC), Area Under the Precision-Recall Curve (AUPRC), and Balanced Accuracy.
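AUPRC and Balanced Accuracy are most informative when the classes are imbalanced, so it is worth checking how many nodes are unlabeled and how skewed the labels are before modelling. A minimal check (output not shown here):

salaries = pd.Series(nx.get_node_attributes(G, 'ManagementSalary'))
print(f"Unlabeled nodes: {salaries.isna().sum()}")
print(salaries.value_counts(dropna=True))  ## counts of the 0.0 / 1.0 labels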
list(G.nodes(data=True))[:5] # print the first 5 nodes
[(0, {'Department': 1, 'ManagementSalary': 0.0}),
(1, {'Department': 1, 'ManagementSalary': nan}),
(581, {'Department': 3, 'ManagementSalary': 0.0}),
(6, {'Department': 25, 'ManagementSalary': 1.0}),
(65, {'Department': 4, 'ManagementSalary': nan})]
def build_salary_dataset(G):
    df = pd.DataFrame()
    ## node attributes: label and department
    ## (pandas aligns each dict's node keys with the frame's index on assignment)
    df['management_salary'] = nx.get_node_attributes(G, 'ManagementSalary')
    df['department'] = nx.get_node_attributes(G, 'Department')
    ## node-level centrality features
    df['clustering'] = nx.clustering(G)
    df['betweenness'] = nx.betweenness_centrality(G)
    df['closeness'] = nx.closeness_centrality(G)
    df['pagerank'] = nx.pagerank(G)
    df['hubs'], df['auth'] = nx.hits(G)  ## HITS returns (hub_scores, authority_scores)
    df.sort_index(inplace=True)
    return df
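A quick sanity check on the resulting design matrix; the expected shape follows from the 1005 nodes and the eight columns built above (a sketch, output not shown):

df = build_salary_dataset(G)
print(df.shape)                              ## expected (1005, 8): label + 7 features
print(df['management_salary'].isna().sum())  ## number of unlabeled nodes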
def get_feature_and_label(df, label='management_salary'):
    X = df.drop(columns=[label])  ## equivalently df.iloc[:, 1:]
    y = df[label]                 ## equivalently df.iloc[:, 0]
    return X, y
class GridDict(dict):
    """A dict that exposes itself as best_params_, mimicking a fitted GridSearchCV."""
    def __init__(self, *args, **kwargs):
        super(GridDict, self).__init__(*args, **kwargs)
        self.best_params_ = self
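GridDict gives both branches of get_best_parameters below a common interface: callers always read best_params_, whether or not tuning actually ran. A quick illustration:

params = GridDict({'rfc__n_estimators': 100})
params.best_params_['rfc__n_estimators']  ## 100, same access pattern as a fitted GridSearchCV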
def get_best_parameters(X, y, tune=True):
    ## split by label availability: labeled rows for training, unlabeled rows for prediction
    # X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)  ## alternative random split
    X_train, y_train = X.loc[~y.isna(), :], y[~y.isna()]
    X_test, y_test = X.loc[y.isna(), :], y[y.isna()]

    ## pipeline: the scaling is included in each CV fold, making the model selection more robust
    pipeline = Pipeline([('scaler', StandardScaler()),                    # Step 1: scale the data
                         ('rfc', RandomForestClassifier(random_state=0))  # Step 2: train the model
                        ])
    ## grid search
    param_grid = {
        'rfc__n_estimators': [50, 100, 200],
        'rfc__max_depth': [10, 20, 30],
        'rfc__min_samples_split': [2, 5, 10],
        'rfc__min_samples_leaf': [1, 2, 4],
        'rfc__max_features': ['sqrt', 'log2']
    }
    if tune:
        grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        return grid_search
    else:
        ## random subsampling of the grid to avoid the grader's TimeoutError
        ## to select the first element instead: [value[0]]
        param_grid = {key: random.choice(value) for key, value in param_grid.items()}
        return GridDict(param_grid)
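The random.choice fallback picks a single random point of the grid with no cross-validation at all. scikit-learn's RandomizedSearchCV implements the same time-saving idea while still scoring each sampled candidate with CV; a minimal sketch reusing the pipeline and param_grid defined inside get_best_parameters:

from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(pipeline, param_grid, n_iter=10, cv=5,
                                   scoring='roc_auc', n_jobs=-1, random_state=0)
## random_search.fit(X_train, y_train); then read random_search.best_params_ as with GridSearchCV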
## build dataset
df = build_salary_dataset(G)
X, y = get_feature_and_label(df)

## split by label availability: labeled rows for training, unlabeled rows for prediction
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)  ## alternative random split
X_train, y_train = X.loc[~y.isna(), :], y[~y.isna()]
X_test, y_test = X.loc[y.isna(), :], y[y.isna()]
def salary_predictions(X_train, y_train, X_test, y_test):
    ## scale dataset
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    ## get best parameters (grid search on the labeled rows)
    best_param = get_best_parameters(X, y, tune=True)

    ## train model with the selected hyperparameters
    clf = RandomForestClassifier(max_depth=best_param.best_params_['rfc__max_depth'],
                                 max_features=best_param.best_params_['rfc__max_features'],
                                 min_samples_leaf=best_param.best_params_['rfc__min_samples_leaf'],
                                 min_samples_split=best_param.best_params_['rfc__min_samples_split'],
                                 n_estimators=best_param.best_params_['rfc__n_estimators'],
                                 n_jobs=-1, random_state=0)
    clf.fit(X_train_scaled, y_train)

    ## return the probability of the positive class (receiving a management salary)
    y_pred = clf.predict_proba(X_test_scaled)
    probs = y_pred[:, 1]
    return pd.Series(probs, index=X_test.index)
pred = salary_predictions(X_train, y_train, X_test, y_test)
pred
1 0.082298
2 0.900215
5 0.990329
8 0.062392
14 0.095849
...
992 0.000000
994 0.001667
996 0.000000
1000 0.010440
1001 0.026925
Length: 252, dtype: float64
Performance evaluation
## evaluate on the labeled rows only
mask = ~y.isna()
X_known, y_known = X[mask], y[mask]

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rfc', RandomForestClassifier(random_state=0))
])

metrics = ['roc_auc', 'average_precision', 'balanced_accuracy']
perfname = ['AUROC', 'AUPRC', 'Balanced Accuracy']

for perf, name in zip(metrics, perfname):
    scores = cross_val_score(pipeline, X_known, y_known, cv=5, scoring=perf)
    print(f"Averaged {name}: {scores.mean():.3f}")
Averaged AUROC: 0.954
Averaged AUPRC: 0.851
Averaged Balanced Accuracy: 0.771
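The loop above refits the pipeline separately for each metric. scikit-learn's cross_validate can score all three metrics in a single pass over the folds; a minimal sketch using the same pipeline and metric lists:

from sklearn.model_selection import cross_validate

cv_res = cross_validate(pipeline, X_known, y_known, cv=5, scoring=metrics)
for perf, name in zip(metrics, perfname):
    print(f"Averaged {name}: {cv_res[f'test_{perf}'].mean():.3f}")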
New Connections Prediction
Here I aim to predict future connections between employees of the network. The future connections information is loaded into the variable future_connections. The index is a tuple indicating a pair of nodes that currently do not have a connection, and the Future Connection column indicates whether an edge between those two nodes will exist in the future, where a value of 1.0 indicates a future connection.
future_connections = pd.read_csv('data/future_connections.csv', index_col=0, converters={0: eval})
future_connections
|            | Future Connection |
|------------|-------------------|
| (6, 840)   | 0.0 |
| (4, 197)   | 0.0 |
| (620, 979) | 0.0 |
| (519, 872) | 0.0 |
| (382, 423) | 0.0 |
| ...        | ... |
| (165, 923) | NaN |
| (673, 755) | NaN |
| (939, 940) | NaN |
| (555, 905) | NaN |
| (75, 101)  | NaN |

488446 rows × 1 columns
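A side note on the loading step: converters={0: eval} parses index strings such as '(6, 840)' back into tuples. ast.literal_eval does the same parsing without executing arbitrary code, so a safer variant would be:

import ast
future_connections = pd.read_csv('data/future_connections.csv', index_col=0,
                                 converters={0: ast.literal_eval})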
Here I aim to predict whether pairs of people will form a future connection. To this end:

- I create a matrix of features for the edges found in future_connections using NetworkX (the link-prediction measures are illustrated just below this list)
- I train a sklearn classifier on those edges in future_connections that have Future Connection data
- I predict the probability of an edge being a future connection for those edges in future_connections where Future Connection is missing
- I evaluate the model with different metrics: AUROC, AUPRC and Balanced Accuracy
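Each NetworkX link-prediction generator yields (u, v, score) triples and accepts an optional ebunch of node pairs to score. A minimal illustration on a single candidate pair ((0, 9) is assumed to be an unconnected pair in G):

u, v, score = next(nx.jaccard_coefficient(G, [(0, 9)]))
print(f"Jaccard coefficient of ({u}, {v}): {score:.3f}")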
## build data
def build_connection_dataset(future_connections, G):
    df = future_connections.copy()  ## deep copy of future_connections
    df.rename(columns={'Future Connection': 'future_conn'}, inplace=True)
    ## collect (u, v, score) triples into a dict keyed by the node pair;
    ## pandas aligns the dict keys with the (u, v) index on assignment
    map_edges = lambda ed: {(x, y): z for x, y, z in ed}
    ## edge-level link-prediction features
    df['common_neigh'] = map_edges([(e[0], e[1], len(list(nx.common_neighbors(G, e[0], e[1])))) for e in nx.non_edges(G)])
    df['jaccard_coef'] = map_edges(nx.jaccard_coefficient(G))
    df['alloc_index'] = map_edges(nx.resource_allocation_index(G))
    df['adamic_adar'] = map_edges(nx.adamic_adar_index(G))
    df['pref_attach'] = map_edges(nx.preferential_attachment(G))
    df.sort_index(inplace=True)
    return df
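One caveat: iterating nx.non_edges(G) scores every unconnected pair in the graph, even though only the pairs listed in future_connections are needed. The same generators accept an explicit ebunch, so a leaner variant could restrict scoring to the DataFrame's index; a sketch of the idea inside build_connection_dataset (same features, smaller workload):

pairs = list(df.index)  ## only the candidate pairs we actually need
df['jaccard_coef'] = map_edges(nx.jaccard_coefficient(G, pairs))
df['pref_attach'] = map_edges(nx.preferential_attachment(G, pairs))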
def get_feature_and_label(df, label='future_conn'):
    X = df.drop(columns=[label])  ## equivalently df.iloc[:, 1:]
    y = df[label]                 ## equivalently df.iloc[:, 0]
    return X, y
df = build_connection_dataset(future_connections, G)
X, y = get_feature_and_label(df, label='future_conn')

## split by label availability: labeled rows for training, unlabeled rows for prediction
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)  ## alternative random split
X_train, y_train = X.loc[~y.isna(), :], y[~y.isna()]
X_test, y_test = X.loc[y.isna(), :], y[y.isna()]
class GridDict(dict):
    """A dict that exposes itself as best_params_, mimicking a fitted GridSearchCV."""
    def __init__(self, *args, **kwargs):
        super(GridDict, self).__init__(*args, **kwargs)
        self.best_params_ = self
def get_best_parameters(X_train, y_train, tune=True):
    ## pipeline: the scaling is included in each CV fold, making the model selection more robust
    pipeline = Pipeline([('scaler', StandardScaler()),                    # Step 1: scale the data
                         ('rfc', RandomForestClassifier(random_state=0))  # Step 2: train the model
                        ])
    ## grid search
    param_grid = {
        'rfc__n_estimators': [50, 100, 200],
        'rfc__max_depth': [10, 20, 30],
        'rfc__min_samples_split': [2, 5, 10],
        'rfc__min_samples_leaf': [1, 2, 4],
        'rfc__max_features': ['sqrt', 'log2']
    }
    if tune:
        grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        return grid_search
    else:
        ## random subsampling of the grid to avoid the grader's TimeoutError
        ## to select the first element instead: [value[0]]
        param_grid = {key: random.choice(value) for key, value in param_grid.items()}
        return GridDict(param_grid)
def new_connections_predictions(X_train, y_train, X_test, y_test, tune=False):
    ## scale dataset
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    ## get best parameters (this version of get_best_parameters expects the labeled training data)
    best_param = get_best_parameters(X_train, y_train, tune=tune)

    ## train model with the selected hyperparameters
    clf = RandomForestClassifier(max_depth=best_param.best_params_['rfc__max_depth'],
                                 max_features=best_param.best_params_['rfc__max_features'],
                                 min_samples_leaf=best_param.best_params_['rfc__min_samples_leaf'],
                                 min_samples_split=best_param.best_params_['rfc__min_samples_split'],
                                 n_estimators=best_param.best_params_['rfc__n_estimators'],
                                 n_jobs=-1, random_state=0)
    clf.fit(X_train_scaled, y_train)

    ## return the probability of the positive class (a future connection)
    y_pred = clf.predict_proba(X_test_scaled)
    probs = y_pred[:, 1]
    return pd.Series(probs, index=X_test.index)
pred = new_connections_predictions(X_train, y_train, X_test, y_test, tune=False)
pred
(0, 9) 0.015451
(0, 19) 0.052432
(0, 20) 0.332290
(0, 35) 0.006085
(0, 38) 0.009872
...
(998, 999) 0.014456
(1000, 1002) 0.011865
(1000, 1003) 0.011865
(1000, 1004) 0.011865
(1001, 1002) 0.012798
Length: 122112, dtype: float64
Performance evaluation
## evaluate on the labeled rows only
mask = ~y.isna()
X_known, y_known = X[mask], y[mask]

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rfc', RandomForestClassifier(random_state=0))
])

metrics = ['roc_auc', 'average_precision', 'balanced_accuracy']
perfname = ['AUROC', 'AUPRC', 'Balanced Accuracy']

for perf, name in zip(metrics, perfname):
    scores = cross_val_score(pipeline, X_known, y_known, cv=5, scoring=perf)
    print(f"Averaged {name}: {scores.mean():.3f}")
Averaged AUROC: 0.888
Averaged AUPRC: 0.736
Averaged Balanced Accuracy: 0.786
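For context, AUPRC is best read against the positive-class prevalence, which is the expected score of a random classifier; a one-line baseline:

print(f"Positive-class prevalence: {y_known.mean():.3f}")  ## random-classifier AUPRC baseline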