Predicting salary and new connections from network data

Note

Note: data from the Coursera course Applied Social Network Analysis in Python

import ast
import pickle
import random

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

Company Emails

Here I analyze a company's email network, where each node corresponds to a person and an edge indicates that at least one email has been sent between two people. The network also contains the node attributes Department and ManagementSalary. Department indicates the department in the company to which the person belongs, and ManagementSalary indicates whether that person is receiving a management-position salary.

# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# graph files that come from a trusted source.
# The context manager closes the file handle as soon as the graph is loaded.
with open('data/email_prediction.plk', 'rb') as graph_file:
    G = pickle.load(graph_file)

print(f"Graph with {len(nx.nodes(G))} nodes and {len(nx.edges(G))} edges")
Graph with 1005 nodes and 16706 edges

Salary Prediction

Here I aim to predict whether people without a ManagementSalary value will receive a management-position salary. To this end, I train a classifier on the people that have a ManagementSalary value, and then I predict the probability of receiving a management salary for those people where ManagementSalary is missing. Models are evaluated with different performance metrics: Area Under the ROC Curve (AUROC), Area Under the Precision-Recall Curve (AUPRC), and Balanced Accuracy.

# Show the first five (node, attribute-dict) pairs to inspect the attribute layout.
first_nodes = list(G.nodes(data=True))[:5]
first_nodes
[(0, {'Department': 1, 'ManagementSalary': 0.0}),
 (1, {'Department': 1, 'ManagementSalary': nan}),
 (581, {'Department': 3, 'ManagementSalary': 0.0}),
 (6, {'Department': 25, 'ManagementSalary': 1.0}),
 (65, {'Department': 4, 'ManagementSalary': nan})]
def build_salary_dataset(G):
    """Collect node attributes and centrality scores of ``G`` in one DataFrame.

    Parameters
    ----------
    G : networkx graph
        Graph whose nodes carry the ``ManagementSalary`` and ``Department``
        attributes.

    Returns
    -------
    pandas.DataFrame
        One column per attribute/score (``management_salary``, ``department``,
        ``clustering``, ``betweenness``, ``closenness``, ``pagerank``,
        ``hubs``, ``auth``), sorted by index.
    """
    # nx.hits returns two mappings, unpacked here as (hubs, authorities).
    hub_scores, authority_scores = nx.hits(G)

    # (column name, {node: value}) pairs in the order the columns are created.
    node_columns = [
        ('management_salary', nx.get_node_attributes(G, 'ManagementSalary')),
        ('department', nx.get_node_attributes(G, 'Department')),
        ('clustering', nx.clustering(G)),
        ('betweenness', nx.betweenness_centrality(G)),
        ('closenness', nx.closeness_centrality(G)),
        ('pagerank', nx.pagerank(G)),
        ('hubs', hub_scores),
        ('auth', authority_scores),
    ]

    table = pd.DataFrame()
    for column, per_node in node_columns:
        table[column] = per_node
    return table.sort_index()

def get_feature_and_label(df, label='management_salary'):
    """Split ``df`` into a feature frame and a label series.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the label column together with the feature columns.
    label : str, default 'management_salary'
        Name of the column to use as the label.

    Returns
    -------
    tuple
        ``(X, y)``: ``df`` without the ``label`` column, and the ``label``
        column itself.
    """
    return df.drop(columns=[label]), df[label]

class GridDict(dict):
    """Dict of hyper-parameters that also exposes itself as ``best_params_``.

    This lets a plain parameter dict be read through the same
    ``.best_params_[...]`` access used on a fitted grid-search object.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The attribute points back at the dict itself, not at a copy.
        self.best_params_ = self

def get_best_parameters(X, y, tune=True):
    """Choose random-forest hyper-parameters for the salary classifier.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; may contain rows whose label is missing.
    y : pandas.Series
        Labels aligned with ``X``. Rows with a NaN label are left out of the
        grid search.
    tune : bool, default True
        If True, run a 5-fold grid search scored by ROC AUC. If False, draw
        one random value per hyper-parameter instead of searching.

    Returns
    -------
    GridSearchCV or GridDict
        Object whose ``best_params_`` maps ``rfc__<name>`` to the chosen value.
    """
    param_grid = {
        'rfc__n_estimators': [50, 100, 200],
        'rfc__max_depth': [10, 20, 30],
        'rfc__min_samples_split': [2, 5, 10],
        'rfc__min_samples_leaf': [1, 2, 4],
        'rfc__max_features': ['sqrt', 'log2']
    }

    if not tune:
        ## random pick from the grid to avoid the grader's TimeoutError
        ## (to select the first element instead, use value[0])
        return GridDict({key: random.choice(value) for key, value in param_grid.items()})

    # Only rows with a known label can be used to fit and score candidates.
    labelled = y.notna()
    X_train, y_train = X.loc[labelled, :], y[labelled]

    # Scaling is a pipeline step so that it is part of every cross-validation fit.
    pipeline = Pipeline([('scaler', StandardScaler()),
                         ('rfc', RandomForestClassifier(random_state=0))])
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search
## build the node-level dataset and separate features from the salary label
df = build_salary_dataset(G)
X, y = get_feature_and_label(df, label='management_salary')

## rows with a known label are used for training; rows with a missing label
## are the ones to predict
X_train, y_train = X[y.notna()], y[y.notna()]
X_test, y_test = X[y.isna()], y[y.isna()]
def salary_predictions(X_train, y_train, X_test, y_test, tune=True):
    """Predict the probability of a management salary for unlabelled people.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Features of the rows with a known label.
    y_train : pandas.Series
        Known labels aligned with ``X_train``.
    X_test : pandas.DataFrame
        Features of the rows to predict.
    y_test : pandas.Series
        Not used; kept so the signature stays unchanged.
    tune : bool, default True
        Forwarded to ``get_best_parameters``.

    Returns
    -------
    pandas.Series
        Column 1 of ``predict_proba`` for each row of ``X_test``, indexed
        like ``X_test``.
    """
    ## scale dataset: fit the scaler on the training rows only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    ## get best parameters from the data passed in, not from module-level X, y
    params = get_best_parameters(X_train, y_train, tune=tune).best_params_

    ## train model
    clf = RandomForestClassifier(max_depth=params['rfc__max_depth'],
                                 max_features=params['rfc__max_features'],
                                 min_samples_leaf=params['rfc__min_samples_leaf'],
                                 min_samples_split=params['rfc__min_samples_split'],
                                 n_estimators=params['rfc__n_estimators'],
                                 n_jobs=-1, random_state=0)
    clf.fit(X_train_scaled, y_train)

    ## return proba
    probs = clf.predict_proba(X_test_scaled)[:, 1]
    return pd.Series(probs, index=X_test.index)

# Probability of a management salary for every person whose label is missing.
pred = salary_predictions(X_train=X_train, y_train=y_train,
                          X_test=X_test, y_test=y_test)
pred
1       0.082298
2       0.900215
5       0.990329
8       0.062392
14      0.095849
          ...   
992     0.000000
994     0.001667
996     0.000000
1000    0.010440
1001    0.026925
Length: 252, dtype: float64

Performance evaluation

# Evaluate on labelled rows only.
mask = y.notna()
X_known = X.loc[mask]
y_known = y.loc[mask]

# Scaler and classifier are bundled so both are passed to cross_val_score together.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rfc', RandomForestClassifier(random_state=0)),
])

metrics = ['roc_auc', 'average_precision', 'balanced_accuracy']
perfname = ['AUROC', 'AUPRC', 'Balanced Accuracy']

# One 5-fold cross-validation run per scoring metric.
for perf, name in zip(metrics, perfname):
    scores = cross_val_score(pipeline, X_known, y_known, scoring=perf, cv=5)
    print(f"Averaged {name}: {scores.mean():.3f}")
Averaged AUROC: 0.954
Averaged AUPRC: 0.851
Averaged Balanced Accuracy: 0.771

New Connections Prediction

Here I aim at predicting future connections between employees of the network. The future connections information is loaded into the variable future_connections. The index is a tuple indicating a pair of nodes that currently do not have a connection, and the Future Connection column indicates if an edge between those two nodes will exist in the future, where a value of 1.0 indicates a future connection.

# The first CSV column holds node pairs written as text, e.g. "(6, 840)".
# ast.literal_eval parses Python literals only, so unlike eval() it cannot run
# code embedded in the file.
future_connections = pd.read_csv('data/future_connections.csv', index_col=0,
                                 converters={0: ast.literal_eval})
future_connections
Future Connection
(6, 840) 0.0
(4, 197) 0.0
(620, 979) 0.0
(519, 872) 0.0
(382, 423) 0.0
... ...
(165, 923) NaN
(673, 755) NaN
(939, 940) NaN
(555, 905) NaN
(75, 101) NaN

488446 rows × 1 columns

Here I aim to predict whether pairs of people will have a future connection. To this end:

  1. I create a matrix of features for the edges found in future_connections using NetworkX
  2. I train a sklearn classifier on those edges in future_connections that have Future Connection data
  3. I predict a probability of the edge being a future connection for those edges in future_connections where Future Connection is missing.
  4. I evaluate the model with different metrics - AUROC, AUPRC and Balanced Accuracy
## build data
def build_connection_dataset(future_connections, G):
    """Attach link-prediction features to the node pairs in ``future_connections``.

    Parameters
    ----------
    future_connections : pandas.DataFrame
        Frame with a ``'Future Connection'`` column; the input frame is not
        modified.
    G : networkx graph
        Graph from which the pairwise scores are computed.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the label column renamed to ``future_conn`` and
        the columns ``common_neigh``, ``jaccard_coef``, ``alloc_index``,
        ``adamic_adar`` and ``pref_attach`` added, sorted by index.
    """
    def pair_scores(triples):
        # Turn (u, v, score) triples into a {(u, v): score} mapping.
        return {(u, v): score for u, v, score in triples}

    # rename() without inplace returns a new frame, leaving the input untouched.
    df = future_connections.rename(columns={'Future Connection': 'future_conn'})

    # Number of common neighbours for every non-edge of G.
    df['common_neigh'] = pair_scores(
        (u, v, len(list(nx.common_neighbors(G, u, v))))
        for u, v in nx.non_edges(G)
    )
    # The networkx scorers below are called without an explicit pair list,
    # so they use their default set of node pairs.
    df['jaccard_coef'] = pair_scores(nx.jaccard_coefficient(G))
    df['alloc_index'] = pair_scores(nx.resource_allocation_index(G))
    df['adamic_adar'] = pair_scores(nx.adamic_adar_index(G))
    df['pref_attach'] = pair_scores(nx.preferential_attachment(G))
    return df.sort_index()

def get_feature_and_label(df, label='future_conn'):
    """Split ``df`` into a feature frame and a label series.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the label column together with the feature columns.
    label : str, default 'future_conn'
        Name of the column to use as the label.

    Returns
    -------
    tuple
        ``(X, y)``: ``df`` without the ``label`` column, and the ``label``
        column itself.
    """
    return df.drop(columns=[label]), df[label]

## build the pair-level dataset and separate features from the label
df = build_connection_dataset(future_connections, G)
X, y = get_feature_and_label(df, label='future_conn')

## pairs with a known label are used for training; pairs with a missing label
## are the ones to predict
X_train, y_train = X[y.notna()], y[y.notna()]
X_test, y_test = X[y.isna()], y[y.isna()]
class GridDict(dict):
    """Dict of hyper-parameters that also exposes itself as ``best_params_``.

    This lets a plain parameter dict be read through the same
    ``.best_params_[...]`` access used on a fitted grid-search object.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The attribute points back at the dict itself, not at a copy.
        self.best_params_ = self

def get_best_parameters(X_train, y_train, tune=True):
    """Choose random-forest hyper-parameters for the connection classifier.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Features of the rows used for the grid search.
    y_train : pandas.Series
        Labels aligned with ``X_train``.
    tune : bool, default True
        If True, run a 5-fold grid search scored by ROC AUC. If False, draw
        one random value per hyper-parameter instead of searching.

    Returns
    -------
    GridSearchCV or GridDict
        Object whose ``best_params_`` maps ``rfc__<name>`` to the chosen value.
    """
    param_grid = {
        'rfc__n_estimators': [50, 100, 200],
        'rfc__max_depth': [10, 20, 30],
        'rfc__min_samples_split': [2, 5, 10],
        'rfc__min_samples_leaf': [1, 2, 4],
        'rfc__max_features': ['sqrt', 'log2']
    }

    if not tune:
        ## random pick from the grid to avoid the grader's TimeoutError
        ## (to select the first element instead, use value[0])
        return GridDict({key: random.choice(value) for key, value in param_grid.items()})

    # Scaling is a pipeline step so that it is part of every cross-validation fit.
    pipeline = Pipeline([('scaler', StandardScaler()),
                         ('rfc', RandomForestClassifier(random_state=0))])
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search
    
def new_connections_predictions(X_train, y_train, X_test, y_test, tune=False):
    """Predict the probability of a future connection for unlabelled node pairs.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Features of the pairs with a known label.
    y_train : pandas.Series
        Known labels aligned with ``X_train``.
    X_test : pandas.DataFrame
        Features of the pairs to predict.
    y_test : pandas.Series
        Not used; kept so the signature stays unchanged.
    tune : bool, default False
        Forwarded to ``get_best_parameters``.

    Returns
    -------
    pandas.Series
        Column 1 of ``predict_proba`` for each row of ``X_test``, indexed
        like ``X_test``.
    """
    ## scale dataset: fit the scaler on the training rows only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    ## get best parameters from the labelled training rows; the module-level
    ## X, y also hold the rows whose label is NaN and must not reach the search
    params = get_best_parameters(X_train, y_train, tune=tune).best_params_

    ## train model
    clf = RandomForestClassifier(max_depth=params['rfc__max_depth'],
                                 max_features=params['rfc__max_features'],
                                 min_samples_leaf=params['rfc__min_samples_leaf'],
                                 min_samples_split=params['rfc__min_samples_split'],
                                 n_estimators=params['rfc__n_estimators'],
                                 n_jobs=-1, random_state=0)
    clf.fit(X_train_scaled, y_train)

    ## return proba
    probs = clf.predict_proba(X_test_scaled)[:, 1]
    return pd.Series(probs, index=X_test.index)

# Probability of a future connection for every pair whose label is missing.
pred = new_connections_predictions(X_train=X_train, y_train=y_train,
                                   X_test=X_test, y_test=y_test, tune=False)
pred
(0, 9)          0.015451
(0, 19)         0.052432
(0, 20)         0.332290
(0, 35)         0.006085
(0, 38)         0.009872
                  ...   
(998, 999)      0.014456
(1000, 1002)    0.011865
(1000, 1003)    0.011865
(1000, 1004)    0.011865
(1001, 1002)    0.012798
Length: 122112, dtype: float64

Performance evaluation

# Evaluate on labelled pairs only.
mask = y.notna()
X_known = X.loc[mask]
y_known = y.loc[mask]

# Scaler and classifier are bundled so both are passed to cross_val_score together.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rfc', RandomForestClassifier(random_state=0)),
])

metrics = ['roc_auc', 'average_precision', 'balanced_accuracy']
perfname = ['AUROC', 'AUPRC', 'Balanced Accuracy']

# One 5-fold cross-validation run per scoring metric.
for perf, name in zip(metrics, perfname):
    scores = cross_val_score(pipeline, X_known, y_known, scoring=perf, cv=5)
    print(f"Averaged {name}: {scores.mean():.3f}")
Averaged AUROC: 0.888
Averaged AUPRC: 0.736
Averaged Balanced Accuracy: 0.786