Predicting salary and new connections from network data
Note: data from the Coursera course Applied Social Network Analysis in Python

import networkx as nx
import pandas as pd
import numpy as np
import pickle
import random
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Company Emails
Here I analyze a company's email network where each node corresponds to a person and an edge indicates that at least one email has been sent between the two people. The network also contains the node attributes Department and ManagementSalary: Department indicates the department of the company the person belongs to, and ManagementSalary indicates whether that person receives a management-position salary.
with open('data/email_prediction.plk', 'rb') as f:
    G = pickle.load(f)
print(f"Graph with {len(nx.nodes(G))} nodes and {len(nx.edges(G))} edges")

Graph with 1005 nodes and 16706 edges
Salary Prediction
Here I aim to predict whether people without a ManagementSalary value will receive a management-position salary. To this end, I train a classifier on the people that have a ManagementSalary label, and then I predict the probability of receiving a management salary for the people where ManagementSalary is missing. Models are evaluated with several performance metrics: Area Under the ROC Curve (AUROC), Area Under the Precision-Recall Curve (AUPRC), and Balanced Accuracy.
list(G.nodes(data=True))[:5] # print the first 5 nodes

[(0, {'Department': 1, 'ManagementSalary': 0.0}),
 (1, {'Department': 1, 'ManagementSalary': nan}),
 (581, {'Department': 3, 'ManagementSalary': 0.0}),
 (6, {'Department': 25, 'ManagementSalary': 1.0}),
 (65, {'Department': 4, 'ManagementSalary': nan})]
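How many people actually carry a label? A quick check (the counts are not shown in the original, but the 252 unlabeled nodes are consistent with the length of the prediction Series further below):

salary = pd.Series(nx.get_node_attributes(G, 'ManagementSalary'))
print(f"{salary.notna().sum()} labeled nodes, {salary.isna().sum()} nodes to predict")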
def build_salary_dataset(G):
    ## node-level features: one row per person, one column per graph statistic
    df = pd.DataFrame()
    df['management_salary'] = nx.get_node_attributes(G, 'ManagementSalary')
    df['department'] = nx.get_node_attributes(G, 'Department')
    df['clustering'] = nx.clustering(G)
    df['betweenness'] = nx.betweenness_centrality(G)
    df['closeness'] = nx.closeness_centrality(G)
    df['pagerank'] = nx.pagerank(G)
    df['hubs'], df['auth'] = nx.hits(G)  # HITS returns (hubs, authorities) dicts
    df.sort_index(inplace=True)
    return df
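All of the features above are node-level graph statistics assigned in the same way, so further structural features can be added with the same pattern. A sketch (these extra features are my own suggestion, not part of the original model):

def add_extra_features(df, G):
    ## hypothetical additional node features, assigned like the ones above
    df['degree'] = dict(G.degree())                      # number of email contacts
    df['degree_cent'] = nx.degree_centrality(G)          # degree normalized by n - 1
    df['avg_neigh_deg'] = nx.average_neighbor_degree(G)  # how connected one's contacts are
    return df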
def get_feature_and_label(df, label='management_salary'):
    X = df.drop(columns=[label])  # all feature columns
    y = df[label]                 # the label column
    return X, y
class GridDict(dict):
    """A dict exposing itself as best_params_, mimicking a fitted GridSearchCV."""
    def __init__(self, *args, **kwargs):
        super(GridDict, self).__init__(*args, **kwargs)
        self.best_params_ = self
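GridDict only exists so that the tune=False branch below can return an object exposing the same best_params_ attribute as a fitted GridSearchCV; a minimal illustration:

params = GridDict({'rfc__max_depth': 10})
params.best_params_['rfc__max_depth']  # 10 -- same access pattern as grid_search.best_params_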
def get_best_parameters(X_train, y_train, tune=True):
    ## the pipeline runs the scaling inside each CV fold, making model selection more robust
    pipeline = Pipeline([('scaler', StandardScaler()),                    # step 1: scale the data
                         ('rfc', RandomForestClassifier(random_state=0))  # step 2: train the model
                         ])
    ## hyperparameter grid
    param_grid = {
        'rfc__n_estimators': [50, 100, 200],
        'rfc__max_depth': [10, 20, 30],
        'rfc__min_samples_split': [2, 5, 10],
        'rfc__min_samples_leaf': [1, 2, 4],
        'rfc__max_features': ['sqrt', 'log2']
    }
    if tune:
        grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        return grid_search
    else:
        ## randomly subsample the grid to avoid the grader's TimeoutError
        ## (to select the first element instead: [value[0]])
        param_grid = {key: random.choice(value) for key, value in param_grid.items()}
        return GridDict(param_grid)
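The tune=False branch is effectively a random search with a single draw. When the time budget allows a few more candidates, sklearn's RandomizedSearchCV sits between the two extremes; a sketch (n_iter=10 is an arbitrary choice, and pipeline and param_grid are the ones defined above):

from sklearn.model_selection import RandomizedSearchCV

def get_random_parameters(X_train, y_train, pipeline, param_grid, n_iter=10):
    ## samples n_iter settings instead of evaluating all 162 grid combinations
    rs = RandomizedSearchCV(pipeline, param_grid, n_iter=n_iter, cv=5,
                            scoring='roc_auc', n_jobs=-1, random_state=0)
    rs.fit(X_train, y_train)
    return rs  # exposes best_params_ just like GridSearchCV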
## build the dataset and split it: train on the people with a known ManagementSalary,
## predict for the people where it is missing
df = build_salary_dataset(G)
X, y = get_feature_and_label(df)
X_train, y_train = X.loc[~y.isna(), :], y[~y.isna()]
X_test, y_test = X.loc[y.isna(), :], y[y.isna()]

def salary_predictions(X_train, y_train, X_test, y_test):
    ## scale the dataset (fit the scaler on the training set only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    ## get the best parameters
    best_param = get_best_parameters(X_train, y_train, tune=True)
    ## train the model
    clf = RandomForestClassifier(max_depth=best_param.best_params_['rfc__max_depth'],
                                 max_features=best_param.best_params_['rfc__max_features'],
                                 min_samples_leaf=best_param.best_params_['rfc__min_samples_leaf'],
                                 min_samples_split=best_param.best_params_['rfc__min_samples_split'],
                                 n_estimators=best_param.best_params_['rfc__n_estimators'],
                                 n_jobs=-1, random_state=0)
    clf.fit(X_train_scaled, y_train)
    ## return the probability of the positive class (management salary)
    y_pred = clf.predict_proba(X_test_scaled)
    probs = y_pred[:, 1]
    return pd.Series(probs, index=X_test.index)
pred = salary_predictions(X_train, y_train, X_test, y_test)
pred

1       0.082298
2       0.900215
5       0.990329
8       0.062392
14      0.095849
          ...
992     0.000000
994     0.001667
996     0.000000
1000    0.010440
1001    0.026925
Length: 252, dtype: float64
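Sorting the Series makes the top candidates easy to inspect; for instance:

pred.sort_values(ascending=False).head()  # people most likely to receive a management salary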
Performance evaluation
mask = ~y.isna()
X_known, y_known = X[mask], y[mask]
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rfc', RandomForestClassifier(random_state=0))
])
metrics = ['roc_auc', 'average_precision', 'balanced_accuracy']
perfname = ['AUROC', 'AUPRC', 'Balanced Accuracy']
for perf, name in zip(metrics, perfname):
    scores = cross_val_score(pipeline, X_known, y_known, cv=5, scoring=perf)
    print(f"Averaged {name}: {scores.mean():.3f}")

Averaged AUROC: 0.954
Averaged AUPRC: 0.851
Averaged Balanced Accuracy: 0.771
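AUROC and AUPRC are threshold-free, while balanced accuracy depends on the default 0.5 decision threshold, so the class balance is worth checking when interpreting the gap between them (the exact proportions are not shown in the original):

y_known.value_counts(normalize=True)  # fraction of management vs non-management salaries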
New Connections Prediction
Here I aim to predict future connections between employees of the network. The future connection information is loaded into the variable future_connections. The index is a tuple indicating a pair of nodes that currently do not have a connection, and the Future Connection column indicates whether an edge between those two nodes will exist in the future, where a value of 1.0 indicates a future connection.
future_connections = pd.read_csv('data/future_connections.csv', index_col=0, converters={0: eval})
future_connections

|            | Future Connection |
|------------|-------------------|
| (6, 840)   | 0.0 |
| (4, 197)   | 0.0 |
| (620, 979) | 0.0 |
| (519, 872) | 0.0 |
| (382, 423) | 0.0 |
| ...        | ... |
| (165, 923) | NaN |
| (673, 755) | NaN |
| (939, 940) | NaN |
| (555, 905) | NaN |
| (75, 101)  | NaN |

488446 rows × 1 columns
Here I aim to predict whether pairs of people will form a future connection. To this end:
- I create a matrix of features for the edges found in future_connections using NetworkX (see the sketch after this list)
- I train a sklearn classifier on those edges in future_connections that have Future Connection data
- I predict the probability of the edge being a future connection for those edges in future_connections where Future Connection is missing
- I evaluate the model with different metrics: AUROC, AUPRC, and Balanced Accuracy
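As a sketch of what one of these edge features measures: the Jaccard coefficient of a pair (u, v) is |N(u) ∩ N(v)| / |N(u) ∪ N(v)|, the overlap of the two nodes' neighborhoods. It can be computed for a single candidate pair (the pair (0, 9) is illustrative; it appears in the prediction output below):

print(list(nx.jaccard_coefficient(G, [(0, 9)])))  # [(0, 9, score)]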
## build data
def build_connection_dataset(future_connections, G):
    df = future_connections.copy()  # deep copy of future_connections
    df.rename(columns={'Future Connection': 'future_conn'}, inplace=True)
    ## networkx link-prediction functions yield (u, v, score) triples; turn them into {(u, v): score}
    map_edges = lambda ed: {(u, v): score for u, v, score in ed}
    df['common_neigh'] = map_edges([(u, v, len(list(nx.common_neighbors(G, u, v)))) for u, v in nx.non_edges(G)])
    df['jaccard_coef'] = map_edges(nx.jaccard_coefficient(G))
    df['alloc_index'] = map_edges(nx.resource_allocation_index(G))
    df['adamic_adar'] = map_edges(nx.adamic_adar_index(G))
    df['pref_attach'] = map_edges(nx.preferential_attachment(G))
    df.sort_index(inplace=True)
    return df
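A note on cost: nx.non_edges(G) enumerates every unconnected pair, and each link-prediction call above walks all of them again. Since only the pairs indexed in future_connections are needed, the ebunch argument of the networkx link-prediction functions can restrict the computation to those pairs; a minimal sketch under that assumption (not verified against the original output):

def build_connection_dataset_fast(future_connections, G):
    df = future_connections.copy()
    df.rename(columns={'Future Connection': 'future_conn'}, inplace=True)
    pairs = list(df.index)  # score only the candidate pairs we actually need
    to_dict = lambda triples: {(u, v): score for u, v, score in triples}
    df['common_neigh'] = {(u, v): len(list(nx.common_neighbors(G, u, v))) for u, v in pairs}
    df['jaccard_coef'] = to_dict(nx.jaccard_coefficient(G, pairs))
    df['alloc_index'] = to_dict(nx.resource_allocation_index(G, pairs))
    df['adamic_adar'] = to_dict(nx.adamic_adar_index(G, pairs))
    df['pref_attach'] = to_dict(nx.preferential_attachment(G, pairs))
    df.sort_index(inplace=True)
    return df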
df = build_connection_dataset(future_connections, G)
X, y = get_feature_and_label(df, label='future_conn')  # reuses the helper defined above
## train on the pairs with known Future Connection, predict for the missing ones
X_train, y_train = X.loc[~y.isna(), :], y[~y.isna()]
X_test, y_test = X.loc[y.isna(), :], y[y.isna()]
## GridDict and get_best_parameters are reused from the Salary Prediction section above
def new_connections_predictions(X_train, y_train, X_test, y_test, tune=False):
    ## scale the dataset (fit the scaler on the training set only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    ## get the best parameters
    best_param = get_best_parameters(X_train, y_train, tune=tune)
    ## train the model
    clf = RandomForestClassifier(max_depth=best_param.best_params_['rfc__max_depth'],
                                 max_features=best_param.best_params_['rfc__max_features'],
                                 min_samples_leaf=best_param.best_params_['rfc__min_samples_leaf'],
                                 min_samples_split=best_param.best_params_['rfc__min_samples_split'],
                                 n_estimators=best_param.best_params_['rfc__n_estimators'],
                                 n_jobs=-1, random_state=0)
    clf.fit(X_train_scaled, y_train)
    ## return the probability of the positive class (future connection)
    y_pred = clf.predict_proba(X_test_scaled)
    probs = y_pred[:, 1]
    return pd.Series(probs, index=X_test.index)
pred = new_connections_predictions(X_train, y_train, X_test, y_test, tune=False)
pred

(0, 9)          0.015451
(0, 19)         0.052432
(0, 20)         0.332290
(0, 35)         0.006085
(0, 38)         0.009872
                  ...
(998, 999)      0.014456
(1000, 1002)    0.011865
(1000, 1003)    0.011865
(1000, 1004)    0.011865
(1001, 1002)    0.012798
Length: 122112, dtype: float64
Performance evaluation
mask = ~y.isna()
X_known, y_known = X[mask], y[mask]
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rfc', RandomForestClassifier(random_state=0))
])
metrics = ['roc_auc', 'average_precision', 'balanced_accuracy']
perfname = ['AUROC', 'AUPRC', 'Balanced Accuracy']
for perf, name in zip(metrics, perfname):
    scores = cross_val_score(pipeline, X_known, y_known, cv=5, scoring=perf)
    print(f"Averaged {name}: {scores.mean():.3f}")

Averaged AUROC: 0.888
Averaged AUPRC: 0.736
Averaged Balanced Accuracy: 0.786