```python
# Imports used throughout this post (inferred from the code below)
import numpy as np
import pandas as pd
import scipy.spatial.distance as spsd
import networkx as nx
import matplotlib.pyplot as plt


def sim_score(x:pd.Series, uid:pd.Series, t1=None, t2=None):
    """Compute the similarity score for a variable, dispatching on its type

    Notes
    -----
    The problem with the current implementation is that we calculate all pairs of
    distances and then we prune to keep only the neighbours for each instance.
    A better approach would be to only generate neighbours as needed.
    """
    # Check the type of the variable
    # The method to check for the category type was obtained from
    # https://pandas.pydata.org/docs/dev/user_guide/categorical.html#gotchas
    if hasattr(x, "cat"):
        # Check if the categorical is ordered
        if x.cat.ordered:
            res = _sim_ord_score(x, uid)
        else:
            res = _sim_nom_score(x, uid)
    else:
        res = _sim_cont_score(x, uid, t1, t2)
    return res
```
Exploratory data analysis. Neighbours graph
Introduction
We want to study the heterogeneity of the sample and how the target is distributed across it. To do so we will first identify groups of observations that are close to each other (small clusters) and then represent them on a graph.
We will define a two-level similarity metric: first we define a similarity metric for each variable, and then an overall metric which is simply the sum of the per-variable similarities.
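In symbols, writing \(s^{(k)}_{i,j}\) for the similarity between instances \(i\) and \(j\) on variable \(k\), the overall score is simply

\[ S_{i,j} = \sum_{k} s^{(k)}_{i,j} \]

(The \(S_{i,j}\) notation is introduced here for convenience; in the code below this sum appears as the `weight` column after grouping the edges of each pair.)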
The similarity metric for each variable will depend on the type of the variable:
- Binary or nominal: The similarity will be 1 if they agree on value and 0 otherwise.
- Ordinal: The similarity will be 1 if they share the same value, 0.5 if they differ in only one level and 0 otherwise.
- Continuous: Define two thresholds \(t_{1}\) and \(t_{2}\) with \(t_{1} < t_{2}\). Calculate the distance between two instances \(d_{i,j}\). Then, the similarity \(s_{i,j}\) is, \[ s_{i,j} = \begin{cases} 1 & d_{i,j} \leq t_{1} \\ 0.5 & t_{1} < d_{i,j} \leq t_{2} \\ 0 & \text{otherwise} \end{cases} \]
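For example, with \(t_{1} = 3\) and \(t_{2} = 5\), distances of 2, 4 and 6 would map to similarities of 1, 0.5 and 0 respectively.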
Implementation of similarity
```python
def _sim_cont_score(x:pd.Series, uid:pd.Series, t1:float, t2:float) -> pd.DataFrame:
    """Compute similarity score for a continuous variable

    Parameters
    ----------
    x :
        Observations of the variable
    uid :
        Unique identifier of each instance
    t1 :
        1-similarity threshold
    t2 :
        0.5-similarity threshold

    It assumes missing values are encoded as NaN and that `t1` < `t2`.
    Similarity is defined based on the thresholds:
    if dist(x_{1}, x_{2}) <= `t1` -> similarity = 1,
    else if `t1` < dist(x_{1}, x_{2}) <= `t2` -> similarity = 0.5,
    otherwise similarity = 0.

    Notes
    -----
    We need to consider the following cases:

    1. Both observations are real. In this case we calculate a distance
       and the similarity is based on the thresholds.
    2. One observation is real and the other is missing (NaN). The distance is
       NaN and the similarity is 0.
    3. Both observations are missing. The distance is NaN and the similarity
       is debatable. We set it to 1 for now.
    """
    # Convert into numpy array with appropriate dimension
    x_arr = np.expand_dims(x.values, axis=1)

    # Get indices of upper triangular matrix (w/o main diagonal)
    # This is so we can map the output of pdist to the appropriate instances
    src, dest = np.triu_indices(len(x_arr), k=1)

    # Calculate pairwise distances
    dists = spsd.pdist(x_arr)

    # Calculate similarity
    condlist = [dists <= t1, dists <= t2]
    choicelist = [1, 0.5]
    sims = np.select(condlist, choicelist)

    # Define output dataframe, mapping positional indices to instance identifiers
    res = pd.DataFrame({
        'src': [uid[idx] for idx in src],
        'dest': [uid[idx] for idx in dest],
        'src_obs': [x_arr[idx][0] for idx in src],
        'dest_obs': [x_arr[idx][0] for idx in dest],
        'dist': dists,
        'sim': sims})

    # Compute similarity for missing observations
    miss_1 = res.src_obs.isna()
    miss_2 = res.dest_obs.isna()
    # Case 1: One of the observations is missing
    res.loc[miss_1 | miss_2, ['dist', 'sim']] = [np.nan, 0]
    # Case 2: Both observations are missing (overrides case 1)
    res.loc[miss_1 & miss_2, ['dist', 'sim']] = [np.nan, 1]

    return res.loc[res.sim > 0, ['src', 'dest', 'sim']]
```
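The Notes in `sim_score` flag that materialising every pairwise distance only to prune most of them is wasteful. As a minimal sketch of the lazier alternative hinted at there (the function name and shape are hypothetical, not part of the original code), the same threshold rule for a continuous variable can be generated pair by pair:

```python
from itertools import combinations

def iter_cont_neighbours(x: pd.Series, uid: pd.Series, t1: float, t2: float):
    """Hypothetical generator: yield (src, dest, sim) tuples with sim > 0."""
    vals = list(zip(uid, x.values))
    for (u1, v1), (u2, v2) in combinations(vals, 2):
        if pd.isna(v1) and pd.isna(v2):
            yield (u1, u2, 1.0)   # both missing -> similarity 1
        elif pd.isna(v1) or pd.isna(v2):
            continue              # exactly one missing -> similarity 0, skip
        else:
            d = abs(v1 - v2)
            if d <= t1:
                yield (u1, u2, 1.0)
            elif d <= t2:
                yield (u1, u2, 0.5)
```

Applied to the `simple_test` example further below, `pd.DataFrame(iter_cont_neighbours(simple_test.cont_x, simple_test.index, 3, 5), columns=['src', 'dest', 'sim'])` should reproduce the same pairs as `sim_score`.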
```python
def _sim_nom_score(x:pd.Series, uid:pd.Series):
    """Compute similarity score for a nominal variable

    Parameters
    ----------
    x :
        Observations of the variable
    uid :
        Unique identifier of each instance

    It assumes missing values are encoded as -1 (the standard code for missing
    values in category variables).
    """
    # Convert into numpy array with appropriate dimension
    x_arr = np.expand_dims(x.cat.codes, axis=1)

    # Get indices of upper triangular matrix (w/o main diagonal)
    # This is so we can map the output of pdist to the appropriate instances
    src, dest = np.triu_indices(len(x_arr), k=1)

    # Calculate pairwise distances
    dists = spsd.pdist(x_arr)

    # Calculate similarity
    sims = np.where(dists == 0, 1, 0)

    # Define output dataframe
    res = pd.DataFrame({
        'src': [uid[idx] for idx in src],
        'dest': [uid[idx] for idx in dest],
        'src_obs': [x_arr[idx][0] for idx in src],
        'dest_obs': [x_arr[idx][0] for idx in dest],
        'dist': dists,
        'sim': sims})

    # Compute similarity for missing observations
    miss_1 = res.src_obs == -1
    miss_2 = res.dest_obs == -1
    # Case 1: One of the observations is missing
    res.loc[miss_1 | miss_2, ['dist', 'sim']] = [np.nan, 0]
    # Case 2: Both observations are missing (overrides case 1)
    res.loc[miss_1 & miss_2, ['dist', 'sim']] = [np.nan, 1]

    return res.loc[res.sim > 0, ['src', 'dest', 'sim']]
```
```python
def _sim_ord_score(x:pd.Series, uid:pd.Series):
    """Compute similarity score for an ordinal variable

    Notes
    -----
    NaN values are coded as -1 in categorical data types.
    The similarity between an instance with NaN and an instance without NaN
    should be 0, but the similarity between two instances with NaN should be 1.
    """
    # For now, even though the variable is ordinal, we consider each category
    # to be close only to itself, so we fall back to the nominal score
    res = _sim_nom_score(x, uid)

    return res
```
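As written, `_sim_ord_score` does not implement the 0.5 rule for adjacent levels stated in the introduction. A hedged sketch of one way to get that behaviour (the function name is hypothetical) is to reuse `_sim_cont_score` on the category codes, after recoding the -1 missing marker to NaN:

```python
def _sim_ord_score_graded(x: pd.Series, uid: pd.Series) -> pd.DataFrame:
    """Hypothetical ordinal score: 1 for equal levels, 0.5 for adjacent ones."""
    # Recode the -1 missing marker as NaN so the continuous scorer handles it
    codes = x.cat.codes.astype('float').replace(-1, np.nan)
    # Equal levels -> distance 0 <= t1; one level apart -> distance 1 <= t2
    return _sim_cont_score(codes, uid, t1=0, t2=1)
```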
```python
# Simple example dataframe
simple_test = pd.DataFrame({
    'cont_x': [1, np.nan, 3, 7, np.nan],
    'bin_x': pd.Categorical([np.nan, 'male', 'female', 'male', np.nan], ordered=False)})
simple_test.head()
```
|  | cont_x | bin_x |
|---|---|---|
| 0 | 1.0 | NaN |
| 1 | NaN | male |
| 2 | 3.0 | female |
| 3 | 7.0 | male |
| 4 | NaN | NaN |
```python
# Calculate the similarity for the instances on this feature
sim = sim_score(simple_test.cont_x, simple_test.index, 3, 5)
sim
```
|  | src | dest | sim |
|---|---|---|---|
| 1 | 0 | 2 | 1.0 |
| 6 | 1 | 4 | 1.0 |
| 7 | 2 | 3 | 0.5 |
```python
# Calculate the similarity for the instances on this feature
sim = sim_score(simple_test.bin_x, simple_test.index)
sim
```
|  | src | dest | sim |
|---|---|---|---|
| 3 | 0 | 4 | 1 |
| 5 | 1 | 3 | 1 |
Example: Titanic dataset
The first observations of the Titanic dataset are shown below:

```python
raw_df.head()
```
|  | passengerid | survived | pclass | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
```python
raw_df.dtypes
```

```
passengerid       int64
survived          int64
pclass         category
name             object
sex            category
age             float64
sibsp             int64
parch             int64
ticket           object
fare            float64
cabin            object
embarked         object
dtype: object
```
We can classify each of the features according to both their usage in a model and their type:

```python
index_cols = ['passengerid']
tgt_col = ['survived']
bin_cols = ['sex']
nom_cols = ['name', 'ticket', 'cabin', 'embarked']
ord_cols = ['pclass', 'sibsp', 'parch']
rat_cols = ['age', 'fare']
```
Sex
```python
# Find neighbours based on sex
sex_nbh = sim_score(raw_df.sex, raw_df.passengerid)
sex_nbh = sex_nbh.rename(columns={'sim':'weight'})
```
Passenger class
```python
# Find neighbours based on passenger class
pclass_nbh = sim_score(raw_df.pclass, raw_df.passengerid)
pclass_nbh = pclass_nbh.rename(columns={'sim':'weight'})
```

```python
# Previous approach, kept for reference: neighbours via a self-merge on pclass
#nbh_cols = ['pclass']
#pclass_nbh = raw_df[index_cols + nbh_cols].copy()
#pclass_nbh = (pclass_nbh.merge(pclass_nbh, on='pclass', how='left')
#              .rename(columns={'passengerid_x': 'src', 'passengerid_y': 'dest'}))
#pclass_nbh = pclass_nbh.loc[pclass_nbh.src < pclass_nbh.dest, ['src', 'dest']]
#pclass_nbh['weight'] = 1
```
Sibsp
```python
raw_df['sibsp'].value_counts().to_frame().reset_index()
```
|  | sibsp | count |
|---|---|---|
| 0 | 0 | 608 |
| 1 | 1 | 209 |
| 2 | 2 | 28 |
| 3 | 4 | 18 |
| 4 | 3 | 16 |
| 5 | 8 | 7 |
| 6 | 5 | 5 |
We decide that each `sibsp` value is considered close to the following values:

- 0: 0
- 1: 1
- 2: 2, 3
- 3: 2, 3, 4
- 4: >= 4
```python
nbh_cols = ['sibsp']
sibsp_nbh = raw_df[index_cols + nbh_cols].copy()

sibsp_l2 = sibsp_nbh[sibsp_nbh.sibsp == 2]
sibsp_l3 = sibsp_nbh[sibsp_nbh.sibsp == 3]
sibsp_l4 = sibsp_nbh[sibsp_nbh.sibsp == 4]
sibsp_l4p = sibsp_nbh[sibsp_nbh.sibsp >= 4]

# Pairs where the sibsp value is the same: 0-0, 1-1, 2-2, ... and so on
sibsp_nbh = (sibsp_nbh.merge(sibsp_nbh, on='sibsp', how='left')
             .rename(columns={'passengerid_x': 'src', 'passengerid_y': 'dest'}))
# Pairs between sibsp values 2 and 3
sibsp_nbh_23 = (sibsp_l2[['passengerid']].merge(sibsp_l3[['passengerid']], how='cross')
                .rename(columns={'passengerid_x': 'src', 'passengerid_y': 'dest'}))
# Sort each cross pair in place so that src < dest
sibsp_nbh_23.values.sort()
# Remove duplicated pairs
sibsp_nbh = sibsp_nbh.loc[sibsp_nbh.src < sibsp_nbh.dest, ['src', 'dest']]
```
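The snippet above builds the equal-value pairs and the 2-3 pairs but stops before combining them. A hedged sketch of one way to finish the job under the grouping above (`_orient`, `sibsp_nbh_34` and `sibsp_nbh_4p` are hypothetical names, not from the original code):

```python
def _orient(df):
    # Hypothetical helper: sort each pair so that src < dest, dropping self-pairs
    arr = np.sort(df[['src', 'dest']].to_numpy(), axis=1)
    out = pd.DataFrame(arr, columns=['src', 'dest'])
    return out.loc[out.src < out.dest]

# Remaining cross-group pairs: 3-4, and the >=4 group among themselves
sibsp_nbh_34 = (sibsp_l3[['passengerid']].merge(sibsp_l4[['passengerid']], how='cross')
                .rename(columns={'passengerid_x': 'src', 'passengerid_y': 'dest'}))
sibsp_nbh_4p = (sibsp_l4p[['passengerid']].merge(sibsp_l4p[['passengerid']], how='cross')
                .rename(columns={'passengerid_x': 'src', 'passengerid_y': 'dest'}))

# Concatenate all pair sets, drop duplicates and attach a unit weight
sibsp_nbh = (pd.concat([sibsp_nbh, _orient(sibsp_nbh_23),
                        _orient(sibsp_nbh_34), _orient(sibsp_nbh_4p)])
             .drop_duplicates())
sibsp_nbh['weight'] = 1
```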
Generate the overall neighbour edges

```python
# Combine the neighbour edges obtained from each variable
nbh_edges = pd.concat([sex_nbh, pclass_nbh])
nbh_edges = nbh_edges.groupby(['src', 'dest']).sum().reset_index()

# In this first analysis, since we calculated edges over two variables, we say
# two instances are neighbours if they were close on each of the two variables
n_feats = 2
nbh_edges = nbh_edges.loc[nbh_edges.weight == n_feats]
```
```python
# Create the graph
nbh_nodes = list(raw_df.passengerid.unique())
nd_color = ['red' if surv == 1 else 'blue' for surv in raw_df.survived]

nbh = nx.Graph()
nbh.add_nodes_from(nbh_nodes)
nbh.add_edges_from([(src, dest) for src, dest in zip(nbh_edges.src, nbh_edges.dest)])
```
Compute connected components
```python
conn_comp = nx.connected_components(nbh)
dfs = list()
idx = 0
for c in conn_comp:
    dfs.append(pd.DataFrame(data={'passengerid': list(c), 'conn_comp_idx': idx}))
    idx += 1
nbh_tgt = pd.concat(dfs).sort_values('passengerid')
nbh_tgt = nbh_tgt.merge(raw_df[index_cols + tgt_col], on='passengerid', how='left')
```

```python
nbh_tgt.groupby('conn_comp_idx').agg({'survived': ['count', 'sum', 'mean']})
```
| conn_comp_idx | survived (count) | survived (sum) | survived (mean) |
|---|---|---|---|
| 0 | 347 | 47 | 0.135447 |
| 1 | 94 | 91 | 0.968085 |
| 2 | 144 | 72 | 0.500000 |
| 3 | 122 | 45 | 0.368852 |
| 4 | 76 | 70 | 0.921053 |
| 5 | 108 | 17 | 0.157407 |
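The components differ markedly in survival rate: components 1 and 4 are above 90% survival while components 0 and 5 are below 16%, so the neighbourhood structure does seem to capture heterogeneity in the target.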
Visualization
```python
# Create the graph visualization
plot_options = {"node_size": 10, "with_labels": False, "width": 0.15}
pos = nx.spring_layout(nbh, iterations=15, seed=1721)
fig, ax = plt.subplots(figsize=(15, 9))
ax.axis("off")
nx.draw_networkx(nbh, pos=pos, ax=ax, **plot_options)
```