def sim_score(x:pd.Series, uid:pd.Series, t1=None, t2=None):
    """
    
    Notes
    -----
    The problem with the current implementation is that we calculate all pairs of
    distances and then we prune to keep only the neighbours for each instance.
    A better approach would be to only generate neighbours as needed
    """
    # Check for type of variable
    # The method to check for category type was obtained from
    # https://pandas.pydata.org/docs/dev/user_guide/categorical.html#gotchas
    if hasattr(x, "cat"):
        # Check if categorical is ordered
        if x.cat.ordered:
            res = _sim_ord_score(x, uid)
        else:
            res = _sim_nom_score(x, uid)
    else:
        res = _sim_cont_score(x, uid, t1, t2)
    return resExploratory data analysis. Neighbours graph
networks
    eda
  Introduction
We want to study the heterogeneity of the sample and how is the target distributed along the samples. To do so we will first identify groups of observations that are close to each other (small cluster) and then we will represent this on a graph.
We will define a hierarchical? similarity metric where first we define similarity metrics across each variable and then a higher order? metric which is simply the sum of the similarities for each metric.
The similarity metric for each variable will depend on the type of the variable:
- Binary or nominal: The similarity will be 1 if they agree on value and 0 otherwise.
 - Ordinal: The similarity will be 1 if they share the same value, 0.5 if they differ in only one level and 0 otherwise.
 - Continuous: Define two thresholds \(t_{1}\) and \(t_{2}\) with \(t_{1} < t_{2}\). Calculate the distance between two instances \(d_{i,j}\). Then, the similarity \(s_{i,j}\) is, \[ s_{i,j} = \begin{cases} 1 & d_{i,j \leq t_{1}} \\ 0.5 & t_{1} < d_{i,j \leq t_{2}} \\ 0 & \text{otherwise} \end{cases} \]
 
Implementation of similarity
def _sim_cont_score(x:pd.Series, uid:pd.Series, t1:float, t2:float) -> pd.DataFrame:
    """Compute similarity score for a continuous variable
    Parameters
    ----------
    x :
        Observations of the variable
    uid:
        Unique identifier of instance
    t1 :
        1-similarity threshold
    t2 :
        0.5-similarity threshold
    It assumes missing values are encoded as NaN and that `t1` < `t2`
    Similarity is defined based on thresholds,
        If dist(x_{1},  x_{2}) <= `t1` -> similarity = 1,
        else if `t1`<= dist(x_{1},  x_{2}) <= `t2` -> similarity = 0.5
        otherwise similarity = 0
    
    Notes
    -----
    We need to consider the following cases:
    1. Both observations are real. In this case we calculate a distance 
       and the similarity is based on the thresholds
    2. One observation is real and the other is missing (NaN). The distance is
       NaN and the similarity is 0
    3. Both obsrvations are missing. The distance is NaN and the similarity 
       will depend. We set to 1 at the beginning
    """
    # Convert into nupy array with appropriate dimension
    x_arr = np.expand_dims(x.values, axis=1) 
    # Get indices of upper triangular matrix (w/o main diagonal)
    # This is so we can map the output of pdist to the appropriate instances
    src, dest = np.triu_indices(len(x_arr), k=1)
    # Calculate pairwise distances
    dists = spsd.pdist(x_arr)
    # Calculate similarity
    condlist = [dists <= t1, dists <= t2]
    choicelist = [1, 0.5]
    sims = np.select(condlist, choicelist)
    # Define output dataframe
    res =  pd.DataFrame({
        'src':src, 
        'dest':dest,
        'src_obs': [x_arr[idx][0] for idx in src],
        'dest_obs': [x_arr[idx][0] for idx in dest],
        'dist':dists, 
        'sim':sims})
    
    # Compute similarity for missing observations
    miss_1 = res.src_obs.isna()
    miss_2 = res.dest_obs.isna()
    # Case 1: One of the observations in missing
    res.loc[miss_1 | miss_2, ['dist', 'sim']] = [np.nan, 0]
    # Case 2: Both observations are missing
    res.loc[miss_1 & miss_2, ['dist', 'sim']] = [np.nan, 1]
    return res.loc[res.sim > 0, ['src', 'dest', 'sim']]def _sim_nom_score(x:pd.Series, uid:pd.Series):
    """
    Parameters
    ----------
    x :
        Observations of the variable
    uid:
        Unique identifier of instance
    It assumes missing values are encoded as -1 (standard code for category variables) 
    """
    # Convert into nupy array with appropriate dimension
    x_arr = np.expand_dims(x.cat.codes, axis=1)
    # Get indices of upper triangular matrix (w/o main diagonal)
    # This is so we can map the output of pdist to the appropriate instances
    src, dest = np.triu_indices(len(x_arr), k=1)
    # Calculate pairwise distances
    dists = spsd.pdist(x_arr)
    # Calculate similarity
    sims = np.where(dists == 0, 1, 0)
    
    # Define output dataframe
    res =  pd.DataFrame({
        'src':[uid[idx] for idx in src], 
        'dest':[uid[idx] for idx in dest],
        'src_obs': [x_arr[idx][0] for idx in src],
        'dest_obs': [x_arr[idx][0] for idx in dest],
        'dist':dists, 
        'sim':sims})
    
    # Compute similarity for missing observations
    miss_1 = res.src_obs == -1
    miss_2 = res.dest_obs == -1
    # Case 1: One of the observations in missing
    res.loc[miss_1 | miss_2, ['dist', 'sim']] = [np.nan, 0]
    # Case 2: Both observations are missing
    res.loc[miss_1 & miss_2, ['dist', 'sim']] = [np.nan, 1]
    return res.loc[res.sim > 0, ['src', 'dest', 'sim']]def _sim_ord_score(x:pd.Series, uid:pd.Series):
    """
    Notes
    -----
    NaN values are coded as -1 in categorical data types. 
    The similarity between a instance with NaN and an instance without NaN should be 0 
    but the similarity between two instances with NaN should be 1
    """
    # Case 1: Even though is ordinal we consider each category is close only to itself
    res = _sim_nom_score(x, uid)
    return res# Simple example dataframe
simple_test = pd.DataFrame({
    'cont_x':[1, np.nan, 3, 7, np.nan], 
    'bin_x':pd.Categorical([np.nan, 'male', 'female', 'male', np.nan], ordered=False)})
simple_test.head()| cont_x | bin_x | |
|---|---|---|
| 0 | 1.0 | NaN | 
| 1 | NaN | male | 
| 2 | 3.0 | female | 
| 3 | 7.0 | male | 
| 4 | NaN | NaN | 
# Calculate the similarity for the instances on this feature
sim = sim_score(simple_test.cont_x, simple_test.index, 3, 5)
sim| src | dest | sim | |
|---|---|---|---|
| 1 | 0 | 2 | 1.0 | 
| 6 | 1 | 4 | 1.0 | 
| 7 | 2 | 3 | 0.5 | 
# Calculate the similarity for the instances on this feature
sim = sim_score(simple_test.bin_x, simple_test.index)
sim| src | dest | sim | |
|---|---|---|---|
| 3 | 0 | 4 | 1 | 
| 5 | 1 | 3 | 1 | 
Example: Titanic dataset
The first observations of the titanic dataset are shown below,
raw_df.head()| passengerid | survived | pclass | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 
raw_df.dtypespassengerid       int64
survived          int64
pclass         category
name             object
sex            category
age             float64
sibsp             int64
parch             int64
ticket           object
fare            float64
cabin            object
embarked         object
dtype: object
We can classify each of the features according to both their usage in a model and their type
index_cols = ['passengerid']
tgt_col = ['survived']
bin_cols = ['sex']
nom_cols = ['name', 'ticket', 'cabin', 'embarked']
ord_cols = ['pclass', 'sibsp', 'parch']
rat_cols = ['age', 'fare']Sex
# Find neighbours based on sex
sex_nbh = sim_score(raw_df.sex, raw_df.passengerid)
sex_nbh = sex_nbh.rename(columns={'sim':'weight'})Passenger class
# Find neighbours based on passenger class
pclass_nbh = sim_score(raw_df.pclass, raw_df.passengerid)
pclass_nbh = pclass_nbh.rename(columns={'sim':'weight'})# Find neighbours based on passenger class
#nbh_cols = ['pclass']
#pclass_nbh = raw_df[index_cols + nbh_cols].copy()
#pclass_nbh = (pclass_nbh.merge(pclass_nbh, on='pclass', how='left')
#              .rename(columns={'passengerid_x': 'src', 'passengerid_y': 'dest'}))
#pclass_nbh = pclass_nbh.loc[pclass_nbh.src < pclass_nbh.dest, ['src', 'dest']]
#pclass_nbh['weight'] = 1Sibsp
raw_df['sibsp'].value_counts().to_frame().reset_index()| sibsp | count | |
|---|---|---|
| 0 | 0 | 608 | 
| 1 | 1 | 209 | 
| 2 | 2 | 28 | 
| 3 | 4 | 18 | 
| 4 | 3 | 16 | 
| 5 | 8 | 7 | 
| 6 | 5 | 5 | 
We decide to group as follows:
- 0: 0
 - 1: 1
 - 2: 2, 3
 - 3: 2, 3, 4
 - 4: >= 4
 
nbh_cols = ['sibsp']
sibsp_nbh = raw_df[index_cols + nbh_cols].copy()
sibsp_l2 = sibsp_nbh[sibsp_nbh.sibsp == 2]
sibsp_l3 = sibsp_nbh[sibsp_nbh.sibsp == 3]
sibsp_l4 = sibsp_nbh[sibsp_nbh.sibsp == 4]
sibsp_l4p = sibsp_nbh[sibsp_nbh.sibsp >= 4]
# Pairs where sibsp value is the same 0-0, 1-1, 2-2, ... and so on
sibsp_nbh = (sibsp_nbh.merge(sibsp_nbh, on='sibsp', how='left')
              .rename(columns={'passengerid_x': 'src', 'passengerid_y': 'dest'}))
sibsp_nbh_23 = (sibsp_l2[['passengerid']].merge(sibsp_l3[['passengerid']], how='cross')
                 .rename(columns={'passengerid_x': 'src', 'passengerid_y': 'dest'}))
sibsp_nbh_23.values.sort()
# Remove duplicated
sibsp_nbh = sibsp_nbh.loc[sibsp_nbh.src < sibsp_nbh.dest, ['src', 'dest']]Generate neighbours overall edges
# Add neighbours edges between every 
nbh_edges = pd.concat([sex_nbh, pclass_nbh])
nbh_edges = nbh_edges.groupby(['src', 'dest']).sum().reset_index()
# In our first analysis, since we calculated edges over two variables we will say two instances are neighbours if they were close over each of the two variables
n_feats = 2
nbh_edges = nbh_edges.loc[nbh_edges.weight == n_feats]# Create the graph
nbh_nodes = list(raw_df.passengerid.unique())
nd_color = ['red' if surv == 1 else 'blue' for surv in raw_df.survived]
nbh = nx.Graph()
nbh.add_nodes_from(nbh_nodes)
nbh.add_edges_from([(src, dest) for src,dest in zip(nbh_edges.src, nbh_edges.dest)])Compute connected components
conn_comp = nx.connected_components(nbh)
dfs = list()
idx = 0
for c in conn_comp:
    dfs.append(pd.DataFrame(data={'passengerid':list(c), 'conn_comp_idx':idx}))
    idx += 1
nbh_tgt = pd.concat(dfs).sort_values('passengerid')
nbh_tgt = nbh_tgt.merge(raw_df[index_cols + tgt_col], on='passengerid', how='left')nbh_tgt.groupby('conn_comp_idx').agg({'survived': ['count', 'sum', 'mean']})| survived | |||
|---|---|---|---|
| count | sum | mean | |
| conn_comp_idx | |||
| 0 | 347 | 47 | 0.135447 | 
| 1 | 94 | 91 | 0.968085 | 
| 2 | 144 | 72 | 0.500000 | 
| 3 | 122 | 45 | 0.368852 | 
| 4 | 76 | 70 | 0.921053 | 
| 5 | 108 | 17 | 0.157407 | 
Visualization
# Create the graph visualization
plot_options = {"node_size": 10, "with_labels": False, "width": 0.15}
pos = nx.spring_layout(nbh, iterations=15, seed=1721)
fig, ax = plt.subplots(figsize=(15, 9))
ax.axis("off")
nx.draw_networkx(nbh, pos=pos, ax=ax, **plot_options)