In [1]:
# Import necessary libraries for data manipulation, visualization, and analysis
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from tqdm.notebook import tqdm
from mplsoccer import Pitch, VerticalPitch, lines
from matplotlib.colors import LinearSegmentedColormap
from scipy.ndimage import gaussian_filter

# Suppress warnings
warnings.filterwarnings("ignore")
In [2]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
    name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
    name='SourceSansPro-SemiBold'
)

# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)

# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
#Load the pre created dataframes and concat them into one
stats0 = pd.read_pickle("bin_statistic2021.pkl")
stats1 = pd.read_pickle("bin_statistic2122.pkl")
stats2 = pd.read_pickle("bin_statistic2223.pkl")
stats3 = pd.read_pickle("bin_statistic2324.pkl")
stats4 = pd.read_pickle("bin_statistic2425.pkl")
In [4]:
player_stats_df = pd.concat([stats0, stats1, stats2, stats3, stats4])
In [5]:
player_stats_df
Out[5]:
player_id player_name team_id season_id statistic team_name
0 335468.0 Nahuel Bustos 893.0 1920.0 [[0.005519596275249199, 0.006888309438353184, ... Talleres
1 303656.0 Andrés Cubas 893.0 1920.0 [[0.0, 0.0, 0.0, 0.0, 4.442023635054891e-08, 1... Talleres
2 349761.0 Juan Ignacio Méndez 893.0 1920.0 [[0.0003457032533587284, 0.0005420103833096798... Talleres
3 330672.0 Facundo Medina 893.0 1920.0 [[0.00013495587774329734, 0.000167285062874743... Talleres
4 144307.0 Juan Cruz Komar 893.0 1920.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Talleres
... ... ... ... ... ... ...
9847 462097.0 Sota Kitahara 5973.0 2024.0 [[0.0011733655462917167, 0.0011315123881704301... Seattle
9848 477308.0 Adam Beaudry 1120.0 2024.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Colorado
9849 542900.0 Ervin Torres 29664.0 2024.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Austin FC
9850 328504.0 Emanuel Reynoso 9293.0 2024.0 [[0.0005024170892179867, 0.001031943622745739,... Minnesota United
9851 512799.0 Cyprian Kachwele 11134.0 2024.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Vancouver

50431 rows × 6 columns

In [6]:
from sklearn.preprocessing import StandardScaler
from umap import UMAP
from sklearn.mixture import GaussianMixture

# 1. Flatten the bin statistic arrays into feature vectors
print("Preparing features from bin statistics...")
features = []
for idx, row in player_stats_df.iterrows():
    # Flatten the 2D bin statistic array to a 1D feature vector
    flat_array = row['statistic'].flatten()
    features.append(flat_array)
Preparing features from bin statistics...
In [7]:
# 2. Create a numpy array of the features
X = np.array(features)
print(f"Created feature array with shape: {X.shape}")
Created feature array with shape: (50431, 7140)
In [8]:
# 3. Scale the data using StandardScaler
print("Scaling features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
Scaling features...
In [9]:
# 4. Apply UMAP to reduce dimensions
print("Applying UMAP for dimensionality reduction...")
umap_reducer = UMAP(min_dist=0.1, n_neighbors=50, random_state=2213)
comps = umap_reducer.fit_transform(X_scaled)
Applying UMAP for dimensionality reduction...
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
In [10]:
# 5. Use K-means which is designed for large datasets
from sklearn.metrics import calinski_harabasz_score

# Function to evaluate clusters quickly without silhouette score
def evaluate_clusters(data, max_k=30, min_k=10):
    ch_scores = []
    k_values = range(min_k, max_k+1)
    
    for k in k_values:
        kmeans = GaussianMixture(n_components=k, random_state=42)
        labels = kmeans.fit_predict(data)
        score = calinski_harabasz_score(data, labels)
        ch_scores.append(score)
    
    # Plot results
    plt.figure(figsize=(16, 6))
    plt.plot(k_values, ch_scores, 'bo-')
    plt.title('Calinski-Harabasz Score vs. Number of Clusters')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Calinski-Harabasz Score')
    plt.grid(True)
    plt.show()
    
    return k_values, ch_scores

# Run evaluation with sample data
k_values, scores = evaluate_clusters(comps, max_k=100, min_k=2)
No description has been provided for this image
In [11]:
# 6. Store UMAP components in DataFrame
results_df = player_stats_df.copy()
results_df['umap_1'] = comps[:, 0]
results_df['umap_2'] = comps[:, 1]
In [12]:
# 7. Apply clustering if desired
n_clusters = 57  # Adjust as needed
kmeans = GaussianMixture(n_components=n_clusters, random_state=42)
clusters = kmeans.fit_predict(comps)
results_df['cluster'] = clusters
In [13]:
# 8. Plot the UMAP projection with cluster colors
fig, ax = plt.subplots(figsize=(16, 12))

# Create scatter plot colored by cluster
scatter = ax.scatter(
    results_df['umap_1'], 
    results_df['umap_2'], 
    c=results_df['cluster'],
    cmap='tab10',
    s=60,
    alpha=0.7
)

# Add a legend
legend1 = ax.legend(*scatter.legend_elements(),
                    loc="upper right", title="Clusters")
ax.add_artist(legend1)

# Set labels
ax.set_xlabel('UMAP Component 1')
ax.set_ylabel('UMAP Component 2')
ax.set_title('Player Clustering based on Spatial Distribution')

# Add grid
ax.grid(True, linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [14]:
#Checking the clusters are there
results_df
Out[14]:
player_id player_name team_id season_id statistic team_name umap_1 umap_2 cluster
0 335468.0 Nahuel Bustos 893.0 1920.0 [[0.005519596275249199, 0.006888309438353184, ... Talleres -0.546561 -1.598811 16
1 303656.0 Andrés Cubas 893.0 1920.0 [[0.0, 0.0, 0.0, 0.0, 4.442023635054891e-08, 1... Talleres -2.884587 3.906569 8
2 349761.0 Juan Ignacio Méndez 893.0 1920.0 [[0.0003457032533587284, 0.0005420103833096798... Talleres -2.089298 3.745713 45
3 330672.0 Facundo Medina 893.0 1920.0 [[0.00013495587774329734, 0.000167285062874743... Talleres 6.028358 9.185281 26
4 144307.0 Juan Cruz Komar 893.0 1920.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Talleres 2.460149 9.074166 12
... ... ... ... ... ... ... ... ... ...
9847 462097.0 Sota Kitahara 5973.0 2024.0 [[0.0011733655462917167, 0.0011315123881704301... Seattle 3.851492 2.358335 32
9848 477308.0 Adam Beaudry 1120.0 2024.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Colorado 11.430593 0.928354 17
9849 542900.0 Ervin Torres 29664.0 2024.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Austin FC 5.299849 0.418871 35
9850 328504.0 Emanuel Reynoso 9293.0 2024.0 [[0.0005024170892179867, 0.001031943622745739,... Minnesota United 4.226096 0.525148 0
9851 512799.0 Cyprian Kachwele 11134.0 2024.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Vancouver 5.480720 0.415572 35

50431 rows × 9 columns

In [16]:
def visualize_player_bin_statistic(player_stats_df, player_name, season_id, team_name):
    """
    Visualize a player's bin statistic heatmap on a pitch with correct orientation.
    
    Parameters:
    -----------
    player_stats_df : pandas DataFrame
        DataFrame containing the bin statistics
    player_name : str
        Name of the player to visualize
    """
    # Filter by both player_name
    filtered_df = player_stats_df[(player_stats_df['player_name'] == player_name) & (player_stats_df['team_name'] == team_name)
                                   & (player_stats_df['season_id'] == season_id)]
    
    if filtered_df.empty:
        print(f"No data found for player {player_name}")
        return
    
    # Get the player's bin statistic
    player_row = filtered_df.iloc[0]
    player_bin_stat = player_row['statistic']
    
    # Fix the inverted axes by flipping the bin statistic array both vertically and horizontally
    player_bin_stat = np.flipud(player_bin_stat)  # Fix y-axis
    player_bin_stat = np.fliplr(player_bin_stat)  # Fix x-axis
    
    # Create the pitch object with the same parameters as before
    pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box', 
                 linewidth=1.25, line_color='#000000', line_zorder=2, pitch_color='#D7D1CF')
    
    # Create the figure and axes using pitch.grid
    fig, axs = pitch.grid(endnote_height=0.03, endnote_space=0, grid_width=0.88, 
                          left=0.025, title_height=0.06, title_space=0, axis=False,
                          grid_height=0.86)
    
    # Set the figure background color
    fig.set_facecolor('#D7D1CF')
    
    # Define the colormap with #D7D1CF as the lowest value
    cmap = LinearSegmentedColormap.from_list('custom_cmap', 
                                           ['#D7D1CF', '#FFFFFF', '#FFFF80', '#FF8000', '#800000', '#000000'], 
                                           N=256)
    
    # Create the bin statistic again to get the proper grid format
    temp_bin_stat = pitch.bin_statistic(
        [50], [34],  # Dummy values
        statistic='count', 
        bins=(105, 68)
    )
    
    # Replace the statistic with our flipped player statistic
    temp_bin_stat['statistic'] = player_bin_stat
    
    # Create the heatmap
    pcm = pitch.heatmap(temp_bin_stat, ax=axs['pitch'], cmap=cmap)
    
    # Add a title
    axs['pitch'].set_title(f"{player_name} Heatmap ({team_name} {season_id})", fontsize=14)
    
    # Show the plot
    plt.tight_layout()
    plt.show()
    
    return fig, axs
In [184]:
#Creating the dataframe of players from a certain cluster that play for those teams (easier to recognize and categorize their position)
C = results_df[results_df['cluster'] == 55]
X0 = C[C['team_name'].isin(['Juventus', 'Man City', 'Barcelona', 'Atalanta', 'Liverpool', 'Inter', 'AC Milan', 'Real Madrid', 
                            'Arsenal', 'Chelsea', 'Bayern Munich', 'Man Utd', 'PSG', 'Borussia Dortmund'])]
In [185]:
#Checking the list of players
X0
Out[185]:
player_id player_name team_id season_id statistic team_name umap_1 umap_2 cluster
3945 101955.0 Emerson 15.0 2021.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Chelsea 6.353710 5.213341 55
3965 301455.0 Kostas Tsimikas 26.0 2021.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Liverpool 5.903022 4.637537 55
4035 110260.0 Sead Kolasinac 13.0 2021.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Arsenal 6.572098 5.546768 55
4551 351252.0 Junior Firpo 65.0 2021.0 [[0.0, 0.0, 0.0, 4.1743925055426004e-06, 1.553... Barcelona 6.356042 5.374319 55
4854 312843.0 Felix Passlack 44.0 2021.0 [[0.00012287498161284947, 0.000158267145004108... Borussia Dortmund 6.024223 5.189990 55
4915 402046.0 Matteo Ruggeri 300.0 2021.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Atalanta 5.811248 4.294828 55
6477 106894.0 Juan Bernat 304.0 2021.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... PSG 5.829621 4.365310 55
3836 422938.0 Alejandro Balde 65.0 2122.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Barcelona 5.925271 4.543236 55
4439 141726.0 Robin Gosens 300.0 2122.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.3267812... Atalanta 6.401498 5.169967 55
4451 328968.0 Fodé Ballo-Touré 80.0 2122.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... AC Milan 6.539957 5.467077 55
5295 328968.0 Fodé Ballo-Touré 80.0 2223.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... AC Milan 6.510718 5.443084 55
5256 352830.0 Mitchel Bakker 300.0 2324.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Atalanta 6.426208 5.324160 55
2586 345303.0 Tyrell Malacia 32.0 2425.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Man Utd 6.283732 4.997286 55
2588 297403.0 Kieran Tierney 13.0 2425.0 [[0.0, 0.0, 0.0, 0.0, 6.037287731490949e-06, 2... Arsenal 6.116987 4.759197 55
5480 450283.0 Jonas Rouhi 87.0 2425.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Juventus 5.869138 4.780038 55
In [181]:
# Example usage of the visualization to know which position we are looking at:
visualize_player_bin_statistic(player_stats_df, 'João Félix', 2425, 'AC Milan')
No description has been provided for this image
Out[181]:
(<Figure size 1307.75x900 with 3 Axes>,
 {'pitch': <Axes: title={'center': 'João Félix Heatmap (AC Milan 2425)'}>,
  'title': <Axes: >,
  'endnote': <Axes: >})
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [186]:
# Define the mapping of old labels to new labels given my interpretation of the heatmaps
label_mapping = {
    0: 'Undefined',
    1: 'DM',
    2: 'LWB', 
    3: 'GK',
    4: 'RCB',
    5: 'AML',
    6: 'AWR',
    7: 'LCB',
    8: 'DM',
    9: 'RWB',
    10: 'AM',
    11: 'GK',
    12: 'RCB',
    13: 'AWL', 
    14: 'CB',
    15: 'LWB',
    16: 'ST',
    17: 'GK',
    18: 'SS',
    19: 'LCB',
    20: 'AMR',
    21: 'AWR',
    22: 'AMR',
    23: 'Undefined',
    24: 'AWL',
    25: 'HB',
    26: 'LCB',
    27: 'GK',
    28: 'RCB',
    29: 'Undefined', 
    30: 'AM',
    31: 'CB',
    32: 'Undefined',
    33: 'SS',
    34: 'RWB',
    35: 'Undefined',
    36: 'LCB',
    37: 'CM',
    38: 'GK',
    39: 'RCB',
    40: 'CM', 
    41: 'ST',
    42: 'SS',
    43: 'RWB',
    44: 'RCB',
    45: 'DM',
    46: 'AM',
    47: 'CM', 
    48: 'AWR',
    49: 'ST',
    50: 'GK',
    51: 'Undefined',
    52: 'DM',
    53: 'AML',
    54: 'RWB',
    55: 'LWB',
    56: 'LCB'
}
In [187]:
# Map the labels directly
results_df['cluster'] = results_df['cluster'].map(label_mapping)
In [188]:
#Creating the final table with players and positions
position = results_df.groupby(["player_name", "player_id", "team_name", "season_id"], observed=True)['cluster'].unique().reset_index(name='position')
position['position'] = position['position'].astype(str)
position['position'] = position['position'].str.strip("['")
position['position'] = position['position'].str.strip("']")
In [189]:
#Check for unique positions
position.position.unique()
Out[189]:
array(['CB', 'RWB', 'CM', 'LCB', 'ST', 'RCB', 'Undefined', 'SS', 'AWL',
       'GK', 'AML', 'LWB', 'DM', 'AMR', 'AWR', 'AM', 'HB'], dtype=object)
In [191]:
# Define the position group function
def map_position_group(pos):
    if pos in ['CB', 'LCB', 'RCB']:
        return 'CB'
    elif pos in ['RWB', 'LWB']:
        return 'WB'
    elif pos in ['CM', 'DM', 'HB']:
        return 'CDM'
    elif pos in ['AWL', 'AML', 'AMR', 'AWR', 'AM']:
        return 'AMW'
    elif pos in ['ST', 'SS']:
        return 'ST'
    elif pos in ['GK']:
        return 'GK'
    else:
        return 'Undefined'

# Create a new column for position groups
position['position_group'] = position['position'].apply(map_position_group)
position.position_group.unique()
Out[191]:
array(['CB', 'WB', 'CDM', 'ST', 'Undefined', 'AMW', 'GK'], dtype=object)
In [192]:
#Showing the table
position
Out[192]:
player_name player_id team_name season_id position position_group
0 AJ Delagarza JR 72638.0 Inter Miami CF 2020.0 CB CB
1 AJ Delagarza JR 72638.0 New England 2021.0 RWB WB
2 AJ Delagarza JR 72638.0 New England 2022.0 RWB WB
3 Aapo Halme 202262.0 Barnsley 2021.0 CM CDM
4 Aapo Halme 202262.0 Barnsley 2122.0 CB CB
... ... ... ... ... ... ...
50420 Úmaro Embaló 455568.0 Rio Ave 2324.0 LWB WB
50421 Úmaro Embaló 455568.0 Vitoria de Guimaraes 2425.0 DM CDM
50422 Übeyd Adiyaman 136467.0 Genclerbirligi 2021.0 GK GK
50423 Ümit Akdag 438512.0 Alanyaspor 2223.0 Undefined Undefined
50424 Ümit Akdag 438512.0 Toulouse 2425.0 LCB CB

50425 rows × 6 columns

In [193]:
#Saving the table
position.to_csv("clustered_position.csv")
In [ ]: