In [1]:
# Import necessary libraries for data manipulation, visualization, and analysis
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from tqdm.notebook import tqdm
from mplsoccer import Pitch, VerticalPitch, lines
from matplotlib.colors import LinearSegmentedColormap
from scipy.ndimage import gaussian_filter
# Suppress warnings
warnings.filterwarnings("ignore")
In [2]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
name='SourceSansPro-SemiBold'
)
# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)
# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
#Load the pre created dataframes and concat them into one
stats0 = pd.read_pickle("bin_statistic2021.pkl")
stats1 = pd.read_pickle("bin_statistic2122.pkl")
stats2 = pd.read_pickle("bin_statistic2223.pkl")
stats3 = pd.read_pickle("bin_statistic2324.pkl")
stats4 = pd.read_pickle("bin_statistic2425.pkl")
In [4]:
player_stats_df = pd.concat([stats0, stats1, stats2, stats3, stats4])
In [5]:
player_stats_df
Out[5]:
player_id | player_name | team_id | season_id | statistic | team_name | |
---|---|---|---|---|---|---|
0 | 335468.0 | Nahuel Bustos | 893.0 | 1920.0 | [[0.005519596275249199, 0.006888309438353184, ... | Talleres |
1 | 303656.0 | Andrés Cubas | 893.0 | 1920.0 | [[0.0, 0.0, 0.0, 0.0, 4.442023635054891e-08, 1... | Talleres |
2 | 349761.0 | Juan Ignacio Méndez | 893.0 | 1920.0 | [[0.0003457032533587284, 0.0005420103833096798... | Talleres |
3 | 330672.0 | Facundo Medina | 893.0 | 1920.0 | [[0.00013495587774329734, 0.000167285062874743... | Talleres |
4 | 144307.0 | Juan Cruz Komar | 893.0 | 1920.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Talleres |
... | ... | ... | ... | ... | ... | ... |
9847 | 462097.0 | Sota Kitahara | 5973.0 | 2024.0 | [[0.0011733655462917167, 0.0011315123881704301... | Seattle |
9848 | 477308.0 | Adam Beaudry | 1120.0 | 2024.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Colorado |
9849 | 542900.0 | Ervin Torres | 29664.0 | 2024.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Austin FC |
9850 | 328504.0 | Emanuel Reynoso | 9293.0 | 2024.0 | [[0.0005024170892179867, 0.001031943622745739,... | Minnesota United |
9851 | 512799.0 | Cyprian Kachwele | 11134.0 | 2024.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Vancouver |
50431 rows × 6 columns
In [6]:
from sklearn.preprocessing import StandardScaler
from umap import UMAP
from sklearn.mixture import GaussianMixture
# 1. Flatten the bin statistic arrays into feature vectors
print("Preparing features from bin statistics...")
features = []
for idx, row in player_stats_df.iterrows():
# Flatten the 2D bin statistic array to a 1D feature vector
flat_array = row['statistic'].flatten()
features.append(flat_array)
Preparing features from bin statistics...
In [7]:
# 2. Create a numpy array of the features
X = np.array(features)
print(f"Created feature array with shape: {X.shape}")
Created feature array with shape: (50431, 7140)
In [8]:
# 3. Scale the data using StandardScaler
print("Scaling features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
Scaling features...
In [9]:
# 4. Apply UMAP to reduce dimensions
print("Applying UMAP for dimensionality reduction...")
umap_reducer = UMAP(min_dist=0.1, n_neighbors=50, random_state=2213)
comps = umap_reducer.fit_transform(X_scaled)
Applying UMAP for dimensionality reduction...
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
In [10]:
# 5. Use K-means which is designed for large datasets
from sklearn.metrics import calinski_harabasz_score
# Function to evaluate clusters quickly without silhouette score
def evaluate_clusters(data, max_k=30, min_k=10):
ch_scores = []
k_values = range(min_k, max_k+1)
for k in k_values:
kmeans = GaussianMixture(n_components=k, random_state=42)
labels = kmeans.fit_predict(data)
score = calinski_harabasz_score(data, labels)
ch_scores.append(score)
# Plot results
plt.figure(figsize=(16, 6))
plt.plot(k_values, ch_scores, 'bo-')
plt.title('Calinski-Harabasz Score vs. Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Calinski-Harabasz Score')
plt.grid(True)
plt.show()
return k_values, ch_scores
# Run evaluation with sample data
k_values, scores = evaluate_clusters(comps, max_k=100, min_k=2)
In [11]:
# 6. Store UMAP components in DataFrame
results_df = player_stats_df.copy()
results_df['umap_1'] = comps[:, 0]
results_df['umap_2'] = comps[:, 1]
In [12]:
# 7. Apply clustering if desired
n_clusters = 57 # Adjust as needed
kmeans = GaussianMixture(n_components=n_clusters, random_state=42)
clusters = kmeans.fit_predict(comps)
results_df['cluster'] = clusters
In [13]:
# 8. Plot the UMAP projection with cluster colors
fig, ax = plt.subplots(figsize=(16, 12))
# Create scatter plot colored by cluster
scatter = ax.scatter(
results_df['umap_1'],
results_df['umap_2'],
c=results_df['cluster'],
cmap='tab10',
s=60,
alpha=0.7
)
# Add a legend
legend1 = ax.legend(*scatter.legend_elements(),
loc="upper right", title="Clusters")
ax.add_artist(legend1)
# Set labels
ax.set_xlabel('UMAP Component 1')
ax.set_ylabel('UMAP Component 2')
ax.set_title('Player Clustering based on Spatial Distribution')
# Add grid
ax.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
In [14]:
#Checking the clusters are there
results_df
Out[14]:
player_id | player_name | team_id | season_id | statistic | team_name | umap_1 | umap_2 | cluster | |
---|---|---|---|---|---|---|---|---|---|
0 | 335468.0 | Nahuel Bustos | 893.0 | 1920.0 | [[0.005519596275249199, 0.006888309438353184, ... | Talleres | -0.546561 | -1.598811 | 16 |
1 | 303656.0 | Andrés Cubas | 893.0 | 1920.0 | [[0.0, 0.0, 0.0, 0.0, 4.442023635054891e-08, 1... | Talleres | -2.884587 | 3.906569 | 8 |
2 | 349761.0 | Juan Ignacio Méndez | 893.0 | 1920.0 | [[0.0003457032533587284, 0.0005420103833096798... | Talleres | -2.089298 | 3.745713 | 45 |
3 | 330672.0 | Facundo Medina | 893.0 | 1920.0 | [[0.00013495587774329734, 0.000167285062874743... | Talleres | 6.028358 | 9.185281 | 26 |
4 | 144307.0 | Juan Cruz Komar | 893.0 | 1920.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Talleres | 2.460149 | 9.074166 | 12 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9847 | 462097.0 | Sota Kitahara | 5973.0 | 2024.0 | [[0.0011733655462917167, 0.0011315123881704301... | Seattle | 3.851492 | 2.358335 | 32 |
9848 | 477308.0 | Adam Beaudry | 1120.0 | 2024.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Colorado | 11.430593 | 0.928354 | 17 |
9849 | 542900.0 | Ervin Torres | 29664.0 | 2024.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Austin FC | 5.299849 | 0.418871 | 35 |
9850 | 328504.0 | Emanuel Reynoso | 9293.0 | 2024.0 | [[0.0005024170892179867, 0.001031943622745739,... | Minnesota United | 4.226096 | 0.525148 | 0 |
9851 | 512799.0 | Cyprian Kachwele | 11134.0 | 2024.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Vancouver | 5.480720 | 0.415572 | 35 |
50431 rows × 9 columns
In [16]:
def visualize_player_bin_statistic(player_stats_df, player_name, season_id, team_name):
"""
Visualize a player's bin statistic heatmap on a pitch with correct orientation.
Parameters:
-----------
player_stats_df : pandas DataFrame
DataFrame containing the bin statistics
player_name : str
Name of the player to visualize
"""
# Filter by both player_name
filtered_df = player_stats_df[(player_stats_df['player_name'] == player_name) & (player_stats_df['team_name'] == team_name)
& (player_stats_df['season_id'] == season_id)]
if filtered_df.empty:
print(f"No data found for player {player_name}")
return
# Get the player's bin statistic
player_row = filtered_df.iloc[0]
player_bin_stat = player_row['statistic']
# Fix the inverted axes by flipping the bin statistic array both vertically and horizontally
player_bin_stat = np.flipud(player_bin_stat) # Fix y-axis
player_bin_stat = np.fliplr(player_bin_stat) # Fix x-axis
# Create the pitch object with the same parameters as before
pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box',
linewidth=1.25, line_color='#000000', line_zorder=2, pitch_color='#D7D1CF')
# Create the figure and axes using pitch.grid
fig, axs = pitch.grid(endnote_height=0.03, endnote_space=0, grid_width=0.88,
left=0.025, title_height=0.06, title_space=0, axis=False,
grid_height=0.86)
# Set the figure background color
fig.set_facecolor('#D7D1CF')
# Define the colormap with #D7D1CF as the lowest value
cmap = LinearSegmentedColormap.from_list('custom_cmap',
['#D7D1CF', '#FFFFFF', '#FFFF80', '#FF8000', '#800000', '#000000'],
N=256)
# Create the bin statistic again to get the proper grid format
temp_bin_stat = pitch.bin_statistic(
[50], [34], # Dummy values
statistic='count',
bins=(105, 68)
)
# Replace the statistic with our flipped player statistic
temp_bin_stat['statistic'] = player_bin_stat
# Create the heatmap
pcm = pitch.heatmap(temp_bin_stat, ax=axs['pitch'], cmap=cmap)
# Add a title
axs['pitch'].set_title(f"{player_name} Heatmap ({team_name} {season_id})", fontsize=14)
# Show the plot
plt.tight_layout()
plt.show()
return fig, axs
In [184]:
#Creating the dataframe of players from a certain cluster that play for those teams (easier to recognize and categorize their position)
C = results_df[results_df['cluster'] == 55]
X0 = C[C['team_name'].isin(['Juventus', 'Man City', 'Barcelona', 'Atalanta', 'Liverpool', 'Inter', 'AC Milan', 'Real Madrid',
'Arsenal', 'Chelsea', 'Bayern Munich', 'Man Utd', 'PSG', 'Borussia Dortmund'])]
In [185]:
#Checking the list of players
X0
Out[185]:
player_id | player_name | team_id | season_id | statistic | team_name | umap_1 | umap_2 | cluster | |
---|---|---|---|---|---|---|---|---|---|
3945 | 101955.0 | Emerson | 15.0 | 2021.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Chelsea | 6.353710 | 5.213341 | 55 |
3965 | 301455.0 | Kostas Tsimikas | 26.0 | 2021.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Liverpool | 5.903022 | 4.637537 | 55 |
4035 | 110260.0 | Sead Kolasinac | 13.0 | 2021.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Arsenal | 6.572098 | 5.546768 | 55 |
4551 | 351252.0 | Junior Firpo | 65.0 | 2021.0 | [[0.0, 0.0, 0.0, 4.1743925055426004e-06, 1.553... | Barcelona | 6.356042 | 5.374319 | 55 |
4854 | 312843.0 | Felix Passlack | 44.0 | 2021.0 | [[0.00012287498161284947, 0.000158267145004108... | Borussia Dortmund | 6.024223 | 5.189990 | 55 |
4915 | 402046.0 | Matteo Ruggeri | 300.0 | 2021.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Atalanta | 5.811248 | 4.294828 | 55 |
6477 | 106894.0 | Juan Bernat | 304.0 | 2021.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | PSG | 5.829621 | 4.365310 | 55 |
3836 | 422938.0 | Alejandro Balde | 65.0 | 2122.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Barcelona | 5.925271 | 4.543236 | 55 |
4439 | 141726.0 | Robin Gosens | 300.0 | 2122.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.3267812... | Atalanta | 6.401498 | 5.169967 | 55 |
4451 | 328968.0 | Fodé Ballo-Touré | 80.0 | 2122.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | AC Milan | 6.539957 | 5.467077 | 55 |
5295 | 328968.0 | Fodé Ballo-Touré | 80.0 | 2223.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | AC Milan | 6.510718 | 5.443084 | 55 |
5256 | 352830.0 | Mitchel Bakker | 300.0 | 2324.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Atalanta | 6.426208 | 5.324160 | 55 |
2586 | 345303.0 | Tyrell Malacia | 32.0 | 2425.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Man Utd | 6.283732 | 4.997286 | 55 |
2588 | 297403.0 | Kieran Tierney | 13.0 | 2425.0 | [[0.0, 0.0, 0.0, 0.0, 6.037287731490949e-06, 2... | Arsenal | 6.116987 | 4.759197 | 55 |
5480 | 450283.0 | Jonas Rouhi | 87.0 | 2425.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Juventus | 5.869138 | 4.780038 | 55 |
In [181]:
# Example usage of the visualization to know which position we are looking at:
visualize_player_bin_statistic(player_stats_df, 'João Félix', 2425, 'AC Milan')
Out[181]:
(<Figure size 1307.75x900 with 3 Axes>, {'pitch': <Axes: title={'center': 'João Félix Heatmap (AC Milan 2425)'}>, 'title': <Axes: >, 'endnote': <Axes: >})
In [ ]:
In [ ]:
In [ ]:
In [186]:
# Define the mapping of old labels to new labels given my interpretation of the heatmaps
label_mapping = {
0: 'Undefined',
1: 'DM',
2: 'LWB',
3: 'GK',
4: 'RCB',
5: 'AML',
6: 'AWR',
7: 'LCB',
8: 'DM',
9: 'RWB',
10: 'AM',
11: 'GK',
12: 'RCB',
13: 'AWL',
14: 'CB',
15: 'LWB',
16: 'ST',
17: 'GK',
18: 'SS',
19: 'LCB',
20: 'AMR',
21: 'AWR',
22: 'AMR',
23: 'Undefined',
24: 'AWL',
25: 'HB',
26: 'LCB',
27: 'GK',
28: 'RCB',
29: 'Undefined',
30: 'AM',
31: 'CB',
32: 'Undefined',
33: 'SS',
34: 'RWB',
35: 'Undefined',
36: 'LCB',
37: 'CM',
38: 'GK',
39: 'RCB',
40: 'CM',
41: 'ST',
42: 'SS',
43: 'RWB',
44: 'RCB',
45: 'DM',
46: 'AM',
47: 'CM',
48: 'AWR',
49: 'ST',
50: 'GK',
51: 'Undefined',
52: 'DM',
53: 'AML',
54: 'RWB',
55: 'LWB',
56: 'LCB'
}
In [187]:
# Map the labels directly
results_df['cluster'] = results_df['cluster'].map(label_mapping)
In [188]:
#Creating the final table with players and positions
position = results_df.groupby(["player_name", "player_id", "team_name", "season_id"], observed=True)['cluster'].unique().reset_index(name='position')
position['position'] = position['position'].astype(str)
position['position'] = position['position'].str.strip("['")
position['position'] = position['position'].str.strip("']")
In [189]:
#Check for unique positions
position.position.unique()
Out[189]:
array(['CB', 'RWB', 'CM', 'LCB', 'ST', 'RCB', 'Undefined', 'SS', 'AWL', 'GK', 'AML', 'LWB', 'DM', 'AMR', 'AWR', 'AM', 'HB'], dtype=object)
In [191]:
# Define the position group function
def map_position_group(pos):
if pos in ['CB', 'LCB', 'RCB']:
return 'CB'
elif pos in ['RWB', 'LWB']:
return 'WB'
elif pos in ['CM', 'DM', 'HB']:
return 'CDM'
elif pos in ['AWL', 'AML', 'AMR', 'AWR', 'AM']:
return 'AMW'
elif pos in ['ST', 'SS']:
return 'ST'
elif pos in ['GK']:
return 'GK'
else:
return 'Undefined'
# Create a new column for position groups
position['position_group'] = position['position'].apply(map_position_group)
position.position_group.unique()
Out[191]:
array(['CB', 'WB', 'CDM', 'ST', 'Undefined', 'AMW', 'GK'], dtype=object)
In [192]:
#Showing the table
position
Out[192]:
player_name | player_id | team_name | season_id | position | position_group | |
---|---|---|---|---|---|---|
0 | AJ Delagarza JR | 72638.0 | Inter Miami CF | 2020.0 | CB | CB |
1 | AJ Delagarza JR | 72638.0 | New England | 2021.0 | RWB | WB |
2 | AJ Delagarza JR | 72638.0 | New England | 2022.0 | RWB | WB |
3 | Aapo Halme | 202262.0 | Barnsley | 2021.0 | CM | CDM |
4 | Aapo Halme | 202262.0 | Barnsley | 2122.0 | CB | CB |
... | ... | ... | ... | ... | ... | ... |
50420 | Úmaro Embaló | 455568.0 | Rio Ave | 2324.0 | LWB | WB |
50421 | Úmaro Embaló | 455568.0 | Vitoria de Guimaraes | 2425.0 | DM | CDM |
50422 | Übeyd Adiyaman | 136467.0 | Genclerbirligi | 2021.0 | GK | GK |
50423 | Ümit Akdag | 438512.0 | Alanyaspor | 2223.0 | Undefined | Undefined |
50424 | Ümit Akdag | 438512.0 | Toulouse | 2425.0 | LCB | CB |
50425 rows × 6 columns
In [193]:
#Saving the table
position.to_csv("clustered_position.csv")
In [ ]: