In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib.colors as mcolors
import matplotlib.patheffects as path_effects
import seaborn as sns
from tqdm.auto import tqdm
from mplsoccer import Pitch, VerticalPitch, lines
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from PIL import Image
import urllib
import time
from joblib import Parallel, delayed, parallel_backend
import socceraction
import socceraction.atomic.spadl as atomicspadl
import socceraction.spadl as spadl
import multiprocessing
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import warnings
# Suppress unnecessary warnings
warnings.filterwarnings('ignore')
In [2]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
name='SourceSansPro-SemiBold'
)
# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)
# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
season = 2425
In [4]:
# Load light datasets from CSV files
VAEP = pd.read_csv("aVAEPactions.csv", index_col = 0)
xP = pd.read_csv("xPactions.csv", index_col = 0)
fotmob = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
positions = pd.read_csv("clustered_position.csv", index_col = 0)
In [5]:
# Load events datasets from CSV files
aactions = pd.read_csv(f"atomic_actions{season}.csv", index_col = 0)
actions = pd.read_csv(f"actions{season}.csv", index_col = 0)
In [6]:
# Load datasets that will be modified below from CSV files
players = pd.read_csv(f"players{season}.csv", index_col = 0)
players_info = players[['game_id', 'team_id', 'player_id', 'competition_id', 'season_id']]
recoveries = pd.read_csv(f"recoveries_id{season}.csv", index_col = 0)
recoveries.rename(columns = {'event_id':'original_event_id'}, inplace = True)
In [7]:
# Add descriptive action names to the atomic actions DataFrame
aactions = atomicspadl.add_names(aactions)
In [8]:
# Add descriptive action names to the actions DataFrame
actions = spadl.add_names(actions)
In [9]:
# Merge datasets to create a unified DataFrame
dfa = (
aactions
.merge(players_info, how="left")
.merge(fotmob, how="left")
.merge(VAEP, how="left"))
In [10]:
# Merge datasets to create a unified DataFrame
dfb = (
actions
.merge(players_info, how="left")
.merge(fotmob, how="left")
.merge(xP, how="left")
.merge(positions, how="left"))
In [11]:
#Adding to the dataframe the features needed to define progressive actions (distance to the goal centre before/after the action, the absolute change in that distance, and the action direction)
dfb["beginning_distance"] = np.sqrt(np.square(105-dfb['start_x_a0']) + np.square(34-dfb['start_y_a0'])).round(2)
dfb["end_distance"] = np.sqrt(np.square(105-dfb['end_x_a0']) + np.square(34-dfb['end_y_a0'])).round(2)
dfb["length"] = dfb["end_distance"] - dfb["beginning_distance"]
dfb['length'] = dfb['length'].abs()
dfb["angle"] = np.arctan2(dfb["end_y_a0"] - dfb["start_y_a0"], dfb["end_x_a0"] - dfb["start_x_a0"])
dfb['angle_degrees'] = np.degrees(dfb['angle']) % 360
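For orientation, a quick sanity check (not part of the original notebook) of what the arctan2-plus-modulo convention yields on the SPADL pitch, where x runs toward the opposition goal: passes straight toward goal sit at 0°, square passes at 90°/270°, and backward passes at 180°, which is what the progressive-pass angle window below relies on.
# Hedged example: pass directions under the angle convention above
print(np.degrees(np.arctan2(0, 10)) % 360)    # straight toward goal -> 0.0
print(np.degrees(np.arctan2(10, 0)) % 360)    # square pass -> 90.0
print(np.degrees(np.arctan2(0, -10)) % 360)   # straight backwards -> 180.0
print(np.degrees(np.arctan2(-10, 0)) % 360)   # square pass the other way -> 270.0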
In [12]:
#Adding the previous action's type and result, used for filtering later on
dfb["prev_type_name"] = dfb.shift(+1, fill_value=0)["type_name"]
dfb["prev_result_name"] = dfb.shift(+1, fill_value=0)["result_name"]
In [13]:
#Flagging progressive actions with a custom definition: passes that do not start inside the box,
#are not backwards or horizontal, and move the ball at least 17.5% (and more than 5 m) closer to the centre of the goal
dfb['progressive'] = np.where(
((dfb['beginning_distance'] - dfb['end_distance']) / dfb['beginning_distance'] >= 0.175) & (dfb['length'] > 5) &
(((dfb['angle_degrees'] >= 0) & (dfb['angle_degrees'] <= 60)) | ((dfb['angle_degrees'] >= 260) & (dfb['angle_degrees'] <= 360))) &
~((dfb['start_x_a0'] >= 88.5) & (dfb['start_y_a0'] >= 13.885) & (dfb['start_y_a0'] <= 54.115)),
True, False)
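To make the flag concrete, a self-contained worked example with hypothetical coordinates (np.hypot is equivalent to the sqrt-of-squares used above); the thresholds match the cell:
# Hypothetical pass from near the halfway line to the edge of the box
start_x, start_y, end_x, end_y = 55.0, 34.0, 80.0, 30.0
beginning_distance = np.hypot(105 - start_x, 34 - start_y)   # distance to the goal centre before the pass
end_distance = np.hypot(105 - end_x, 34 - end_y)             # distance to the goal centre after the pass
gain = (beginning_distance - end_distance) / beginning_distance
angle_degrees = np.degrees(np.arctan2(end_y - start_y, end_x - start_x)) % 360
starts_in_box = (start_x >= 88.5) and (13.885 <= start_y <= 54.115)
is_progressive = (
gain >= 0.175
and abs(end_distance - beginning_distance) > 5
and (angle_degrees <= 60 or 260 <= angle_degrees <= 360)
and not starts_in_box
)
print(round(gain, 3), round(angle_degrees, 1), is_progressive)   # 0.494 350.9 True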
In [14]:
# Function to format season ID into a readable format
def format_season_id(season_id):
# Convert to integer if it's a float
season_id = int(season_id)
# Extract the last two digits of the year
start_year = str(season_id -1)[-2:]
# Calculate the end year
end_year = str(season_id)[-2:]
# Format as 20/21
formatted_season = f"{start_year}/{end_year}"
return formatted_season
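A quick usage check of the helper, assuming season_id stores the season's end year as the function's comments imply:
print(format_season_id(2021))    # '20/21'
print(format_season_id(2025.0))  # floats are accepted too -> '24/25'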
In [15]:
#Keeping only passes in both dataframes
df1a = dfa[dfa["type_name"] == 'pass']
df1b = dfb[dfb["type_name"] == 'pass']
#Creating an outcome column, then computing PAx as completed (1) or not (0) minus the completion probability (xP)
df1b['outcome'] = np.where((df1b["result_name"] == 'success'), True, False)
df1b['PAx'] = df1b['outcome'] - df1b['xP']
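A tiny numeric illustration of PAx (passes above expected): each pass contributes its outcome minus its completion probability, so completing a difficult pass is rewarded most and failing an easy one is punished most.
# Hedged example with a hypothetical xP of 0.78
completed, failed = 1 - 0.78, 0 - 0.78
print(round(completed, 2), round(failed, 2))   # 0.22 -0.78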
In [16]:
#Keeping a slim dataframe from each filtered dataframe for the merges later on
fa = df1a.filter(items=['game_id', 'original_event_id', 'team_id', 'vaep_value'])
fb = df1b.filter(items=['game_id', 'original_event_id', 'result_name', 'team_id', 'xP', 'PAx'])
In [17]:
#Casting the original event id column to int in all dataframes, which is needed for merging
fa['original_event_id'] = fa['original_event_id'].astype(int)
fb['original_event_id'] = fb['original_event_id'].astype(int)
df1a['original_event_id'] = df1a['original_event_id'].astype(int)
df1b['original_event_id'] = df1b['original_event_id'].astype(int)
In [18]:
#Actual merging operations to define our dataframe
df = df1b.merge(fa, how="left")
df = df[df['season_id'].notnull()]
In [19]:
# Apply the function to the 'season_id' column
df['formatted_season'] = df['season_id'].apply(format_season_id)
In [20]:
#Exploring the unique position groups to decide which one to filter on
df.position_group.unique()
Out[20]:
array(['CDM', 'AMW', 'WB', 'CB', 'ST', 'GK', 'Undefined'], dtype=object)
In [21]:
#Eliminating non-footed passes, passes at the start of a half, and kick-offs after goals (play restarts)
df = df[df["bodypart_name"] == 'foot']
df = df[df["time_seconds"] != 0.0]
df = df[~((df["prev_type_name"] == 'shot') & (df["prev_result_name"] == 'success')) & (df["prev_result_name"] != 'owngoal')]
In [22]:
#Selecting player position and filtering the dataframe
dfx = df[df["position_group"] == 'AMW']
In [23]:
#Selecting features to consider in the clustering process
dfc = dfx.filter(items=['player_name', 'start_x_a0', 'start_y_a0', 'end_x_a0', 'end_y_a0', 'length', 'angle'])
X = dfc[['start_x_a0', 'start_y_a0', 'end_x_a0', 'end_y_a0', 'length', 'angle']].values
scaled_X = StandardScaler().fit_transform(X)
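One optional refinement, sketched under the assumption that you want to describe clusters in pitch units later: keep the fitted StandardScaler around so cluster centroids can be mapped back from standardized space. The cell above discards the scaler, so this is an alternative, not what the notebook does.
# Hypothetical variant: retain the fitted scaler for later inverse-transforms
scaler = StandardScaler().fit(X)
scaled_X_alt = scaler.transform(X)   # same values as the fit_transform above
# after clustering, e.g.:
# centroids_pitch = scaler.inverse_transform(final_kmeans.cluster_centers_)
# -> centroid columns back in start/end coordinate, length and angle units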
In [24]:
len(scaled_X)
Out[24]:
803913
In [25]:
print("Starting coherence-focused clustering for football pass data...")
print(f"Input data dimensions: {scaled_X.shape}")
start_total_time = time.time()
# Step 1: Create a representative sample for initial testing
print("\n--- STEP 1: Creating Representative Sample ---")
start_time = time.time()
# Take a larger sample to better represent the full range of pass types
sample_size = min(round(len(scaled_X)*0.5), 50000) # Lower cap to focus on quality over quantity
print(f"Taking a representative sample of {sample_size} points for initial testing")
# Take a simple random sample (a stratified variant is sketched after this cell)
if scaled_X.shape[0] > sample_size:
sample_indices = np.random.choice(scaled_X.shape[0], size=sample_size, replace=False)
X_sample = scaled_X[sample_indices]
else:
X_sample = scaled_X
sample_indices = np.arange(scaled_X.shape[0])
print(f"Sampling completed in {time.time() - start_time:.2f}s")
Starting coherence-focused clustering for football pass data...
Input data dimensions: (803913, 6)

--- STEP 1: Creating Representative Sample ---
Taking a representative sample of 50000 points for initial testing
Sampling completed in 0.01s
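The cell above falls back to simple random sampling. As a hedged sketch of the stratified alternative mentioned in its comment (assuming dfc['length'] has no missing values), the sample could be drawn per decile of pass length so that rarer long passes keep their share:
# Sketch: stratify the sample by binned pass length
rng = np.random.default_rng(42)
length_bins = pd.qcut(dfc['length'], q=10, labels=False, duplicates='drop').to_numpy()
strat_indices = np.concatenate([
rng.choice(np.where(length_bins == b)[0],
size=max(1, int(sample_size * (length_bins == b).mean())),
replace=False)
for b in np.unique(length_bins)
])
X_sample_strat = scaled_X[strat_indices]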
In [26]:
# Step 2: Find optimal number of clusters with focus on coherence
print("\n--- STEP 2: Finding Optimal Number of Clusters Focused on Coherence ---")
start_time = time.time()
# Define a wide range for football pass patterns - large range to find best coherence
min_clusters = 100 # Start higher to avoid under-clustering
max_clusters = 500 # Allow for many clusters to achieve high coherence
step_size = 2 # Larger initial steps to cover the range efficiently
cluster_range = list(range(min_clusters, max_clusters + 1, step_size))
print(f"Testing {len(cluster_range)} different cluster counts from {min_clusters} to {max_clusters}")
# Evaluation function prioritizing within-cluster coherence
def evaluate_kmeans(n_clusters, X):
start = time.time()
# For coherence, we need stricter parameters
kmeans = MiniBatchKMeans(
n_clusters=n_clusters,
batch_size=1024, # Smaller batch size for more precise clusters
init='k-means++',
max_iter=500, # More iterations to ensure convergence
n_init=10, # More initializations to find cohesive clusters
random_state=42,
tol=1e-5 # Tighter tolerance for better convergence
)
# Fit model
kmeans.fit(X)
labels = kmeans.labels_
# Calculate cluster stats
sizes = np.bincount(labels)
min_size = sizes.min()
max_size = sizes.max()
mean_size = sizes.mean()
# Calculate size distribution stats - smaller clusters allowed for higher coherence
size_std = np.std(sizes)
size_cv = size_std / mean_size # Coefficient of variation
# Calculate inertia (key for coherence - lower means more cohesive clusters)
inertia = kmeans.inertia_
# Calculate normalized inertia (per cluster to account for different cluster counts)
normalized_inertia = inertia / n_clusters
# Calculate silhouette score (and others)
sil_score = None
ch_score = None
db_score = None
try:
# For large datasets, calculate metrics on a subsample
if X.shape[0] > 10000:
subsample_size = min(10000, len(X)//10)
subsample_indices = np.random.choice(X.shape[0], size=subsample_size, replace=False)
sil_score = silhouette_score(X[subsample_indices], labels[subsample_indices])
# Skip CH and DB - they tend to favor fewer clusters, which reduces coherence
else:
sil_score = silhouette_score(X, labels)
except Exception as e:
print(f"Error calculating scores for k={n_clusters}: {str(e)}")
# Calculate a coherence-focused score (higher = better)
coherence_score = None
if sil_score is not None:
# This formula explicitly prioritizes:
# 1. Silhouette score (measures how similar points are to their own cluster vs other clusters)
# 2. Lower normalized inertia (tighter clusters)
# 3. Avoids extremely small clusters (tiny clusters may not be meaningful)
# Normalize the inertia to 0-1 range (estimated based on typical values)
norm_inertia_factor = 1.0 - min(1.0, normalized_inertia / 1000)
# Check for too-small clusters (penalize if smallest cluster is too tiny)
small_cluster_penalty = 0.0
if min_size < 5: # Penalize clusters with fewer than 5 points
small_cluster_penalty = 0.2
coherence_score = (
0.7 * sil_score + # Silhouette heavily weighted - direct measure of coherence
0.3 * norm_inertia_factor - # Lower inertia per cluster = more coherent
small_cluster_penalty # Penalty for extremely small clusters
)
elapsed = time.time() - start
return {
'n_clusters': n_clusters,
'model': kmeans,
'labels': labels,
'min_cluster_size': min_size,
'max_cluster_size': max_size,
'mean_cluster_size': mean_size,
'size_cv': size_cv,
'silhouette': sil_score,
'calinski_harabasz': ch_score,
'davies_bouldin': db_score,
'normalized_inertia': normalized_inertia,
'coherence_score': coherence_score,
'inertia': inertia,
'runtime': elapsed
}
# Parallelize the evaluation - cap at 10 workers (this machine has 12 cores)
n_cpus = multiprocessing.cpu_count()
n_jobs = max(1, min(n_cpus - 1, 10))
print(f"Using {n_jobs} CPU cores for parallel processing")
# Run evaluations in parallel
with parallel_backend('loky', n_jobs=n_jobs):
results = Parallel(verbose=1)(
delayed(evaluate_kmeans)(k, X_sample) for k in cluster_range
)
# Create results dictionary
cluster_results = {r['n_clusters']: r for r in results}
print(f"\nInitial evaluation completed in {time.time() - start_time:.2f}s")
--- STEP 2: Finding Optimal Number of Clusters Focused on Coherence ---
Testing 201 different cluster counts from 100 to 500
Using 10 CPU cores for parallel processing
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done 30 tasks | elapsed: 3.0s
[Parallel(n_jobs=10)]: Done 180 tasks | elapsed: 15.1s
Initial evaluation completed in 17.28s
[Parallel(n_jobs=10)]: Done 201 out of 201 | elapsed: 17.3s finished
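To make the combined metric concrete, a small worked example of the coherence score defined in evaluate_kmeans, using made-up inputs (the real values come from each fitted model):
# Hedged worked example of the coherence score
sil_example = 0.20                  # hypothetical silhouette on the subsample
normalized_inertia_example = 400.0  # hypothetical inertia per cluster
min_size_example = 12               # hypothetical smallest cluster size
norm_inertia_factor = 1.0 - min(1.0, normalized_inertia_example / 1000)   # -> 0.6
penalty = 0.2 if min_size_example < 5 else 0.0                            # -> 0.0
print(round(0.7 * sil_example + 0.3 * norm_inertia_factor - penalty, 3))  # -> 0.32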
In [27]:
# Step 3: Visualize evaluation metrics focused on coherence
print("\n--- STEP 3: Visualizing Coherence Metrics ---")
start_time = time.time()
# Extract results for plotting
cluster_counts = sorted(cluster_results.keys())
silhouette_scores = [cluster_results[k]['silhouette'] for k in cluster_counts]
inertia_values = [cluster_results[k]['inertia'] for k in cluster_counts]
coherence_scores = [cluster_results[k]['coherence_score'] if 'coherence_score' in cluster_results[k] else None for k in cluster_counts]
norm_inertia_values = [cluster_results[k]['normalized_inertia'] if 'normalized_inertia' in cluster_results[k] else None for k in cluster_counts]
min_sizes = [cluster_results[k]['min_cluster_size'] for k in cluster_counts]
# Create subplots focused on coherence metrics
fig, axs = plt.subplots(2, 2, figsize=(16, 14))
# Plot silhouette scores - higher values = more coherent clusters
axs[0, 0].plot(cluster_counts, silhouette_scores, 'o-', color='blue')
axs[0, 0].set_title('Silhouette Score vs. Number of Clusters')
axs[0, 0].set_xlabel('Number of Clusters')
axs[0, 0].set_ylabel('Silhouette Score (higher = more coherent)')
axs[0, 0].grid(True, linestyle='--', alpha=0.7)
# Highlight best silhouette score
if None not in silhouette_scores:
best_silhouette_idx = np.argmax(silhouette_scores)
best_k_silhouette = cluster_counts[best_silhouette_idx]
best_silhouette = silhouette_scores[best_silhouette_idx]
axs[0, 0].axvline(x=best_k_silhouette, color='red', linestyle='--', alpha=0.7)
axs[0, 0].plot(best_k_silhouette, best_silhouette, 'ro', ms=10)
axs[0, 0].annotate(f'Best: k={best_k_silhouette}, score={best_silhouette:.3f}',
xy=(best_k_silhouette, best_silhouette),
xytext=(best_k_silhouette+10, best_silhouette),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
# Plot normalized inertia (per cluster) - lower values = tighter, more coherent clusters
if None not in norm_inertia_values:
axs[0, 1].plot(cluster_counts, norm_inertia_values, 'o-', color='green')
axs[0, 1].set_title('Normalized Inertia vs. Number of Clusters')
axs[0, 1].set_xlabel('Number of Clusters')
axs[0, 1].set_ylabel('Inertia per Cluster (lower = more coherent)')
axs[0, 1].grid(True, linestyle='--', alpha=0.7)
# Highlight best normalized inertia (lowest value)
best_norm_inertia_idx = np.argmin(norm_inertia_values)
best_k_norm_inertia = cluster_counts[best_norm_inertia_idx]
best_norm_inertia = norm_inertia_values[best_norm_inertia_idx]
axs[0, 1].axvline(x=best_k_norm_inertia, color='red', linestyle='--', alpha=0.7)
axs[0, 1].plot(best_k_norm_inertia, best_norm_inertia, 'ro', ms=10)
axs[0, 1].annotate(f'Best: k={best_k_norm_inertia}',
xy=(best_k_norm_inertia, best_norm_inertia),
xytext=(best_k_norm_inertia+10, best_norm_inertia),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
# Plot coherence score (combined metric optimized for coherence)
if None not in coherence_scores:
axs[1, 0].plot(cluster_counts, coherence_scores, 'o-', color='purple')
axs[1, 0].set_title('Coherence Score vs. Number of Clusters')
axs[1, 0].set_xlabel('Number of Clusters')
axs[1, 0].set_ylabel('Coherence Score (higher = better)')
axs[1, 0].grid(True, linestyle='--', alpha=0.7)
# Highlight best coherence score
best_coherence_idx = np.argmax(coherence_scores)
best_k_coherence = cluster_counts[best_coherence_idx]
best_coherence = coherence_scores[best_coherence_idx]
axs[1, 0].axvline(x=best_k_coherence, color='red', linestyle='--', alpha=0.7)
axs[1, 0].plot(best_k_coherence, best_coherence, 'ro', ms=10)
axs[1, 0].annotate(f'Best: k={best_k_coherence}, score={best_coherence:.3f}',
xy=(best_k_coherence, best_coherence),
xytext=(best_k_coherence+10, best_coherence),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
# Plot smallest cluster size - important for coherence evaluation
axs[1, 1].plot(cluster_counts, min_sizes, 'o-', color='orange')
axs[1, 1].set_title('Smallest Cluster Size vs. Number of Clusters')
axs[1, 1].set_xlabel('Number of Clusters')
axs[1, 1].set_ylabel('Points in Smallest Cluster')
axs[1, 1].grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
print(f"Visualization completed in {time.time() - start_time:.2f}s")
--- STEP 3: Visualizing Coherence Metrics ---
Visualization completed in 0.41s
In [28]:
# Step 4: Refined search for maximum coherence
print("\n--- STEP 4: Refined Search for Maximum Coherence ---")
start_time = time.time()
# Identify the best clusters based on coherence metrics
best_k_values = []
# Silhouette score is an excellent direct measure of coherence
if None not in silhouette_scores:
# Get the top 3 silhouette scores
sil_indices = np.argsort(silhouette_scores)[-3:]
for idx in sil_indices:
best_k_values.append(cluster_counts[idx])
# Normalized inertia is another good indicator of coherence
if None not in norm_inertia_values:
# Get the top 3 normalized inertia values (lowest)
norm_inertia_indices = np.argsort(norm_inertia_values)[:3]
for idx in norm_inertia_indices:
best_k_values.append(cluster_counts[idx])
# Add the best coherence score if available
if None not in coherence_scores:
best_k_values.append(cluster_counts[best_coherence_idx])
# For coherence, median is a good approach (not biasing toward either extreme)
optimal_k_approx = int(np.median(best_k_values))
print(f"Candidate optimal cluster counts based on coherence: {best_k_values}")
print(f"Median optimal cluster count: {optimal_k_approx}")
# For coherence, we need a narrower search to really find the sweet spot
fine_min = max(50, optimal_k_approx - 15)
fine_max = min(500, optimal_k_approx + 15)
fine_step = 1 # Single-step precision for finding the most coherent clustering
fine_cluster_range = list(range(fine_min, fine_max + 1, fine_step))
print(f"Performing fine-grained coherence search around k={optimal_k_approx}")
print(f"Testing cluster counts: {fine_min} to {fine_max} in steps of {fine_step}")
# Run fine-grained evaluations in parallel
with parallel_backend('loky', n_jobs=n_jobs):
fine_results = Parallel(verbose=1)(
delayed(evaluate_kmeans)(k, X_sample) for k in fine_cluster_range if k not in cluster_results
)
# Add new results to the dictionary
for r in fine_results:
cluster_results[r['n_clusters']] = r
# Update plotting data for coherence metrics
cluster_counts = sorted(cluster_results.keys())
silhouette_scores = [cluster_results[k]['silhouette'] for k in cluster_counts]
coherence_scores = [cluster_results[k].get('coherence_score', None) for k in cluster_counts]
# Determine the optimal k by prioritizing coherence score, then silhouette
if None not in coherence_scores:
best_coherence_idx = np.argmax(coherence_scores)
optimal_k = cluster_counts[best_coherence_idx]
optimal_score = coherence_scores[best_coherence_idx]
print(f"Using coherence score to determine optimal k={optimal_k}, score={optimal_score:.3f}")
else:
# Fall back to silhouette if coherence score is unavailable
best_silhouette_idx = np.argmax(silhouette_scores)
optimal_k = cluster_counts[best_silhouette_idx]
optimal_score = silhouette_scores[best_silhouette_idx]
print(f"Using silhouette score to determine optimal k={optimal_k}, score={optimal_score:.3f}")
print(f"\nFine-grained search completed in {time.time() - start_time:.2f}s")
print(f"Optimal number of clusters for maximum coherence: k={optimal_k}")
if cluster_results[optimal_k]['silhouette'] is not None:
print(f"Silhouette score at k={optimal_k}: {cluster_results[optimal_k]['silhouette']:.3f}")
if 'coherence_score' in cluster_results[optimal_k] and cluster_results[optimal_k]['coherence_score'] is not None:
print(f"Coherence score at k={optimal_k}: {cluster_results[optimal_k]['coherence_score']:.3f}")
--- STEP 4: Refined Search for Maximum Coherence ---
Candidate optimal cluster counts based on coherence: [108, 114, 100, 498, 496, 500, 326]
Median optimal cluster count: 326
Performing fine-grained coherence search around k=326
Testing cluster counts: 311 to 341 in steps of 1
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
Using coherence score to determine optimal k=326, score=0.405
Fine-grained search completed in 1.75s
Optimal number of clusters for maximum coherence: k=326
Silhouette score at k=326: 0.169
Coherence score at k=326: 0.405
[Parallel(n_jobs=10)]: Done 14 out of 16 | elapsed: 1.6s remaining: 0.2s
[Parallel(n_jobs=10)]: Done 16 out of 16 | elapsed: 1.6s finished
In [29]:
# Step 5: Visualize final results
print("\n--- STEP 5: Visualizing Final Results ---")
start_time = time.time()
# Create updated plots with fine-grained results
fig, axs = plt.subplots(2, 2, figsize=(15, 12))
# Use the silhouette and coherence lists recomputed in Step 4
# Sort data by cluster count for proper plotting
sorted_indices = np.argsort(cluster_counts)
sorted_cluster_counts = np.array(cluster_counts)[sorted_indices]
sorted_silhouette = np.array(silhouette_scores)[sorted_indices]
# Plot silhouette scores with zoomed region
axs[0, 0].plot(sorted_cluster_counts, sorted_silhouette, 'o-', color='blue', alpha=0.4)
axs[0, 0].set_title('Silhouette Score vs. Number of Clusters')
axs[0, 0].set_xlabel('Number of Clusters')
axs[0, 0].set_ylabel('Silhouette Score (higher = more coherent)')
axs[0, 0].grid(True, linestyle='--', alpha=0.7)
axs[0, 0].axvline(x=optimal_k, color='red', linestyle='--', alpha=0.7)
# Zoom in on the region around optimal_k for silhouette
zoom_min = max(0, np.where(sorted_cluster_counts >= optimal_k - 50)[0][0])
zoom_max = min(len(sorted_cluster_counts) - 1, np.where(sorted_cluster_counts <= optimal_k + 50)[0][-1])
zoom_x = sorted_cluster_counts[zoom_min:zoom_max+1]
zoom_y = sorted_silhouette[zoom_min:zoom_max+1]
axs[0, 1].plot(zoom_x, zoom_y, 'o-', color='blue')
axs[0, 1].set_title(f'Silhouette Score (Zoomed on k={optimal_k})')
axs[0, 1].set_xlabel('Number of Clusters')
axs[0, 1].set_ylabel('Silhouette Score')
axs[0, 1].grid(True, linestyle='--', alpha=0.7)
axs[0, 1].axvline(x=optimal_k, color='red', linestyle='--', alpha=0.7)
axs[0, 1].plot(optimal_k, cluster_results[optimal_k]['silhouette'], 'ro', ms=10)
# Plot coherence scores if available
valid_coherence_scores = [score for score in coherence_scores if score is not None]
if len(valid_coherence_scores) > 0:
# Filter out None values and get corresponding cluster counts
valid_indices = [i for i, score in enumerate(coherence_scores) if score is not None]
valid_counts = [cluster_counts[i] for i in valid_indices]
sorted_c_indices = np.argsort(valid_counts)
# Sort coherence scores by cluster count
sorted_c_counts = np.array(valid_counts)[sorted_c_indices]
sorted_coherence = np.array(valid_coherence_scores)[sorted_c_indices]
# Plot full range
axs[1, 0].plot(sorted_c_counts, sorted_coherence, 'o-', color='magenta', alpha=0.4)
axs[1, 0].set_title('Coherence Score vs. Number of Clusters')
axs[1, 0].set_xlabel('Number of Clusters')
axs[1, 0].set_ylabel('Coherence Score (higher = better)')
axs[1, 0].grid(True, linestyle='--', alpha=0.7)
axs[1, 0].axvline(x=optimal_k, color='red', linestyle='--', alpha=0.7)
# Try to plot zoomed view if we have enough data points around optimal_k
if 'coherence_score' in cluster_results[optimal_k] and cluster_results[optimal_k]['coherence_score'] is not None:
# Find coherence scores near the optimal_k
zoom_coherence_counts = [c for c in sorted_c_counts if optimal_k-50 <= c <= optimal_k+50]
if len(zoom_coherence_counts) > 0:
zoom_coherence_indices = [i for i, c in enumerate(sorted_c_counts) if c in zoom_coherence_counts]
zoom_c_x = sorted_c_counts[zoom_coherence_indices]
zoom_c_y = sorted_coherence[zoom_coherence_indices]
axs[1, 1].plot(zoom_c_x, zoom_c_y, 'o-', color='magenta')
axs[1, 1].set_title(f'Coherence Score (Zoomed on k={optimal_k})')
axs[1, 1].set_xlabel('Number of Clusters')
axs[1, 1].set_ylabel('Coherence Score')
axs[1, 1].grid(True, linestyle='--', alpha=0.7)
axs[1, 1].axvline(x=optimal_k, color='red', linestyle='--', alpha=0.7)
axs[1, 1].plot(optimal_k, cluster_results[optimal_k]['coherence_score'], 'ro', ms=10)
else:
print("No valid coherence scores available for plotting")
plt.tight_layout()
plt.show()
print(f"Visualization completed in {time.time() - start_time:.2f}s")
--- STEP 5: Visualizing Final Results ---
Visualization completed in 0.21s
In [30]:
# Step 6: Apply best model to full dataset
print(f"\n--- STEP 6: Clustering Full Dataset with k={optimal_k} ---")
start_time = time.time()
print(f"Applying MiniBatchKMeans with {optimal_k} clusters to the full football pass dataset...")
# Create final model with optimal number of clusters
final_kmeans = MiniBatchKMeans(
n_clusters=optimal_k,
batch_size=min(4096, len(scaled_X)//20), # Adjust batch size based on dataset size
init='k-means++',
max_iter=300,
n_init=5,
random_state=42
)
# Process very large datasets in batches
batch_size = 50000
if scaled_X.shape[0] > 1000000: # For extremely large datasets
print("Using partial_fit for extremely large dataset...")
# Initialize centroids with a sample
final_kmeans.partial_fit(X_sample)
# Process remaining data in batches
for i in tqdm(range(0, scaled_X.shape[0], batch_size)):
end = min(i + batch_size, scaled_X.shape[0])
if i not in sample_indices: # Skip points already used for initialization
final_kmeans.partial_fit(scaled_X[i:end])
# Get labels for all points
print("Predicting cluster labels for all points...")
labels = np.zeros(scaled_X.shape[0], dtype=int)
for i in tqdm(range(0, scaled_X.shape[0], batch_size)):
end = min(i + batch_size, scaled_X.shape[0])
labels[i:end] = final_kmeans.predict(scaled_X[i:end])
else:
# For moderately large datasets, fit directly
print("Fitting model on full dataset...")
final_kmeans.fit(scaled_X)
labels = final_kmeans.labels_
print(f"Full dataset clustering completed in {time.time() - start_time:.2f}s")
--- STEP 6: Clustering Full Dataset with k=326 ---
Applying MiniBatchKMeans with 326 clusters to the full football pass dataset...
Fitting model on full dataset...
Full dataset clustering completed in 1.86s
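Optionally (an addition, not in the original notebook), the fitted model can be persisted with joblib so the same cluster assignments can be reproduced later without refitting; the file name and the new_scaled_passes array below are hypothetical:
# Sketch: persist the final clustering model for reuse
from joblib import dump, load
dump(final_kmeans, f"pass_clusters_k{optimal_k}_{season}.joblib")
# later: final_kmeans = load(f"pass_clusters_k{optimal_k}_{season}.joblib")
#        labels = final_kmeans.predict(new_scaled_passes)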
In [31]:
# Step 7: Analyze and interpret football pass clusters
print("\n--- STEP 7: Analyzing Football Pass Cluster Results ---")
# Calculate cluster sizes
cluster_sizes = np.bincount(labels)
sorted_indices = np.argsort(cluster_sizes)[::-1] # Sort by size (descending)
print(f"Number of football pass clusters: {optimal_k}")
print(f"Largest cluster: {cluster_sizes.max()} passes ({cluster_sizes.max()/len(labels)*100:.2f}%)")
print(f"Smallest cluster: {cluster_sizes.min()} passes ({cluster_sizes.min()/len(labels)*100:.2f}%)")
print(f"Average cluster size: {cluster_sizes.mean():.1f} passes")
print(f"Median cluster size: {np.median(cluster_sizes):.1f} passes")
# Display info about top clusters
print("\nLargest football pass clusters:")
for i, idx in enumerate(sorted_indices[:10]):
size = cluster_sizes[idx]
percentage = size / len(labels) * 100
print(f"Cluster {idx}: {size} passes ({percentage:.2f}%)")
if len(sorted_indices) > 10:
print(f"... and {len(sorted_indices)-10} more pass clusters")
# Visualize cluster size distribution
plt.figure(figsize=(12, 6))
plt.bar(range(len(cluster_sizes)), cluster_sizes[sorted_indices], alpha=0.7)
plt.title(f'Football Pass Cluster Size Distribution (k={optimal_k})')
plt.xlabel('Cluster Rank (by size)')
plt.ylabel('Number of Passes')
plt.grid(True, linestyle='--', alpha=0.5, axis='y')
plt.tight_layout()
plt.show()
# Calculate total runtime
total_runtime = time.time() - start_total_time
print(f"\nTotal football pass clustering pipeline runtime: {total_runtime:.2f} seconds")
# Assign cluster labels to original dataframe
dfx['cluster'] = labels
print("\nFootball pass clustering complete!")
print(f"Cluster labels have been added to 'dfx' dataframe as 'cluster' column")
print(f"Optimal number of clusters for football passing analysis: {optimal_k}")
--- STEP 7: Analyzing Football Pass Cluster Results ---
Number of football pass clusters: 326
Largest cluster: 7208 passes (0.90%)
Smallest cluster: 327 passes (0.04%)
Average cluster size: 2466.0 passes
Median cluster size: 2362.0 passes

Largest football pass clusters:
Cluster 0: 7208 passes (0.90%)
Cluster 58: 6792 passes (0.84%)
Cluster 14: 6288 passes (0.78%)
Cluster 67: 6067 passes (0.75%)
Cluster 4: 5964 passes (0.74%)
Cluster 209: 5682 passes (0.71%)
Cluster 241: 5636 passes (0.70%)
Cluster 202: 5517 passes (0.69%)
Cluster 217: 5440 passes (0.68%)
Cluster 200: 5427 passes (0.68%)
... and 316 more pass clusters

Total football pass clustering pipeline runtime: 21.71 seconds

Football pass clustering complete!
Cluster labels have been added to 'dfx' dataframe as 'cluster' column
Optimal number of clusters for football passing analysis: 326
In [32]:
#Finding the 5th and 95th percentiles of pass VAEP values to set the colormap range in the visualizations
x_min = dfx.vaep_value.quantile(0.05)
x_max = dfx.vaep_value.quantile(0.95)
In [33]:
#Get the whole list of players in the data
playerlist = dfx['player_name'].unique().tolist()
cleaned_playerlist = [name for name in playerlist if pd.notna(name)]
cleaned_playerlist.sort()
In [34]:
from IPython.display import display, HTML
# Generate the HTML dropdown to easily search for players
options_html = ''.join([f'<option value="{name}">{name}</option>' for name in cleaned_playerlist])
dropdown_html = f"""
<input list="players" id="dropdown" oninput="handleInput()" placeholder="Choose Someone">
<datalist id="players">
{options_html}
</datalist>
<p id="output"></p>
<script>
function handleInput() {{
var input = document.getElementById("dropdown").value;
var output = document.getElementById("output");
output.innerHTML = "Selected: " + input;
}}
</script>
"""
# Display the dropdown
display(HTML(dropdown_html))
In [35]:
#Selecting player and filtering the dataframe
dfp = dfx[dfx["player_name"] == 'Rayan Cherki']
In [36]:
#Defining the function that plots one cluster's passes on a pitch (called once per subplot below)
def plot_passes(ax, cluster_name):
plot_df = dfx[(dfx['cluster'] == cluster_name)]
pitch = Pitch(
pitch_type='custom',
pitch_width=68,
pitch_length=105,
goal_type='box',
linewidth=2,
line_color='black',
half=False)
pitch.draw(ax = ax)
cmap = matplotlib.colormaps.get_cmap('afmhot_r')
vmin = x_min # or set manually, e.g., vmin = 0
vmax = x_max # or set manually, e.g., vmax = 1
# Normalize the data to the specified range
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
# Apply the colormap and normalization to the passes being plotted
colors = cmap(norm(plot_df['vaep_value']))
pitch.arrows(plot_df.start_x_a0, plot_df.start_y_a0, plot_df.end_x_a0, plot_df.end_y_a0, width=2.5, alpha=0.8,
headwidth=10, headlength=8, color=colors, label='attempted passes', ax=ax)
ax.annotate(
xy=(50,72.5),
text=f"Passes: {order['attempted_passes'].iloc[index]} | Actual - Expected Completion % (PAx100): {(order['PAx100'].iloc[index]).round(2)}",
size=20,
color='black',
ha='center',
va='center',
weight='bold',
annotation_clip=False
)
# Annotation with data coordinates and offset points
ax.annotate(
text="",
xy=(65, 69.5), # Target point on the axes
xytext=(-200, 0), # Offset of the text relative to the target point
textcoords="offset points",
size=27,
color="#000000",
arrowprops=dict(arrowstyle="-|>", shrinkA=0, color="black", linewidth=2)
)
# Create and customize the colorbar
cbar_ax = fig.add_axes([0.9, 0.25, 0.02, 0.5]) # Adjust position and size of colorbar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([]) # Set the array for the ScalarMappable
cbar = plt.colorbar(sm, cax=cbar_ax)
# Add title to the colorbar
cbar.ax.set_title('ATOMIC-VAEP', fontsize=20, pad=20, rotation=0, loc='center')
# Adjust font size of the colorbar tick labels
cbar.ax.tick_params(labelsize=15)
return ax
In [37]:
# Grouping the DataFrame by 'cluster' and counting successful passes
order_teams0 = (dfx.groupby(["cluster"], observed=True)['outcome'].sum().reset_index(name='successful_passes'))
# Grouping the DataFrame by 'cluster' and total number of passes
order_teams1 = (dfx.groupby(["cluster"], observed=True)['outcome'].count().reset_index(name='attempted_passes'))
# Grouping the DataFrame by 'cluster' and total xP
order_teams2 = (dfx.groupby(["cluster"], observed=True)['xP'].sum().reset_index())
# Grouping the DataFrame by 'cluster' and median vaep value inside the subset
order_teams3 = (dfx.groupby(["cluster"], observed=True)['vaep_value'].median().reset_index())
# Merging the grouped DataFrames on 'cluster'
order = (order_teams0
.merge(order_teams1, left_on='cluster', right_on='cluster')
.merge(order_teams2, left_on='cluster', right_on='cluster')
.merge(order_teams3, left_on='cluster', right_on='cluster'))
# Calculating the success %, expected % and the difference for each cluster
order['success_pct'] = ((order['successful_passes'] * 100) / order['attempted_passes'])
order['xP_pct'] = (order['xP'] / order['attempted_passes']) * 100
order['PAx100'] = (((order['successful_passes'] - order['xP']) / order['attempted_passes']) * 100)
# Sorting the DataFrame in descending order and resetting the index
order = (order.sort_values(by=['vaep_value'], ascending=False).reset_index(drop=True))
order
Out[37]:
    | cluster | successful_passes | attempted_passes | xP | vaep_value | success_pct | xP_pct | PAx100
0   | 236 | 1946 | 2590 | 1813.219668 | 0.018239 | 75.135135 | 70.008481 | 5.126654
1   | 94 | 1787 | 2416 | 1828.837343 | 0.015238 | 73.965232 | 75.696910 | -1.731678
2   | 10 | 2312 | 3250 | 2464.107256 | 0.014835 | 71.138462 | 75.818685 | -4.680223
3   | 237 | 1585 | 2682 | 2068.813586 | 0.014308 | 59.097688 | 77.136972 | -18.039284
4   | 113 | 2834 | 3642 | 2584.631934 | 0.014041 | 77.814388 | 70.967379 | 6.847009
... | ... | ... | ... | ... | ... | ... | ... | ...
321 | 180 | 2239 | 2366 | 1829.358303 | -0.009789 | 94.632291 | 77.318610 | 17.313681
322 | 321 | 1904 | 2054 | 1567.126787 | -0.010228 | 92.697176 | 76.296338 | 16.400838
323 | 308 | 1994 | 2140 | 1610.955580 | -0.010957 | 93.177570 | 75.278298 | 17.899272
324 | 13 | 22 | 501 | 231.886987 | -0.015254 | 4.391218 | 46.284828 | -41.893610
325 | 105 | 37 | 587 | 284.406141 | -0.015378 | 6.303237 | 48.450791 | -42.147554

326 rows × 8 columns
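As a quick check of the formulas in the cell above, recomputing the first row (cluster 236: 1946 of 2590 passes completed, summed xP ≈ 1813.22) reproduces the printed percentages:
successful, attempted, xp_sum = 1946, 2590, 1813.219668
print(round(successful * 100 / attempted, 2))              # success_pct -> 75.14
print(round(xp_sum / attempted * 100, 2))                  # xP_pct      -> 70.01
print(round((successful - xp_sum) / attempted * 100, 2))   # PAx100      -> 5.13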
In [38]:
#Creating fig, axes and launch the def
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(40,25), dpi=200)
axs = np.array(axs)
for index, ax in enumerate(axs.reshape(-1)):
plot_passes(ax, order['cluster'].iloc[index])
# Adjust the space between subplots
plt.subplots_adjust(wspace=0.001, hspace=0.0001)
# Adding some text to better understand what the plot represents
plt.text(0.5, 0.93, f'Top 4 Most Valuable Pass Clusters by {", ".join(dfx["position_group"].unique())} in {season}',
transform=fig.transFigure, horizontalalignment='center', fontsize=45)
plt.text(0.5, 0.91, "Ordered by median ATOMIC-VAEP per pass",
transform=fig.transFigure, horizontalalignment='center', fontsize=25)
fig.suptitle(f'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com',
x=0.5, y=0.097, fontsize=20, verticalalignment='bottom')
Out[38]:
Text(0.5, 0.097, 'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com')
In [39]:
#Defining the same plotting function, restricted to the selected player's passes
def plot_passes_player(ax, cluster_name):
plot_df = dfp[(dfp['cluster'] == cluster_name)]
pitch = Pitch(
pitch_type='custom',
pitch_width=68,
pitch_length=105,
goal_type='box',
linewidth=2,
line_color='black',
half=False)
pitch.draw(ax = ax)
cmap = matplotlib.colormaps.get_cmap('afmhot_r')
vmin = x_min # or set manually, e.g., vmin = 0
vmax = x_max # or set manually, e.g., vmax = 1
# Normalize the data to the specified range
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
# Apply the colormap and normalization to the passes being plotted
colors = cmap(norm(plot_df['vaep_value']))
pitch.arrows(plot_df.start_x_a0, plot_df.start_y_a0, plot_df.end_x_a0, plot_df.end_y_a0, width=2.5, alpha=0.8,
headwidth=10, headlength=8, color=colors, label='attempted passes', ax=ax)
ax.annotate(
xy=(50,72.5),
text=f"Passes: {order_player['attempted_passes'].iloc[index]} | Actual - Expected Completion % (PAx100): {(order_player['PAx100'].iloc[index]).round(2)}",
size=20,
color='black',
ha='center',
va='center',
weight='bold',
annotation_clip=False
)
# Annotation with data coordinates and offset points
ax.annotate(
text="",
xy=(65, 69.5), # Target point on the axes
xytext=(-200, 0), # Offset of the text relative to the target point
textcoords="offset points",
size=27,
color="#000000",
arrowprops=dict(arrowstyle="-|>", shrinkA=0, color="black", linewidth=2)
)
# Create and customize the colorbar
cbar_ax = fig.add_axes([0.9, 0.25, 0.02, 0.5]) # Adjust position and size of colorbar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([]) # Set the array for the ScalarMappable
cbar = plt.colorbar(sm, cax=cbar_ax)
# Add title to the colorbar
cbar.ax.set_title('ATOMIC-VAEP', fontsize=20, pad=20, rotation=0, loc='center')
# Adjust font size of the colorbar tick labels
cbar.ax.tick_params(labelsize=15)
return ax
In [40]:
# Grouping the DataFrame by 'cluster' and counting successful passes
order_player0 = (dfp.groupby(["cluster"], observed=True)['outcome'].sum().reset_index(name='successful_passes'))
# Grouping the DataFrame by 'cluster' and total number of passes
order_player1 = (dfp.groupby(["cluster"], observed=True)['outcome'].count().reset_index(name='attempted_passes'))
# Grouping the DataFrame by 'cluster' and total xP
order_player2 = (dfp.groupby(["cluster"], observed=True)['xP'].sum().reset_index())
# Grouping the DataFrame by 'cluster' and median vaep value inside the subset
order_player3 = (dfp.groupby(["cluster"], observed=True)['vaep_value'].median().reset_index())
# Merging the grouped DataFrames on 'cluster'
order_player = (order_player0
.merge(order_player1, left_on='cluster', right_on='cluster')
.merge(order_player2, left_on='cluster', right_on='cluster')
.merge(order_player3, left_on='cluster', right_on='cluster'))
# Calculating the success %, expected % and difference for each cluster
order_player['success_pct'] = ((order_player['successful_passes'] * 100) / order_player['attempted_passes'])
order_player['xP_pct'] = (order_player['xP'] / order_player['attempted_passes']) * 100
order_player['PAx100'] = (((order_player['successful_passes'] - order_player['xP']) / order_player['attempted_passes']) * 100)
# Sorting the DataFrame in descending order and resetting the index
order_player = (order_player.sort_values(by=['attempted_passes'], ascending=False).reset_index(drop=True))
order_player
Out[40]:
    | cluster | successful_passes | attempted_passes | xP | vaep_value | success_pct | xP_pct | PAx100
0   | 237 | 17 | 22 | 16.915477 | 0.013659 | 77.272727 | 76.888530 | 0.384197
1   | 10 | 16 | 20 | 15.702311 | 0.012405 | 80.000000 | 78.511555 | 1.488445
2   | 113 | 13 | 18 | 12.316653 | 0.009773 | 72.222222 | 68.425849 | 3.796373
3   | 29 | 18 | 18 | 15.090296 | -0.006214 | 100.000000 | 83.834977 | 16.165023
4   | 58 | 18 | 18 | 15.563379 | -0.002910 | 100.000000 | 86.463219 | 13.536781
... | ... | ... | ... | ... | ... | ... | ... | ...
291 | 25 | 1 | 1 | 0.886431 | -0.002829 | 100.000000 | 88.643140 | 11.356860
292 | 24 | 1 | 1 | 0.750909 | -0.001206 | 100.000000 | 75.090945 | 24.909055
293 | 276 | 0 | 1 | 0.818571 | 0.000842 | 0.000000 | 81.857090 | -81.857090
294 | 277 | 1 | 1 | 0.882073 | -0.000296 | 100.000000 | 88.207290 | 11.792710
295 | 250 | 1 | 1 | 0.859577 | 0.002061 | 100.000000 | 85.957740 | 14.042260

296 rows × 8 columns
In [41]:
#Creating fig, axes and launching the def
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(40,25), dpi=200)
axs = np.array(axs)
for index, ax in enumerate(axs.reshape(-1)):
plot_passes_player(ax, order_player['cluster'].iloc[index])
# Adjust the space between subplots
plt.subplots_adjust(wspace=0.001, hspace=0.0001)
# Adding text to the plot
plt.text(0.5, 0.93, f'Top 4 Most Performed Pass Clusters for {", ".join(dfp["player_name"].unique())}',
transform=fig.transFigure, horizontalalignment='center', fontsize=45)
plt.text(0.5, 0.91, f"Ordered by pass frequency | {', '.join(dfp['competition_id'].unique())} {', '.join(dfp['formatted_season'].unique())}",
transform=fig.transFigure, horizontalalignment='center', fontsize=25)
fig.suptitle(f'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com',
x=0.5, y=0.097, fontsize=20, verticalalignment='bottom')
#Adding the logo of the team
fotmob_url = "https://images.fotmob.com/image_resources/logo/teamlogo/"
logo_ax = fig.add_axes([.2, 0.9, 0.08, 0.08], zorder=1)
club_icon = Image.open(urllib.request.urlopen(f"{fotmob_url}{dfp['fotmob_id'].iloc[0]}.png"))
logo_ax.imshow(club_icon)
logo_ax.axis("off")
# Save the figure with adjusted face color and transparency
plt.savefig(f'{', '.join(dfp['player_name'].unique())}-passingclusters-{season}.png',
dpi=500, facecolor="#D7D1CF", bbox_inches="tight", transparent=True)
In [42]:
# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
try:
# Convert to string in case it's a float (e.g., NaN)
time_str = str(time_str)
# Split the time string into minutes and seconds
minutes, seconds = map(int, time_str.split(':'))
# Convert total time to minutes (seconds converted to fraction of minutes)
return minutes + seconds / 60
except (ValueError, AttributeError):
# Handle cases where the conversion fails (e.g., NaN or bad format)
return 0 # or use `np.nan` if you prefer to mark as missing
# Apply the conversion function to the 'minutes_played' column
players['minutes_played_converted'] = players['minutes_played'].apply(convert_to_minutes)
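A quick usage check of the helper:
print(convert_to_minutes("90:30"))       # 90.5
print(convert_to_minutes(float('nan')))  # 0 (missing or malformed values fall back to 0)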
In [43]:
#Merge the player dataframe with the team info dataframe
players0 = players.merge(fotmob)
In [44]:
#Creating a table with minutes played in season by players
mp = players0.groupby(["player_name", "team_name"])["minutes_played_converted"].sum().reset_index(name='minutes_played')
In [45]:
# Sorting the DataFrame by atomic vaep value in descending order to keep 10% most valuable
order = (order.sort_values(by=['vaep_value'], ascending=False).reset_index(drop=True))
#Setting the list of those 10% clusters
most_valuable = order.head(int(optimal_k/10)).cluster.unique()
In [46]:
#Keeping the 10% of most valuable clusters
A = dfx[dfx['cluster'].isin(most_valuable)]
In [47]:
# Grouping by player and team: number of successful passes
A0 = (A.groupby(["player_name", "team_name"], observed=True)['outcome'].sum().reset_index(name='successful_passes'))
# Grouping by player and team: total number of attempted passes
A1 = (A.groupby(["player_name", "team_name"], observed=True)['outcome'].count().reset_index(name='attempted_passes'))
# Grouping by player and team: total xP
A2 = (A.groupby(["player_name", "team_name"], observed=True)['xP'].sum().reset_index())
# Grouping by player and team: summed vaep value
A3 = (A.groupby(["player_name", "team_name"], observed=True)['vaep_value'].sum().reset_index())
# Merging the grouped DataFrames on player and team
AA = (A0
.merge(A1, left_on=["player_name", "team_name"], right_on=["player_name", "team_name"])
.merge(A2, left_on=["player_name", "team_name"], right_on=["player_name", "team_name"])
.merge(A3, left_on=["player_name", "team_name"], right_on=["player_name", "team_name"]))
# Calculating the success %, expected % and PAx100 for each player
AA['success_pct'] = ((AA['successful_passes'] * 100) / AA['attempted_passes'])
AA['xP_pct'] = (AA['xP'] / AA['attempted_passes']) * 100
AA['PAx100'] = (((AA['successful_passes'] - AA['xP']) / AA['attempted_passes']) * 100).round(2)
In [48]:
#Computing the median game duration, used to normalize the per-98-minute metrics
minutesadj = players0.groupby(["game_id", "game_duration"], observed=True)['is_starter'].count().reset_index(name='is_starter')
# Apply the conversion function to the 'game_duration' column
minutesadj['game_duration_converted'] = minutesadj['game_duration'].apply(convert_to_minutes)
minutesadj = minutesadj.game_duration_converted.median()
minutesadj
Out[48]:
98.43333333333334
In [49]:
#Merging with minutes played for players and calculating the final metrics we want
AA0 = AA.merge(mp, how='left')
AA0["passes_98"] = ((AA0.attempted_passes * minutesadj) / AA0.minutes_played).round(2)
AA0["vaep_value_98"] = ((AA0.vaep_value * minutesadj) / AA0.minutes_played).round(3)
In [50]:
#Keeping players with at least 500 minutes
AA1 = AA0[AA0['minutes_played'] >= 500]
#Filter the columns we want to keep
AA2 = AA1[['player_name', 'team_name', 'PAx100', 'passes_98', 'vaep_value_98']]
In [51]:
#Selecting the metric we want to analyze and visualize
metric = 'vaep_value_98'
In [52]:
# Selecting top 10
AAFa = (AA2.sort_values(by=[metric], ascending=False).reset_index(drop=True)).head(10)
#Re-sorting ascending because the table is drawn from the bottom row up
AAF = (AAFa.sort_values(by=[metric], ascending=True).reset_index(drop=True))
AAF
Out[52]:
  | player_name | team_name | PAx100 | passes_98 | vaep_value_98
0 | Abdellah Zoubir | Qarabag FK | -4.44 | 7.27 | 0.101
1 | Michael Olise | Bayern | -5.58 | 8.43 | 0.103
2 | Lazar Samardzic | Atalanta | -6.06 | 8.17 | 0.103
3 | Raheem Sterling | Arsenal | -14.91 | 5.53 | 0.104
4 | Nicolas Kühn | Celtic | -13.72 | 7.35 | 0.110
5 | Rayan Cherki | Lyon | -5.21 | 9.82 | 0.115
6 | Martin Ødegaard | Arsenal | -8.99 | 8.90 | 0.117
7 | Luciano Acosta | FC Cincinnati | -8.22 | 11.07 | 0.120
8 | Lionel Messi | Inter Miami CF | -13.39 | 12.42 | 0.142
9 | Luke McCowan | Celtic | -2.05 | 8.65 | 0.174
In [53]:
#Setting up the figure, the axes and the figure dimensions so everything fits nicely
fig = plt.figure(figsize=(1800/500, 1800/500), dpi=500)
ax = plt.subplot()
ncols = AAF.shape[1]
nrows = AAF.shape[0]
ax.set_xlim(0, ncols + 1)
ax.set_ylim(0, nrows + 1)
positions = [0.1, 3.4, 4.8]
columns = ['player_name', 'team_name', metric]
#Choosing alignment and styling for each column (player names, team names, highlighted metric)
for i in range(nrows):
for j, column in enumerate(columns):
if j == 0:
ha = 'left'
else:
ha = 'center'
if column == metric:
fontsize = 10
color = '#FFFFFF'
fontname = fe_semibold.name
elif column == 'team_name':
fontsize = 4
color = '#4E616C'
fontname = fe_regular.name
else:
fontsize = 11
color = '#000000'
fontname = fe_semibold.name
ax.annotate(
xy=(positions[j], i + .5), text=str(AAF[column].iloc[i]), ha=ha, va='center', fontsize=fontsize, color=color, fontname=fontname)
# Add dividing lines and color for the column to highlight
ax.plot([ax.get_xlim()[0], ax.get_xlim()[1]], [nrows, nrows], lw=1.5, color='black', marker='', zorder=4)
ax.plot([ax.get_xlim()[0], ax.get_xlim()[1]], [0, 0], lw=1.5, color='black', marker='', zorder=4)
for x in range(1, nrows):
ax.plot([ax.get_xlim()[0], ax.get_xlim()[1]], [x, x], lw=0.5, color='gray', ls='-', zorder=3 , marker='')
ax.fill_between(x=[4.2, 5.4], y1=nrows, y2=0, color='#D32F2F', alpha=0.5, ec='None')
# Adding titles and notes
if metric == 'vaep_value_98':
plt.text(0.5, 0.86, 'ATOMIC VAEP added per 98', transform=fig.transFigure,
horizontalalignment='center', fontsize=12, fontfamily='SourceSansPro-SemiBold')
elif metric == 'PAx100':
plt.text(0.5, 0.86, 'Passes above expectations %', transform=fig.transFigure,
horizontalalignment='center', fontsize=12, fontfamily='SourceSansPro-SemiBold')
elif metric == 'passes_98':
plt.text(0.5, 0.86, 'Passes played per 98', transform=fig.transFigure,
horizontalalignment='center', fontsize=12, fontfamily='SourceSansPro-SemiBold')
plt.text(0.5, 0.83, f'Attacking midfielders & wingers | Minimum 500 minutes played | Passes from the 10% most valuable clusters',
transform=fig.transFigure, horizontalalignment='center', fontsize = 4, color = '#4E616C')
fig.suptitle(f'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com',
horizontalalignment='center', x = 0.5, y = 0.09, fontsize=3, color = "#000000")
#Saving and showing
ax.set_axis_off()
plt.savefig(f'TOP_FOR_CLUSTERS.png', dpi=500, facecolor = "#D7D1CF", bbox_inches = "tight", transparent = True)
plt.show()