In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib.colors as mcolors
import matplotlib.patheffects as path_effects
import seaborn as sns
from tqdm.auto import tqdm
from mplsoccer import Pitch, VerticalPitch, lines
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from PIL import Image
import urllib
import time
from joblib import Parallel, delayed, parallel_backend
import socceraction
import socceraction.atomic.spadl as atomicspadl
import socceraction.spadl as spadl
import multiprocessing
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import warnings

# Suppress unnecessary warnings
warnings.filterwarnings('ignore')
In [2]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
    name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
    name='SourceSansPro-SemiBold'
)

# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)

# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
season = 2425
In [4]:
# Load light datasets from CSV files
VAEP = pd.read_csv("aVAEPactions.csv", index_col = 0)
xP = pd.read_csv("xPactions.csv", index_col = 0)
fotmob = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
positions = pd.read_csv("clustered_position.csv", index_col = 0)
In [5]:
# Load events datasets from CSV files
aactions = pd.read_csv(f"atomic_actions{season}.csv", index_col = 0)
actions = pd.read_csv(f"actions{season}.csv", index_col = 0)
In [6]:
# Load to be modified datasets from CSV files
players = pd.read_csv(f"players{season}.csv", index_col = 0)
players_info = players[['game_id', 'team_id', 'player_id', 'competition_id', 'season_id']]

recoveries = pd.read_csv(f"recoveries_id{season}.csv", index_col = 0)
recoveries.rename(columns = {'event_id':'original_event_id'}, inplace = True)
In [7]:
# Add descriptive action names to the atomic actions DataFrame
aactions = atomicspadl.add_names(aactions)
In [8]:
# Add descriptive action names to the actions DataFrame
actions = spadl.add_names(actions)
In [9]:
# Merge datasets to create a unified DataFrame
dfa = (
    aactions
    .merge(players_info, how="left")
    .merge(fotmob, how="left")
    .merge(VAEP, how="left"))
In [10]:
# Merge datasets to create a unified DataFrame
dfb = (
    actions
    .merge(players_info, how="left")
    .merge(fotmob, how="left")
    .merge(xP, how="left")
    .merge(positions, how="left"))
In [11]:
#Adding to the frame the features needed to define progressive actions
dfb["beginning_distance"] = np.sqrt(np.square(105-dfb['start_x_a0']) + np.square(34-dfb['start_y_a0'])).round(2)
dfb["end_distance"] = np.sqrt(np.square(105-dfb['end_x_a0']) + np.square(34-dfb['end_y_a0'])).round(2)
dfb["length"] = dfb["end_distance"] - dfb["beginning_distance"]
dfb['length'] = dfb['length'].abs()
dfb["angle"] = np.arctan2(dfb["end_y_a0"] - dfb["start_y_a0"], dfb["end_x_a0"] - dfb["start_x_a0"])
dfb['angle_degrees'] = np.degrees(dfb['angle']) % 360
In [12]:
#Adding previous-action info, used for filtering later on
dfb["prev_type_name"] = dfb.shift(+1, fill_value=0)["type_name"]
dfb["prev_result_name"] = dfb.shift(+1, fill_value=0)["result_name"]
In [13]:
#Flagging progressive actions with a custom definition: passes not starting in the box, with a length of more than 5 m,
#not backwards or horizontal, and closing the distance from the starting point to the centre of the goal by at least 17.5%
dfb['progressive'] = np.where(
    ((dfb['beginning_distance'] - dfb['end_distance']) / dfb['beginning_distance'] >= 0.175) & (dfb['length'] > 5) & 
    (((dfb['angle_degrees'] >= 0) & (dfb['angle_degrees'] <= 60)) | ((dfb['angle_degrees'] >= 260) & (dfb['angle_degrees'] <= 360))) &
    ~((dfb['start_x_a0'] >= 88.5) & (dfb['start_y_a0'] >= 13.885) & (dfb['start_y_a0'] <= 54.115)),
    True, False)
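As a worked example of these thresholds (a hypothetical pass, not taken from the data), a ball played from (50, 34) to (65, 34), straight towards the goal centre at (105, 34), would be flagged as progressive:

# Hypothetical pass used only to illustrate the rule above
start_dist = np.sqrt((105 - 50)**2 + (34 - 34)**2)           # 55.0 m from the goal centre
end_dist = np.sqrt((105 - 65)**2 + (34 - 34)**2)             # 40.0 m from the goal centre
reduction = (start_dist - end_dist) / start_dist             # 0.27 >= 0.175, distance criterion met
length = abs(end_dist - start_dist)                          # 15.0 m > 5 m, length criterion met
angle_deg = np.degrees(np.arctan2(34 - 34, 65 - 50)) % 360   # 0.0 degrees, inside the forward cone
# start_x = 50 < 88.5, so the pass does not start inside the box either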
In [14]:
# Function to format season ID into a readable format
def format_season_id(season_id):
    # Convert to integer if it's a float
    season_id = int(season_id)
    # Extract the last two digits of the year
    start_year = str(season_id -1)[-2:]
    # Calculate the end year
    end_year = str(season_id)[-2:]
    # Format as 20/21
    formatted_season = f"{start_year}/{end_year}"
    return formatted_season
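Two quick checks of the helper (illustrative values only):

assert format_season_id(2425) == "24/25"
assert format_season_id(2021.0) == "20/21"   # floats are coerced to int first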
In [15]:
#Keeping only passes in both dataframes (as copies, to avoid chained-assignment warnings)
df1a = dfa[dfa["type_name"] == 'pass'].copy()
df1b = dfb[dfb["type_name"] == 'pass'].copy()

#Creating an outcome column (1 = completed, 0 = not) and computing PAx = outcome minus completion probability (xP)
df1b['outcome'] = np.where((df1b["result_name"] == 'success'), True, False)
df1b['PAx'] = df1b['outcome'] - df1b['xP']
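For intuition (made-up xP values): a completed pass with xP = 0.80 contributes PAx = +0.20, while an incomplete pass with the same xP contributes -0.80.

# Illustrative only: PAx = outcome (1/0) minus completion probability (xP)
1 - 0.80    # completed pass  -> PAx = +0.20
0 - 0.80    # incomplete pass -> PAx = -0.80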
In [16]:
#Keeping a slim copy of each filtered dataframe for the merges later on
fa = df1a.filter(items=['game_id', 'original_event_id', 'team_id', 'vaep_value'])
fb = df1b.filter(items=['game_id', 'original_event_id', 'result_name', 'team_id', 'xP', 'PAx'])
In [17]:
#Casting the original event id column to int in all dataframes, as required for the merges
fa['original_event_id'] = fa['original_event_id'].astype(int)
fb['original_event_id'] = fb['original_event_id'].astype(int)
df1a['original_event_id'] = df1a['original_event_id'].astype(int)
df1b['original_event_id'] = df1b['original_event_id'].astype(int)
In [18]:
#Actual merging operations to define our dataframe
df = df1b.merge(fa, how="left")
df = df[df['season_id'].notnull()]
In [19]:
# Apply the function to the 'season_id' column
df['formatted_season'] = df['season_id'].apply(format_season_id)
In [20]:
#Exploring unique positions to create a grouping function
df.position_group.unique()
Out[20]:
array(['CDM', 'AMW', 'WB', 'CB', 'ST', 'GK', 'Undefined'], dtype=object)
In [21]:
#Eliminating non-footed passes, passes at the start of a half, and kick-offs taken after goals (restarts)
df = df[df["bodypart_name"] == 'foot']
df = df[df["time_seconds"] != 0.0]
df = df[~((df["prev_type_name"] == 'shot') & (df["prev_result_name"] == 'success')) & (df["prev_result_name"] != 'owngoal')]
In [22]:
#Selecting the position group and filtering the dataframe
dfx = df[df["position_group"] == 'AMW'].copy()
In [23]:
#Selecting features to consider in the clustering process
dfc = dfx.filter(items=['player_name', 'start_x_a0', 'start_y_a0', 'end_x_a0', 'end_y_a0', 'length', 'angle'])
X = dfc[['start_x_a0', 'start_y_a0', 'end_x_a0', 'end_y_a0', 'length', 'angle']].values
scaled_X = StandardScaler().fit_transform(X)
In [24]:
len(scaled_X)
Out[24]:
803913
In [25]:
print("Starting coherence-focused clustering for football pass data...")
print(f"Input data dimensions: {scaled_X.shape}")
start_total_time = time.time()

# Step 1: Create a representative sample for initial testing
print("\n--- STEP 1: Creating Representative Sample ---")
start_time = time.time()

# Take a larger sample to better represent the full range of pass types
sample_size = min(round(len(scaled_X)*0.5), 50000)  # Lower cap to focus on quality over quantity
print(f"Taking a representative sample of {sample_size} points for initial testing")

# Take a plain random sample without replacement (stratification could be added if needed)
if scaled_X.shape[0] > sample_size:
    sample_indices = np.random.choice(scaled_X.shape[0], size=sample_size, replace=False)
    X_sample = scaled_X[sample_indices]
else:
    X_sample = scaled_X
    sample_indices = np.arange(scaled_X.shape[0])

print(f"Sampling completed in {time.time() - start_time:.2f}s")
Starting coherence-focused clustering for football pass data...
Input data dimensions: (803913, 6)

--- STEP 1: Creating Representative Sample ---
Taking a representative sample of 50000 points for initial testing
Sampling completed in 0.01s
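The sample above is purely random. If one ever wanted to guarantee coverage of passes starting in each third of the pitch, a minimal stratified sketch (not run here, zone boundaries are illustrative) would be:

# Hypothetical stratified alternative: sample proportionally from coarse start-x thirds
zones = np.digitize(X[:, 0], bins=[35, 70])    # 0 = own third, 1 = middle third, 2 = final third
sample_indices = np.concatenate([
    np.random.choice(np.where(zones == z)[0],
                     size=int(sample_size * np.mean(zones == z)), replace=False)
    for z in np.unique(zones)])
X_sample = scaled_X[sample_indices]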
In [26]:
# Step 2: Find optimal number of clusters with focus on coherence
print("\n--- STEP 2: Finding Optimal Number of Clusters Focused on Coherence ---")
start_time = time.time()

# Define a wide range for football pass patterns - large range to find best coherence
min_clusters = 100   # Start higher to avoid under-clustering
max_clusters = 500   # Allow for many clusters to achieve high coherence
step_size = 2       # Larger initial steps to cover the range efficiently
cluster_range = list(range(min_clusters, max_clusters + 1, step_size))

print(f"Testing {len(cluster_range)} different cluster counts from {min_clusters} to {max_clusters}")

# Evaluation function prioritizing within-cluster coherence
def evaluate_kmeans(n_clusters, X):
    start = time.time()
    
    # For coherence, we need stricter parameters
    kmeans = MiniBatchKMeans(
        n_clusters=n_clusters,
        batch_size=1024,  # Smaller batch size for more precise clusters
        init='k-means++',
        max_iter=500,     # More iterations to ensure convergence
        n_init=10,        # More initializations to find cohesive clusters
        random_state=42,
        tol=1e-5          # Tighter tolerance for better convergence
    )
    
    # Fit model
    kmeans.fit(X)
    labels = kmeans.labels_
    
    # Calculate cluster stats
    sizes = np.bincount(labels)
    min_size = sizes.min()
    max_size = sizes.max()
    mean_size = sizes.mean()
    
    # Calculate size distribution stats - smaller clusters allowed for higher coherence
    size_std = np.std(sizes)
    size_cv = size_std / mean_size  # Coefficient of variation
    
    # Calculate inertia (key for coherence - lower means more cohesive clusters)
    inertia = kmeans.inertia_
    
    # Calculate normalized inertia (per cluster to account for different cluster counts)
    normalized_inertia = inertia / n_clusters
    
    # Calculate silhouette score (and others)
    sil_score = None
    ch_score = None
    db_score = None
    
    try:
        # For large datasets, calculate metrics on a subsample
        if X.shape[0] > 10000:
            subsample_size = min(10000, len(X)//10)
            subsample_indices = np.random.choice(X.shape[0], size=subsample_size, replace=False)
            sil_score = silhouette_score(X[subsample_indices], labels[subsample_indices])
            # Skip CH and DB - they tend to favor fewer clusters, which reduces coherence
        else:
            sil_score = silhouette_score(X, labels)
    except Exception as e:
        print(f"Error calculating scores for k={n_clusters}: {str(e)}")
    
    # Calculate a coherence-focused score (higher = better)
    coherence_score = None
    if sil_score is not None:
        # This formula explicitly prioritizes:
        # 1. Silhouette score (measures how similar points are to their own cluster vs other clusters)
        # 2. Lower normalized inertia (tighter clusters)
        # 3. Avoids extremely small clusters (tiny clusters may not be meaningful)
        
        # Normalize the inertia to 0-1 range (estimated based on typical values)
        norm_inertia_factor = 1.0 - min(1.0, normalized_inertia / 1000)
        
        # Check for too-small clusters (penalize if smallest cluster is too tiny)
        small_cluster_penalty = 0.0
        if min_size < 5:  # Penalize clusters with fewer than 5 points
            small_cluster_penalty = 0.2
        
        coherence_score = (
            0.7 * sil_score +              # Silhouette heavily weighted - direct measure of coherence
            0.3 * norm_inertia_factor -    # Lower inertia per cluster = more coherent
            small_cluster_penalty          # Penalty for extremely small clusters
        )
    
    elapsed = time.time() - start
    
    return {
        'n_clusters': n_clusters,
        'model': kmeans,
        'labels': labels,
        'min_cluster_size': min_size,
        'max_cluster_size': max_size,
        'mean_cluster_size': mean_size,
        'size_cv': size_cv,
        'silhouette': sil_score,
        'calinski_harabasz': ch_score,
        'davies_bouldin': db_score,
        'normalized_inertia': normalized_inertia,
        'coherence_score': coherence_score,
        'inertia': inertia,
        'runtime': elapsed
    }

# Parallelize the evaluation - use 10 CPUs as my pc has 12
n_cpus = multiprocessing.cpu_count()
n_jobs = max(1, min(n_cpus - 1, 10))
print(f"Using {n_jobs} CPU cores for parallel processing")

# Run evaluations in parallel
with parallel_backend('loky', n_jobs=n_jobs):
    results = Parallel(verbose=1)(
        delayed(evaluate_kmeans)(k, X_sample) for k in cluster_range
    )

# Create results dictionary
cluster_results = {r['n_clusters']: r for r in results}

print(f"\nInitial evaluation completed in {time.time() - start_time:.2f}s")
--- STEP 2: Finding Optimal Number of Clusters Focused on Coherence ---
Testing 201 different cluster counts from 100 to 500
Using 10 CPU cores for parallel processing
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    3.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   15.1s
Initial evaluation completed in 17.28s
[Parallel(n_jobs=10)]: Done 201 out of 201 | elapsed:   17.3s finished
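To make the weighting inside evaluate_kmeans concrete, this is how the coherence score would come out for one hypothetical evaluation (numbers invented for illustration):

sil_score = 0.17              # silhouette on the subsample
normalized_inertia = 120.0    # inertia / n_clusters
min_size = 40                 # smallest cluster found

norm_inertia_factor = 1.0 - min(1.0, normalized_inertia / 1000)       # 0.88
small_cluster_penalty = 0.2 if min_size < 5 else 0.0                  # no penalty here
0.7 * sil_score + 0.3 * norm_inertia_factor - small_cluster_penalty   # coherence score = 0.383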
In [27]:
# Step 3: Visualize evaluation metrics focused on coherence
print("\n--- STEP 3: Visualizing Coherence Metrics ---")
start_time = time.time()

# Extract results for plotting
cluster_counts = sorted(cluster_results.keys())
silhouette_scores = [cluster_results[k]['silhouette'] for k in cluster_counts]
inertia_values = [cluster_results[k]['inertia'] for k in cluster_counts]
coherence_scores = [cluster_results[k]['coherence_score'] if 'coherence_score' in cluster_results[k] else None for k in cluster_counts]
norm_inertia_values = [cluster_results[k]['normalized_inertia'] if 'normalized_inertia' in cluster_results[k] else None for k in cluster_counts]
min_sizes = [cluster_results[k]['min_cluster_size'] for k in cluster_counts]

# Create subplots focused on coherence metrics
fig, axs = plt.subplots(2, 2, figsize=(16, 14))

# Plot silhouette scores - higher values = more coherent clusters
axs[0, 0].plot(cluster_counts, silhouette_scores, 'o-', color='blue')
axs[0, 0].set_title('Silhouette Score vs. Number of Clusters')
axs[0, 0].set_xlabel('Number of Clusters')
axs[0, 0].set_ylabel('Silhouette Score (higher = more coherent)')
axs[0, 0].grid(True, linestyle='--', alpha=0.7)

# Highlight best silhouette score
if None not in silhouette_scores:
    best_silhouette_idx = np.argmax(silhouette_scores)
    best_k_silhouette = cluster_counts[best_silhouette_idx]
    best_silhouette = silhouette_scores[best_silhouette_idx]
    axs[0, 0].axvline(x=best_k_silhouette, color='red', linestyle='--', alpha=0.7)
    axs[0, 0].plot(best_k_silhouette, best_silhouette, 'ro', ms=10)
    axs[0, 0].annotate(f'Best: k={best_k_silhouette}, score={best_silhouette:.3f}',
                     xy=(best_k_silhouette, best_silhouette),
                     xytext=(best_k_silhouette+10, best_silhouette),
                     arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))

# Plot normalized inertia (per cluster) - lower values = tighter, more coherent clusters
if None not in norm_inertia_values:
    axs[0, 1].plot(cluster_counts, norm_inertia_values, 'o-', color='green')
    axs[0, 1].set_title('Normalized Inertia vs. Number of Clusters')
    axs[0, 1].set_xlabel('Number of Clusters')
    axs[0, 1].set_ylabel('Inertia per Cluster (lower = more coherent)')
    axs[0, 1].grid(True, linestyle='--', alpha=0.7)
    
    # Highlight best normalized inertia (lowest value)
    best_norm_inertia_idx = np.argmin(norm_inertia_values)
    best_k_norm_inertia = cluster_counts[best_norm_inertia_idx]
    best_norm_inertia = norm_inertia_values[best_norm_inertia_idx]
    axs[0, 1].axvline(x=best_k_norm_inertia, color='red', linestyle='--', alpha=0.7)
    axs[0, 1].plot(best_k_norm_inertia, best_norm_inertia, 'ro', ms=10)
    axs[0, 1].annotate(f'Best: k={best_k_norm_inertia}',
                     xy=(best_k_norm_inertia, best_norm_inertia),
                     xytext=(best_k_norm_inertia+10, best_norm_inertia),
                     arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))

# Plot coherence score (combined metric optimized for coherence)
if None not in coherence_scores:
    axs[1, 0].plot(cluster_counts, coherence_scores, 'o-', color='purple')
    axs[1, 0].set_title('Coherence Score vs. Number of Clusters')
    axs[1, 0].set_xlabel('Number of Clusters')
    axs[1, 0].set_ylabel('Coherence Score (higher = better)')
    axs[1, 0].grid(True, linestyle='--', alpha=0.7)
    
    # Highlight best coherence score
    best_coherence_idx = np.argmax(coherence_scores)
    best_k_coherence = cluster_counts[best_coherence_idx]
    best_coherence = coherence_scores[best_coherence_idx]
    axs[1, 0].axvline(x=best_k_coherence, color='red', linestyle='--', alpha=0.7)
    axs[1, 0].plot(best_k_coherence, best_coherence, 'ro', ms=10)
    axs[1, 0].annotate(f'Best: k={best_k_coherence}, score={best_coherence:.3f}',
                     xy=(best_k_coherence, best_coherence),
                     xytext=(best_k_coherence+10, best_coherence),
                     arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))

# Plot smallest cluster size - important for coherence evaluation
axs[1, 1].plot(cluster_counts, min_sizes, 'o-', color='orange')
axs[1, 1].set_title('Smallest Cluster Size vs. Number of Clusters')
axs[1, 1].set_xlabel('Number of Clusters')
axs[1, 1].set_ylabel('Points in Smallest Cluster')
axs[1, 1].grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

print(f"Visualization completed in {time.time() - start_time:.2f}s")
--- STEP 3: Visualizing Coherence Metrics ---
[Figure: silhouette score, normalized inertia, coherence score and smallest cluster size vs. number of clusters]
Visualization completed in 0.41s
In [28]:
# Step 4: Refined search for maximum coherence
print("\n--- STEP 4: Refined Search for Maximum Coherence ---")
start_time = time.time()

# Identify the best clusters based on coherence metrics
best_k_values = []

# Silhouette score is an excellent direct measure of coherence
if None not in silhouette_scores:
    # Get the top 3 silhouette scores
    sil_indices = np.argsort(silhouette_scores)[-3:]
    for idx in sil_indices:
        best_k_values.append(cluster_counts[idx])

# Normalized inertia is another good indicator of coherence
if None not in norm_inertia_values:
    # Get the top 3 normalized inertia values (lowest)
    norm_inertia_indices = np.argsort(norm_inertia_values)[:3]
    for idx in norm_inertia_indices:
        best_k_values.append(cluster_counts[idx])

# Add the best coherence score if available
if None not in coherence_scores:
    best_k_values.append(cluster_counts[best_coherence_idx])

# For coherence, median is a good approach (not biasing toward either extreme)
optimal_k_approx = int(np.median(best_k_values))

print(f"Candidate optimal cluster counts based on coherence: {best_k_values}")
print(f"Median optimal cluster count: {optimal_k_approx}")

# For coherence, we need a narrower search to really find the sweet spot
fine_min = max(50, optimal_k_approx - 15)
fine_max = min(500, optimal_k_approx + 15)
fine_step = 1  # Single-step precision for finding the most coherent clustering
fine_cluster_range = list(range(fine_min, fine_max + 1, fine_step))

print(f"Performing fine-grained coherence search around k={optimal_k_approx}")
print(f"Testing cluster counts: {fine_min} to {fine_max} in steps of {fine_step}")

# Run fine-grained evaluations in parallel
with parallel_backend('loky', n_jobs=n_jobs):
    fine_results = Parallel(verbose=1)(
        delayed(evaluate_kmeans)(k, X_sample) for k in fine_cluster_range if k not in cluster_results
    )

# Add new results to the dictionary
for r in fine_results:
    cluster_results[r['n_clusters']] = r

# Update plotting data for coherence metrics
cluster_counts = sorted(cluster_results.keys())
silhouette_scores = [cluster_results[k]['silhouette'] for k in cluster_counts]
coherence_scores = [cluster_results[k].get('coherence_score', None) for k in cluster_counts]

# Determine the optimal k by prioritizing coherence score, then silhouette
if None not in coherence_scores:
    best_coherence_idx = np.argmax(coherence_scores)
    optimal_k = cluster_counts[best_coherence_idx]
    optimal_score = coherence_scores[best_coherence_idx]
    print(f"Using coherence score to determine optimal k={optimal_k}, score={optimal_score:.3f}")
else:
    # Fall back to silhouette if coherence score is unavailable
    best_silhouette_idx = np.argmax(silhouette_scores)
    optimal_k = cluster_counts[best_silhouette_idx]
    optimal_score = silhouette_scores[best_silhouette_idx]
    print(f"Using silhouette score to determine optimal k={optimal_k}, score={optimal_score:.3f}")

print(f"\nFine-grained search completed in {time.time() - start_time:.2f}s")
print(f"Optimal number of clusters for maximum coherence: k={optimal_k}")

if cluster_results[optimal_k]['silhouette'] is not None:
    print(f"Silhouette score at k={optimal_k}: {cluster_results[optimal_k]['silhouette']:.3f}")
if 'coherence_score' in cluster_results[optimal_k] and cluster_results[optimal_k]['coherence_score'] is not None:
    print(f"Coherence score at k={optimal_k}: {cluster_results[optimal_k]['coherence_score']:.3f}")
--- STEP 4: Refined Search for Maximum Coherence ---
Candidate optimal cluster counts based on coherence: [108, 114, 100, 498, 496, 500, 326]
Median optimal cluster count: 326
Performing fine-grained coherence search around k=326
Testing cluster counts: 311 to 341 in steps of 1
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
Using coherence score to determine optimal k=326, score=0.405

Fine-grained search completed in 1.75s
Optimal number of clusters for maximum coherence: k=326
Silhouette score at k=326: 0.169
Coherence score at k=326: 0.405
[Parallel(n_jobs=10)]: Done  14 out of  16 | elapsed:    1.6s remaining:    0.2s
[Parallel(n_jobs=10)]: Done  16 out of  16 | elapsed:    1.6s finished
In [29]:
# Step 5: Visualize final results
print("\n--- STEP 5: Visualizing Final Results ---")
start_time = time.time()

# Create updated plots with fine-grained results
fig, axs = plt.subplots(2, 2, figsize=(15, 12))

# Sort data by cluster count for proper plotting
sorted_indices = np.argsort(cluster_counts)
sorted_cluster_counts = np.array(cluster_counts)[sorted_indices]
sorted_silhouette = np.array(silhouette_scores)[sorted_indices]

# Plot silhouette scores with zoomed region
axs[0, 0].plot(sorted_cluster_counts, sorted_silhouette, 'o-', color='blue', alpha=0.4)
axs[0, 0].set_title('Silhouette Score vs. Number of Clusters')
axs[0, 0].set_xlabel('Number of Clusters')
axs[0, 0].set_ylabel('Silhouette Score (higher = more coherent)')
axs[0, 0].grid(True, linestyle='--', alpha=0.7)
axs[0, 0].axvline(x=optimal_k, color='red', linestyle='--', alpha=0.7)

# Zoom in on the region around optimal_k for silhouette
zoom_min = max(0, np.where(sorted_cluster_counts >= optimal_k - 50)[0][0])
zoom_max = min(len(sorted_cluster_counts) - 1, np.where(sorted_cluster_counts <= optimal_k + 50)[0][-1])
zoom_x = sorted_cluster_counts[zoom_min:zoom_max+1]
zoom_y = sorted_silhouette[zoom_min:zoom_max+1]
axs[0, 1].plot(zoom_x, zoom_y, 'o-', color='blue')
axs[0, 1].set_title(f'Silhouette Score (Zoomed on k={optimal_k})')
axs[0, 1].set_xlabel('Number of Clusters')
axs[0, 1].set_ylabel('Silhouette Score')
axs[0, 1].grid(True, linestyle='--', alpha=0.7)
axs[0, 1].axvline(x=optimal_k, color='red', linestyle='--', alpha=0.7)
axs[0, 1].plot(optimal_k, cluster_results[optimal_k]['silhouette'], 'ro', ms=10)

# Plot coherence scores if available
valid_coherence_scores = [score for score in coherence_scores if score is not None]
if len(valid_coherence_scores) > 0:
    # Filter out None values and get corresponding cluster counts
    valid_indices = [i for i, score in enumerate(coherence_scores) if score is not None]
    valid_counts = [cluster_counts[i] for i in valid_indices]
    sorted_c_indices = np.argsort(valid_counts)
    
    # Sort coherence scores by cluster count
    sorted_c_counts = np.array(valid_counts)[sorted_c_indices]
    sorted_coherence = np.array(valid_coherence_scores)[sorted_c_indices]
    
    # Plot full range
    axs[1, 0].plot(sorted_c_counts, sorted_coherence, 'o-', color='magenta', alpha=0.4)
    axs[1, 0].set_title('Coherence Score vs. Number of Clusters')
    axs[1, 0].set_xlabel('Number of Clusters')
    axs[1, 0].set_ylabel('Coherence Score (higher = better)')
    axs[1, 0].grid(True, linestyle='--', alpha=0.7)
    axs[1, 0].axvline(x=optimal_k, color='red', linestyle='--', alpha=0.7)
    
    # Try to plot zoomed view if we have enough data points around optimal_k
    if 'coherence_score' in cluster_results[optimal_k] and cluster_results[optimal_k]['coherence_score'] is not None:
        # Find coherence scores near the optimal_k
        zoom_coherence_counts = [c for c in sorted_c_counts if optimal_k-50 <= c <= optimal_k+50]
        if len(zoom_coherence_counts) > 0:
            zoom_coherence_indices = [i for i, c in enumerate(sorted_c_counts) if c in zoom_coherence_counts]
            zoom_c_x = sorted_c_counts[zoom_coherence_indices]
            zoom_c_y = sorted_coherence[zoom_coherence_indices]
            
            axs[1, 1].plot(zoom_c_x, zoom_c_y, 'o-', color='magenta')
            axs[1, 1].set_title(f'Coherence Score (Zoomed on k={optimal_k})')
            axs[1, 1].set_xlabel('Number of Clusters')
            axs[1, 1].set_ylabel('Coherence Score')
            axs[1, 1].grid(True, linestyle='--', alpha=0.7)
            axs[1, 1].axvline(x=optimal_k, color='red', linestyle='--', alpha=0.7)
            axs[1, 1].plot(optimal_k, cluster_results[optimal_k]['coherence_score'], 'ro', ms=10)
else:
    print("No valid coherence scores available for plotting")

plt.tight_layout()
plt.show()

print(f"Visualization completed in {time.time() - start_time:.2f}s")
--- STEP 5: Visualizing Final Results ---
[Figure: silhouette and coherence scores vs. number of clusters, full range and zoomed around the optimal k]
Visualization completed in 0.21s
In [30]:
# Step 6: Apply best model to full dataset
print(f"\n--- STEP 6: Clustering Full Dataset with k={optimal_k} ---")
start_time = time.time()

print(f"Applying MiniBatchKMeans with {optimal_k} clusters to the full football pass dataset...")

# Create final model with optimal number of clusters
final_kmeans = MiniBatchKMeans(
    n_clusters=optimal_k,
    batch_size=min(4096, len(scaled_X)//20),  # Adjust batch size based on dataset size
    init='k-means++',
    max_iter=300,
    n_init=5,
    random_state=42
)

# Process very large datasets in batches
batch_size = 50000
if scaled_X.shape[0] > 1000000:  # For extremely large datasets
    print("Using partial_fit for extremely large dataset...")
    # Initialize centroids with a sample
    final_kmeans.partial_fit(X_sample)
    
    # Process remaining data in batches
    for i in tqdm(range(0, scaled_X.shape[0], batch_size)):
        end = min(i + batch_size, scaled_X.shape[0])
        if i not in sample_indices:  # Skip points already used for initialization
            final_kmeans.partial_fit(scaled_X[i:end])
    
    # Get labels for all points
    print("Predicting cluster labels for all points...")
    labels = np.zeros(scaled_X.shape[0], dtype=int)
    for i in tqdm(range(0, scaled_X.shape[0], batch_size)):
        end = min(i + batch_size, scaled_X.shape[0])
        labels[i:end] = final_kmeans.predict(scaled_X[i:end])
else:
    # For moderately large datasets, fit directly
    print("Fitting model on full dataset...")
    final_kmeans.fit(scaled_X)
    labels = final_kmeans.labels_

print(f"Full dataset clustering completed in {time.time() - start_time:.2f}s")
--- STEP 6: Clustering Full Dataset with k=326 ---
Applying MiniBatchKMeans with 326 clusters to the full football pass dataset...
Fitting model on full dataset...
Full dataset clustering completed in 1.86s
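Once final_kmeans is fitted, assigning future passes to these clusters only needs the same six features and the same scaling. A sketch, where new_passes is a hypothetical dataframe not built in this notebook:

feature_cols = ['start_x_a0', 'start_y_a0', 'end_x_a0', 'end_y_a0', 'length', 'angle']
scaler = StandardScaler().fit(X)   # refit on the original (unscaled) feature matrix
# new_labels = final_kmeans.predict(scaler.transform(new_passes[feature_cols].values))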
In [31]:
# Step 7: Analyze and interpret football pass clusters
print("\n--- STEP 7: Analyzing Football Pass Cluster Results ---")

# Calculate cluster sizes
cluster_sizes = np.bincount(labels)
sorted_indices = np.argsort(cluster_sizes)[::-1]  # Sort by size (descending)

print(f"Number of football pass clusters: {optimal_k}")
print(f"Largest cluster: {cluster_sizes.max()} passes ({cluster_sizes.max()/len(labels)*100:.2f}%)")
print(f"Smallest cluster: {cluster_sizes.min()} passes ({cluster_sizes.min()/len(labels)*100:.2f}%)")
print(f"Average cluster size: {cluster_sizes.mean():.1f} passes")
print(f"Median cluster size: {np.median(cluster_sizes):.1f} passes")

# Display info about top clusters
print("\nLargest football pass clusters:")
for i, idx in enumerate(sorted_indices[:10]):
    size = cluster_sizes[idx]
    percentage = size / len(labels) * 100
    print(f"Cluster {idx}: {size} passes ({percentage:.2f}%)")

if len(sorted_indices) > 10:
    print(f"... and {len(sorted_indices)-10} more pass clusters")

# Visualize cluster size distribution
plt.figure(figsize=(12, 6))
plt.bar(range(len(cluster_sizes)), cluster_sizes[sorted_indices], alpha=0.7)
plt.title(f'Football Pass Cluster Size Distribution (k={optimal_k})')
plt.xlabel('Cluster Rank (by size)')
plt.ylabel('Number of Passes')
plt.grid(True, linestyle='--', alpha=0.5, axis='y')
plt.tight_layout()
plt.show()

# Calculate total runtime
total_runtime = time.time() - start_total_time
print(f"\nTotal football pass clustering pipeline runtime: {total_runtime:.2f} seconds")

# Assign cluster labels to original dataframe
dfx['cluster'] = labels

print("\nFootball pass clustering complete!")
print(f"Cluster labels have been added to 'dfx' dataframe as 'cluster' column")
print(f"Optimal number of clusters for football passing analysis: {optimal_k}")
--- STEP 7: Analyzing Football Pass Cluster Results ---
Number of football pass clusters: 326
Largest cluster: 7208 passes (0.90%)
Smallest cluster: 327 passes (0.04%)
Average cluster size: 2466.0 passes
Median cluster size: 2362.0 passes

Largest football pass clusters:
Cluster 0: 7208 passes (0.90%)
Cluster 58: 6792 passes (0.84%)
Cluster 14: 6288 passes (0.78%)
Cluster 67: 6067 passes (0.75%)
Cluster 4: 5964 passes (0.74%)
Cluster 209: 5682 passes (0.71%)
Cluster 241: 5636 passes (0.70%)
Cluster 202: 5517 passes (0.69%)
Cluster 217: 5440 passes (0.68%)
Cluster 200: 5427 passes (0.68%)
... and 316 more pass clusters
[Figure: football pass cluster size distribution, clusters ranked by size]
Total football pass clustering pipeline runtime: 21.71 seconds

Football pass clustering complete!
Cluster labels have been added to 'dfx' dataframe as 'cluster' column
Optimal number of clusters for football passing analysis: 326
In [32]:
#Finding the 5th and 95th percentiles of pass ATOMIC-VAEP to set the colormap range in the visualizations
x_min = dfx.vaep_value.quantile(0.05)
x_max = dfx.vaep_value.quantile(0.95)
In [33]:
#Get the whole list of players in the data
playerlist = dfx['player_name'].unique().tolist()
cleaned_playerlist = [name for name in playerlist if pd.notna(name)]
cleaned_playerlist.sort()
In [34]:
from IPython.display import display, HTML

# Generate the HTML dropdown to easily search for players
options_html = ''.join([f'<option value="{name}">{name}</option>' for name in cleaned_playerlist])

dropdown_html = f"""
<input list="players" id="dropdown" oninput="handleInput()" placeholder="Choose Someone">
<datalist id="players">
    {options_html}
</datalist>
<p id="output"></p>
<script>
function handleInput() {{
    var input = document.getElementById("dropdown").value;
    var output = document.getElementById("output");
    output.innerHTML = "Selected: " + input;
}}
</script>
"""

# Display the dropdown
display(HTML(dropdown_html))

In [35]:
#Selecting player and filtering the dataframe
dfp = dfx[dfx["player_name"] == 'Rayan Cherki']
In [36]:
#Defining the function that draws a pitch and plots the selected cluster's passes (called once per subplot)
def plot_passes(ax, cluster_name):

    plot_df = dfx[(dfx['cluster'] == cluster_name)]

    pitch = Pitch(
            pitch_type='custom',
            pitch_width=68,
            pitch_length=105,
            goal_type='box',
            linewidth=2,
            line_color='black',
            half=False)
    pitch.draw(ax = ax)

    cmap = matplotlib.colormaps.get_cmap('afmhot_r')
    vmin = x_min  # or set manually, e.g., vmin = 0
    vmax = x_max  # or set manually, e.g., vmax = 1
    # Normalize the data to the specified range
    norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
    # Apply the colormap and normalization to this cluster's passes
    colors = cmap(norm(plot_df['vaep_value']))
    
    pitch.arrows(plot_df.start_x_a0, plot_df.start_y_a0, plot_df.end_x_a0, plot_df.end_y_a0, width=2.5, alpha=0.8,
             headwidth=10, headlength=8, color=colors, label='passes', ax=ax)
    
    ax.annotate(
        xy=(50,72.5),
        text=f"Passes: {order['attempted_passes'].iloc[index]}  | Expected - Actual Completion % (PAx100) : {(order['PAx100'].iloc[index]).round(2)}",
        size=20,
        color='black',
        ha='center',
        va='center',
        weight='bold',
        annotation_clip=False
    )

    # Annotation with data coordinates and offset points
    ax.annotate(
        text="", 
        xy=(65, 69.5),  # Target point on the axes
        xytext=(-200, 0),  # Offset of the text relative to the target point
        textcoords="offset points", 
        size=27, 
        color="#000000", 
        arrowprops=dict(arrowstyle="-|>", shrinkA=0, color="black", linewidth=2)
    )

    # Create and customize the colorbar
    cbar_ax = fig.add_axes([0.9, 0.25, 0.02, 0.5])  # Adjust position and size of colorbar
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])  # Set the array for the ScalarMappable
    cbar = plt.colorbar(sm, cax=cbar_ax)

    # Add title to the colorbar
    cbar.ax.set_title('ATOMIC-VAEP', fontsize=20, pad=20, rotation=0, loc='center')

    # Adjust font size of the colorbar tick labels
    cbar.ax.tick_params(labelsize=15)

    return ax
In [37]:
# Grouping the DataFrame by 'cluster' and counting successful passes
order_teams0 = (dfx.groupby(["cluster"], observed=True)['outcome'].sum().reset_index(name='successful_passes'))

# Grouping the DataFrame by 'cluster' and total number of passes
order_teams1 = (dfx.groupby(["cluster"], observed=True)['outcome'].count().reset_index(name='attempted_passes'))

# Grouping the DataFrame by 'cluster' and total xP
order_teams2 = (dfx.groupby(["cluster"], observed=True)['xP'].sum().reset_index())

# Grouping the DataFrame by 'cluster' and median vaep value inside the subset
order_teams3 = (dfx.groupby(["cluster"], observed=True)['vaep_value'].median().reset_index())


# Merging the four grouped DataFrames on 'cluster'
order = (order_teams0
                .merge(order_teams1, left_on='cluster', right_on='cluster')
                .merge(order_teams2, left_on='cluster', right_on='cluster')
                .merge(order_teams3, left_on='cluster', right_on='cluster'))

# Calculating the success %, expected % and the difference for each cluster
order['success_pct'] = ((order['successful_passes'] * 100) / order['attempted_passes'])
order['xP_pct'] = (order['xP'] / order['attempted_passes']) * 100
order['PAx100'] = (((order['successful_passes'] - order['xP']) / order['attempted_passes']) * 100)

# Sorting the DataFrame by median ATOMIC-VAEP in descending order and resetting the index
order = (order.sort_values(by=['vaep_value'], ascending=False).reset_index(drop=True))

order
Out[37]:
cluster successful_passes attempted_passes xP vaep_value success_pct xP_pct PAx100
0 236 1946 2590 1813.219668 0.018239 75.135135 70.008481 5.126654
1 94 1787 2416 1828.837343 0.015238 73.965232 75.696910 -1.731678
2 10 2312 3250 2464.107256 0.014835 71.138462 75.818685 -4.680223
3 237 1585 2682 2068.813586 0.014308 59.097688 77.136972 -18.039284
4 113 2834 3642 2584.631934 0.014041 77.814388 70.967379 6.847009
... ... ... ... ... ... ... ... ...
321 180 2239 2366 1829.358303 -0.009789 94.632291 77.318610 17.313681
322 321 1904 2054 1567.126787 -0.010228 92.697176 76.296338 16.400838
323 308 1994 2140 1610.955580 -0.010957 93.177570 75.278298 17.899272
324 13 22 501 231.886987 -0.015254 4.391218 46.284828 -41.893610
325 105 37 587 284.406141 -0.015378 6.303237 48.450791 -42.147554

326 rows × 8 columns
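Note that PAx100 in the table above is simply the gap between the actual and the expected completion rate, which gives a quick consistency check:

assert np.allclose(order['PAx100'], order['success_pct'] - order['xP_pct'])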

In [38]:
#Creating the figure and axes and calling the plotting function for each subplot
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(40,25), dpi=200)
axs = np.array(axs)

for index, ax in enumerate(axs.reshape(-1)):
    plot_passes(ax, order['cluster'].iloc[index])

# Adjust the space between subplots
plt.subplots_adjust(wspace=0.001, hspace=0.0001)

# Adding some text to better understand what the plot represents
position_label = ', '.join(dfx['position_group'].unique())
plt.text(0.5, 0.93, f'Top 4 Most Valuable Pass Clusters by {position_label} in {season}', 
         transform=fig.transFigure, horizontalalignment='center', fontsize=45)

plt.text(0.5, 0.91, 'Ordered by median ATOMIC-VAEP per pass', 
         transform=fig.transFigure, horizontalalignment='center', fontsize=25)

fig.suptitle(f'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com', 
             x=0.5, y=0.097, fontsize=20, verticalalignment='bottom')
Out[38]:
Text(0.5, 0.097, 'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com')
[Figure: 2x2 grid of pitches with the top 4 pass clusters, arrows coloured by ATOMIC-VAEP]
In [39]:
#Defining the function that draws a pitch and plots the selected cluster's passes for the chosen player (called once per subplot)
def plot_passes_player(ax, cluster_name):

    plot_df = dfp[(dfp['cluster'] == cluster_name)]

    pitch = Pitch(
            pitch_type='custom',
            pitch_width=68,
            pitch_length=105,
            goal_type='box',
            linewidth=2,
            line_color='black',
            half=False)
    pitch.draw(ax = ax)

    cmap = matplotlib.colormaps.get_cmap('afmhot_r')
    vmin = x_min  # or set manually, e.g., vmin = 0
    vmax = x_max  # or set manually, e.g., vmax = 1
    # Normalize the data to the specified range
    norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
    # Apply the colormap and normalization to this cluster's passes
    colors = cmap(norm(plot_df['vaep_value']))
    
    pitch.arrows(plot_df.start_x_a0, plot_df.start_y_a0, plot_df.end_x_a0, plot_df.end_y_a0, width=2.5, alpha=0.8,
             headwidth=10, headlength=8, color=colors, label='passes', ax=ax)
    
    ax.annotate(
        xy=(50,72.5),
        text=f"Passes: {order_player['attempted_passes'].iloc[index]}  | Expected - Actual Completion % (PAx100) : {(order_player['PAx100'].iloc[index]).round(2)}",
        size=20,
        color='black',
        ha='center',
        va='center',
        weight='bold',
        annotation_clip=False
    )

    # Annotation with data coordinates and offset points
    ax.annotate(
        text="", 
        xy=(65, 69.5),  # Target point on the axes
        xytext=(-200, 0),  # Offset of the text relative to the target point
        textcoords="offset points", 
        size=27, 
        color="#000000", 
        arrowprops=dict(arrowstyle="-|>", shrinkA=0, color="black", linewidth=2)
    )

    # Create and customize the colorbar
    cbar_ax = fig.add_axes([0.9, 0.25, 0.02, 0.5])  # Adjust position and size of colorbar
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])  # Set the array for the ScalarMappable
    cbar = plt.colorbar(sm, cax=cbar_ax)

    # Add title to the colorbar
    cbar.ax.set_title('ATOMIC-VAEP', fontsize=20, pad=20, rotation=0, loc='center')

    # Adjust font size of the colorbar tick labels
    cbar.ax.tick_params(labelsize=15)

    return ax
In [40]:
# Grouping the DataFrame by 'cluster' and counting successful passes
order_player0 = (dfp.groupby(["cluster"], observed=True)['outcome'].sum().reset_index(name='successful_passes'))

# Grouping the DataFrame by 'cluster' and total number of passes
order_player1 = (dfp.groupby(["cluster"], observed=True)['outcome'].count().reset_index(name='attempted_passes'))

# Grouping the DataFrame by 'cluster' and total xP
order_player2 = (dfp.groupby(["cluster"], observed=True)['xP'].sum().reset_index())

# Grouping the DataFrame by 'cluster' and median vaep value inside the subset
order_player3 = (dfp.groupby(["cluster"], observed=True)['vaep_value'].median().reset_index())


# Merging the four grouped DataFrames on 'cluster'
order_player = (order_player0
                .merge(order_player1, left_on='cluster', right_on='cluster')
                .merge(order_player2, left_on='cluster', right_on='cluster')
                .merge(order_player3, left_on='cluster', right_on='cluster'))

# Calculating the success %, expected % and difference for each cluster
order_player['success_pct'] = ((order_player['successful_passes'] * 100) / order_player['attempted_passes'])
order_player['xP_pct'] = (order_player['xP'] / order_player['attempted_passes']) * 100
order_player['PAx100'] = (((order_player['successful_passes'] - order_player['xP']) / order_player['attempted_passes']) * 100)

# Sorting the DataFrame by attempted passes in descending order and resetting the index
order_player = (order_player.sort_values(by=['attempted_passes'], ascending=False).reset_index(drop=True))

order_player
Out[40]:
cluster successful_passes attempted_passes xP vaep_value success_pct xP_pct PAx100
0 237 17 22 16.915477 0.013659 77.272727 76.888530 0.384197
1 10 16 20 15.702311 0.012405 80.000000 78.511555 1.488445
2 113 13 18 12.316653 0.009773 72.222222 68.425849 3.796373
3 29 18 18 15.090296 -0.006214 100.000000 83.834977 16.165023
4 58 18 18 15.563379 -0.002910 100.000000 86.463219 13.536781
... ... ... ... ... ... ... ... ...
291 25 1 1 0.886431 -0.002829 100.000000 88.643140 11.356860
292 24 1 1 0.750909 -0.001206 100.000000 75.090945 24.909055
293 276 0 1 0.818571 0.000842 0.000000 81.857090 -81.857090
294 277 1 1 0.882073 -0.000296 100.000000 88.207290 11.792710
295 250 1 1 0.859577 0.002061 100.000000 85.957740 14.042260

296 rows × 8 columns

In [41]:
#Creating the figure and axes and calling the player plotting function for each subplot
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(40,25), dpi=200)
axs = np.array(axs)

for index, ax in enumerate(axs.reshape(-1)):
    plot_passes_player(ax, order_player['cluster'].iloc[index])

# Adjust the space between subplots
plt.subplots_adjust(wspace=0.001, hspace=0.0001)

# Adding text to the plot
player_label = ', '.join(dfp['player_name'].unique())
plt.text(0.5, 0.93, f'{player_label} - Top 4 Most Performed Passing Patterns', 
         transform=fig.transFigure, horizontalalignment='center', fontsize=45)

plt.text(0.5, 0.91, f"Ordered by pass frequency | {', '.join(dfp['competition_id'].unique())} {', '.join(dfp['formatted_season'].unique())}", 
         transform=fig.transFigure, horizontalalignment='center', fontsize=25)

fig.suptitle(f'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com', 
             x=0.5, y=0.097, fontsize=20, verticalalignment='bottom')

#Adding the logo of the team
fotmob_url = "https://images.fotmob.com/image_resources/logo/teamlogo/"
logo_ax = fig.add_axes([.2, 0.9, 0.08, 0.08], zorder=1)
club_icon = Image.open(urllib.request.urlopen(f"{fotmob_url}{dfp['fotmob_id'].iloc[0]}.png"))
logo_ax.imshow(club_icon)
logo_ax.axis("off")

# Save the figure with adjusted face color and transparency
plt.savefig(f'{player_label}-passingclusters-{season}.png',
            dpi=500, facecolor="#D7D1CF", bbox_inches="tight", transparent=True)
[Figure: 2x2 grid of pitches with Rayan Cherki's top 4 pass clusters, arrows coloured by ATOMIC-VAEP]
In [42]:
# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
    try:
        # Convert to string in case it's a float (e.g., NaN)
        time_str = str(time_str)
        # Split the time string into minutes and seconds
        minutes, seconds = map(int, time_str.split(':'))
        # Convert total time to minutes (seconds converted to fraction of minutes)
        return minutes + seconds / 60
    except (ValueError, AttributeError):
        # Handle cases where the conversion fails (e.g., NaN or bad format)
        return 0  # or use `np.nan` if you prefer to mark as missing

# Apply the conversion function to the 'minutes_played' column
players['minutes_played_converted'] = players['minutes_played'].apply(convert_to_minutes)
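Two illustrative checks of the conversion (invented values):

assert convert_to_minutes("90:30") == 90.5       # 90 minutes and 30 seconds
assert convert_to_minutes(float("nan")) == 0     # malformed or missing entries fall back to 0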
In [43]:
#Merge the player dataframe with the team info dataframe
players0 = players.merge(fotmob)
In [44]:
#Creating a table with minutes played in season by players
mp = players0.groupby(["player_name", "team_name"])["minutes_played_converted"].sum().reset_index(name='minutes_played')
In [45]:
# Sorting the DataFrame by median ATOMIC-VAEP in descending order to keep the 10% most valuable clusters
order = (order.sort_values(by=['vaep_value'], ascending=False).reset_index(drop=True))

#Listing those top-10% clusters (with optimal_k = 326 this keeps the 32 highest-VAEP clusters)
most_valuable = order.head(int(optimal_k/10)).cluster.unique()
In [46]:
#Keeping the 10% of most valuable clusters
A = dfx[dfx['cluster'].isin(most_valuable)]
In [47]:
# Grouping by player and team and counting successful passes
A0 = (A.groupby(["player_name", "team_name"], observed=True)['outcome'].sum().reset_index(name='successful_passes'))

# Grouping by player and team and counting attempted passes
A1 = (A.groupby(["player_name", "team_name"], observed=True)['outcome'].count().reset_index(name='attempted_passes'))

# Grouping by player and team and summing xP
A2 = (A.groupby(["player_name", "team_name"], observed=True)['xP'].sum().reset_index())

# Grouping by player and team and summing ATOMIC-VAEP
A3 = (A.groupby(["player_name", "team_name"], observed=True)['vaep_value'].sum().reset_index())

# Merging the four grouped DataFrames on player and team
AA = (A0
      .merge(A1, left_on=["player_name", "team_name"], right_on=["player_name", "team_name"])
      .merge(A2, left_on=["player_name", "team_name"], right_on=["player_name", "team_name"])
      .merge(A3, left_on=["player_name", "team_name"], right_on=["player_name", "team_name"]))

# Calculating the success %, expected % and PAx100 for each player
AA['success_pct'] = ((AA['successful_passes'] * 100) / AA['attempted_passes'])
AA['xP_pct'] = (AA['xP'] / AA['attempted_passes']) * 100
AA['PAx100'] = (((AA['successful_passes'] - AA['xP']) / AA['attempted_passes']) * 100).round(2)
In [48]:
# Computing the median game duration (in minutes) to use as the per-match normalization factor ("per 98")
minutesadj = players0.groupby(["game_id", "game_duration"], observed=True)['is_starter'].count().reset_index(name='is_starter')

# Apply the conversion function to the 'game_duration' column
minutesadj['game_duration_converted'] = minutesadj['game_duration'].apply(convert_to_minutes)

minutesadj = minutesadj.game_duration_converted.median()
minutesadj
Out[48]:
98.43333333333334
In [49]:
#Merging with minutes played for players and calculating the final metrics we want
AA0 = AA.merge(mp, how='left')
AA0["passes_98"] = ((AA0.attempted_passes * minutesadj) / AA0.minutes_played).round(2)
AA0["vaep_value_98"] = ((AA0.vaep_value * minutesadj) / AA0.minutes_played).round(3)
In [50]:
#Keeping players with at least 500 minutes
AA1 = AA0[AA0['minutes_played'] >= 500]

#Filter the columns we want to keep
AA2 = AA1[['player_name', 'team_name', 'PAx100', 'passes_98', 'vaep_value_98']]
In [51]:
#Selecting the metric we want to analyze and visualize
metric = 'vaep_value_98'
In [52]:
# Selecting top 10
AAFa = (AA2.sort_values(by=[metric], ascending=False).reset_index(drop=True)).head(10)

#Re-sorting in ascending order because the table is drawn from the bottom up
AAF = (AAFa.sort_values(by=[metric], ascending=True).reset_index(drop=True))

AAF
Out[52]:
player_name team_name PAx100 passes_98 vaep_value_98
0 Abdellah Zoubir Qarabag FK -4.44 7.27 0.101
1 Michael Olise Bayern -5.58 8.43 0.103
2 Lazar Samardzic Atalanta -6.06 8.17 0.103
3 Raheem Sterling Arsenal -14.91 5.53 0.104
4 Nicolas Kühn Celtic -13.72 7.35 0.110
5 Rayan Cherki Lyon -5.21 9.82 0.115
6 Martin Ødegaard Arsenal -8.99 8.90 0.117
7 Luciano Acosta FC Cincinnati -8.22 11.07 0.120
8 Lionel Messi Inter Miami CF -13.39 12.42 0.142
9 Luke McCowan Celtic -2.05 8.65 0.174
In [53]:
#Setting up the figure, the axes and the figure dimensions so everything fits nicely
fig = plt.figure(figsize=(1800/500, 1800/500), dpi=500)
ax = plt.subplot()

ncols = AAF.shape[1]
nrows = AAF.shape[0]

ax.set_xlim(0, ncols + 1)
ax.set_ylim(0, nrows + 1)

positions = [0.1, 3.4, 4.8]
columns = ['player_name', 'team_name', metric]

#Styling each column's text (alignment, size, colour, font) depending on the column
for i in range(nrows):
    for j, column in enumerate(columns):
        if j == 0:
            ha = 'left'
        else:
            ha = 'center'
        if column == metric:
            fontsize = 10
            color = '#FFFFFF'
            fontname = fe_semibold.name
        elif column == 'team_name':
            fontsize = 4  
            color = '#4E616C' 
            fontname = fe_regular.name
        else:
            fontsize = 11
            color = '#000000' 
            fontname = fe_semibold.name
        ax.annotate(
            xy=(positions[j], i + .5), text=str(AAF[column].iloc[i]), ha=ha, va='center', fontsize=fontsize, color=color, fontname=fontname)

# Add dividing lines and color for the column to highlight
ax.plot([ax.get_xlim()[0], ax.get_xlim()[1]], [nrows, nrows], lw=1.5, color='black', marker='', zorder=4)
ax.plot([ax.get_xlim()[0], ax.get_xlim()[1]], [0, 0], lw=1.5, color='black', marker='', zorder=4)
for x in range(1, nrows):
    ax.plot([ax.get_xlim()[0], ax.get_xlim()[1]], [x, x], lw=0.5, color='gray', ls='-', zorder=3 , marker='')
    
    ax.fill_between(x=[4.2, 5.4], y1=nrows, y2=0, color='#D32F2F', alpha=0.5, ec='None')

# Adding titles and notes
if metric == 'vaep_value_98':
    plt.text(0.5, 0.86, 'ATOMIC VAEP added per 98', transform=fig.transFigure,
         horizontalalignment='center', fontsize=12, fontfamily='SourceSansPro-SemiBold')
elif metric == 'PAx100':
    plt.text(0.5, 0.86, 'Passes above expectations %', transform=fig.transFigure,
         horizontalalignment='center', fontsize=12, fontfamily='SourceSansPro-SemiBold')
elif metric == 'passes_98':
    plt.text(0.5, 0.86, 'Passes played per 98', transform=fig.transFigure,
         horizontalalignment='center', fontsize=12, fontfamily='SourceSansPro-SemiBold')

plt.text(0.5, 0.83, f'Attacking midfielders & wingers | Minimum 500 minutes played | Passes from the 10% most valuable clusters',
         transform=fig.transFigure, horizontalalignment='center', fontsize = 4, color = '#4E616C')
fig.suptitle(f'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com',
             horizontalalignment='center', x = 0.5, y = 0.09, fontsize=3, color = "#000000")

#Saving and showing
ax.set_axis_off()
plt.savefig(f'TOP_FOR_CLUSTERS.png', dpi=500, facecolor = "#D7D1CF", bbox_inches = "tight", transparent = True)
plt.show()
[Figure: top-10 player table for the selected metric]