In [155]:
# Import necessary libraries for data manipulation, visualization, and analysis
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")
In [2]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
    name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
    name='SourceSansPro-SemiBold'
)

# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)

# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
# Load datasets from CSV files
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
xG = pd.read_csv("xGactions.csv", index_col = 0)

players0 = pd.read_csv("players2021.csv", index_col = 0)
players1 = pd.read_csv("players2122.csv", index_col = 0)
players2 = pd.read_csv("players2223.csv", index_col = 0)
players3 = pd.read_csv("players2324.csv", index_col = 0)
players4 = pd.read_csv("players2425.csv", index_col = 0)

games0 = pd.read_csv("games2021.csv", index_col = 0)
games1 = pd.read_csv("games2122.csv", index_col = 0)
games2 = pd.read_csv("games2223.csv", index_col = 0)
games3 = pd.read_csv("games2324.csv", index_col = 0)
games4 = pd.read_csv("games2425.csv", index_col = 0)

actions0 = pd.read_csv("actions2021.csv", index_col = 0)
actions1 = pd.read_csv("actions2122.csv", index_col = 0)
actions2 = pd.read_csv("actions2223.csv", index_col = 0)
actions3 = pd.read_csv("actions2324.csv", index_col = 0)
actions4 = pd.read_csv("actions2425.csv", index_col = 0)
In [4]:
#Concatenate all the necessary files
players = pd.concat([players0, players1, players2, players3, players4])
games = pd.concat([games0, games1, games2, games3, games4])
actions = pd.concat([actions0, actions1, actions2, actions3, actions4])
In [5]:
#Resetting the action_ids
actions.drop(columns=['action_id'], inplace=True)
actions.reset_index(drop=True, inplace=True)
actions.reset_index(inplace=True)
actions.rename(columns={'index': 'action_id'}, inplace=True)
In [6]:
# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
    try:
        # Convert to string in case it's a float (e.g., NaN)
        time_str = str(time_str)
        # Split the time string into minutes and seconds
        minutes, seconds = map(int, time_str.split(':'))
        # Convert total time to minutes (seconds converted to fraction of minutes)
        return minutes + seconds / 60
    except (ValueError, AttributeError):
        # Handle cases where the conversion fails (e.g., NaN or bad format)
        return 0  # or use `np.nan` if you prefer to mark as missing

# Apply the conversion function to the 'minutes_played' column
players['minutes_played_converted'] = players['minutes_played'].apply(convert_to_minutes)
In [7]:
#Adding infos for the event data file
atomic = spadl.add_names(actions)
In [8]:
#Merging the xG values
atomic = atomic.merge(xG, how="left")
In [9]:
#Merging players infos with mapping of teams id between fotmob and whoscored
players = players.merge(fb, how="left")
In [10]:
#Creating the total of minutes played for players
mp0 = players.groupby(["player_name", "player_id", "team_name", "fotmob_id", "season_id"])["minutes_played_converted"].sum().reset_index(name='minutes_played')
In [11]:
#Merging events with players infos after elaborations
df0 = atomic.merge(players, how='left')
In [12]:
#Creating columns with selected infos from previosu rows
df0["prev_type_name"] = df0.shift(+1, fill_value=0)["type_name"]
df0["prev_team_name"] = df0.shift(+1, fill_value=0)["team_name"]
df0["prev_fotmob_id"] = df0.shift(+1, fill_value=0)["fotmob_id"]
df0["prev_player_name"] = df0.shift(+1, fill_value=0)["player_name"]
df0["prev_player_id"] = df0.shift(+1, fill_value=0)["player_id"]
In [13]:
#Selecting event types I want to work on
df0 = df0[df0['type_name'].isin(['pass', 'dribble', 'interception', 'clearance', 'take_on', 'tackle', 'shot', 'bad_touch',
                                 'keeper_pick_up', 'foul', 'keeper_save', 'cross', 'keeper_claim', 'goal', 'keeper_punch'])]
In [14]:
#Finding shots relative to team frequency
shots0 = df0[df0["type_name"].isin(['shot', 'goal'])]
shotsa = shots0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='shots')
shotsb = shots0.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_shots')
shots = shotsa.merge(shotsb)
shots['shots_frequency'] = (shots['shots'] / shots['team_shots']) * 100
shots = shots[['player_name', 'player_id', 'team_name', 'season_id', 'shots_frequency']]
In [15]:
#Finding npxG relative to team frequency
npxGa = shots0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['xG'].sum().reset_index(name='npxG')
npxGb = shots0.groupby(['team_name', 'season_id'], observed=True)['xG'].sum().reset_index(name='team_npxG')
npxG = npxGa.merge(npxGb)
npxG['npxG_frequency'] = (npxG['npxG'] / npxG['team_npxG']) * 100
npxG = npxG[['player_name', 'player_id', 'team_name', 'season_id', 'npxG_frequency']]
In [16]:
#Finding goalkeeping actions relative to team frequency (not so useful probably)
gk0 = df0[df0["type_name"].isin(['keeper_pick_up', 'keeper_save', 'keeper_claim', 'keeper_punch'])]
gka = gk0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='gk_actions')
gkb = gk0.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_gk_actions')
gk = gka.merge(gkb)
gk['gk_actions_frequency'] = (gk['gk_actions'] / gk['team_gk_actions']) * 100
gk = gk[['player_name', 'player_id', 'team_name', 'season_id', 'gk_actions_frequency']]
In [17]:
#Finding carries/dribbles and take ons relative to team frequency
drb0 = df0[df0["type_name"].isin(['dribble', 'take_on'])]
drba = drb0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='dribbles')
drbb = drb0.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_dribbles')
drb = drba.merge(drbb)
drb['dribbles_frequency'] = (drb['dribbles'] / drb['team_dribbles']) * 100
drb = drb[['player_name', 'player_id', 'team_name', 'season_id', 'dribbles_frequency']]
In [18]:
#Finding crosses relative to team frequency
crss0 = df0[df0["type_name"].isin(['cross'])]
crssa = crss0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='crosses')
crssb = crss0.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_crosses')
crss = crssa.merge(crssb)
crss['crosses_frequency'] = (crss['crosses'] / crss['team_crosses']) * 100
crss = crss[['player_name', 'player_id', 'team_name', 'season_id', 'crosses_frequency']]
In [19]:
#Finding defensive actions relative to team frequency
def0 = df0[df0["type_name"].isin(['foul', 'interception', 'clearance', 'tackle'])]
def_actionsa = def0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='def_actions')
def_actionsb = def0.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_def_actions')
def_actions = def_actionsa.merge(def_actionsb)
def_actions['def_actions_frequency'] = (def_actions['def_actions'] / def_actions['team_def_actions']) * 100
def_actions = def_actions[['player_name', 'player_id', 'team_name', 'season_id', 'def_actions_frequency']]
In [20]:
#Finding all actions relative to team frequency
actions_adja = df0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='actions_adj')
actions_adjb = df0.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_actions_adj')
actions_adj = actions_adja.merge(actions_adjb)
actions_adj['actions_adj_frequency'] = (actions_adj['actions_adj'] / actions_adj['team_actions_adj']) * 100
actions_adj = actions_adj[['player_name', 'player_id', 'team_name', 'season_id', 'actions_adj_frequency']]
In [21]:
#Finding non penalty expected goals assisted relative to team frequency
shots1 = shots0[shots0["team_name"] == shots0['prev_team_name']]
shots2 = shots1[shots1["player_id"] != shots1['prev_player_id']]
shots3 = shots2[shots2["type_name"].isin(['pass', 'cross', 'shot', 'interception', 'tackle', 'bad_touch', 'take_on', 'dribble', 'clearance'])]
npxAa = shots3.groupby(["prev_player_name", "prev_player_id", "prev_team_name", "season_id"], observed=True)['xG'].sum().reset_index(name='npxA')
npxAb = shots3.groupby(["prev_team_name", "season_id"], observed=True)['xG'].sum().reset_index(name='team_npxA')
npxA = npxAa.merge(npxAb)
npxA['npxA_frequency'] = (npxA['npxA'] / npxA['team_npxA']) * 100
npxA = npxA[["prev_player_name", "prev_player_id", "prev_team_name", 'season_id', 'npxA_frequency']]
In [22]:
#Renaming non penalty expected goals assisted columns for merging
npxA = npxA.rename(columns={
    "prev_player_name": "player_name",
    "prev_player_id": "player_id",
    "prev_team_name": "team_name",
    "prev_fotmob_id": "fotmob_id",
    "prev_type_name": "type_name"
})
In [23]:
#Finding actions in the box relative to team frequency
dfx0 = df0[df0['start_y_a0'] >= 13.885]
dfx1 = dfx0[dfx0['start_y_a0'] <= 54.115]
dfx = dfx1[dfx1['start_x_a0'] >= 88.5]
boxa = dfx.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='box')
boxb = dfx.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_box')
box = boxa.merge(boxb)
box['box_frequency'] = (box['box'] / box['team_box']) * 100
box = box[['player_name', 'player_id', 'team_name', 'season_id', 'box_frequency']]
In [24]:
#Merging all the tables created to create the main one
mp = (mp0
       .merge(actions_adj, how="left")
       .merge(def_actions, how="left")
       .merge(crss, how="left")
       .merge(drb, how="left")
       .merge(shots, how="left")
       .merge(npxG, how="left")
       .merge(npxA, how="left")
       .merge(box, how="left")
       .merge(gk, how="left"))
In [25]:
#We fill cells where there are NaN values with 0
mp = mp.fillna(0)
In [26]:
# Define the number of zones of the pitch we want to count how many actions a player did inside of them
length_zones = 52.5
width_zones = 34

# Define the size of each zone
zone_length = 105 / length_zones
zone_width = 68 / width_zones

# Assign each event to a zone
df0['zone_x'] = (df0['start_x_a0'] // zone_length).astype(int)
df0['zone_y'] = (df0['start_y_a0'] // zone_width).astype(int)
In [27]:
# Create a single column representing the zone (optional, for grouping)
df0['zone'] = df0['zone_x'].astype(str) + '-' + df0['zone_y'].astype(str)

# Group by game, player, and zone and count the events
zone_counts = (
    df0.groupby(['player_id', 'player_name', 'team_name', 'season_id', 'zone_x', 'zone_y'])
    .size()
    .reset_index(name='event_count')
)

# Pivot the data to create one column per zone
pivoted_counts = zone_counts.pivot_table(
    index=['player_id', 'player_name', 'team_name', 'season_id'],  # Rows are game_id and player_id
    columns=['zone_x', 'zone_y'],   # Columns are zones
    values='event_count',          # Values are the counts
    fill_value=0                   # Fill missing zones with 0
)

# Flatten the MultiIndex columns for better readability
pivoted_counts.columns = [f'zone_{col[0]}_{col[1]}' for col in pivoted_counts.columns]

# Reset index to make game_id and player_id regular columns
pivoted_counts.reset_index(inplace=True)
In [28]:
# Group by game, player, and zone and count the events
team_zone_counts = (
    df0.groupby(['team_name', 'season_id', 'zone_x', 'zone_y'])
    .size()
    .reset_index(name='event_count')
)

# Pivot the data to create one column per zone
team_counts = team_zone_counts.pivot_table(
    index=['team_name', 'season_id'],  # Rows are game_id and player_id
    columns=['zone_x', 'zone_y'],   # Columns are zones
    values='event_count',          # Values are the counts
    fill_value=0                   # Fill missing zones with 0
)

# Flatten the MultiIndex columns for better readability
team_counts.columns = [f'zone_{col[0]}_{col[1]}' for col in team_counts.columns]

# Reset index to make game_id and player_id regular columns
team_counts.reset_index(inplace=True)
In [156]:
# Identify zone columns
zone_columns = [col for col in pivoted_counts.columns if col.startswith('zone_')]

# Merge on team_name and season_id
merged_df = pd.merge(pivoted_counts, team_counts, on=['team_name', 'season_id'], suffixes=('_player', '_team'))

# Normalize player zones by team total into the same zones, but keep 0 where player zones are 0
for zone in zone_columns:
    merged_df[zone] = np.where(
        merged_df[f"{zone}_player"] == 0,  # Condition: If player zone count is 0
        0,  # Keep 0 where player has 0 touches
        merged_df[f"{zone}_player"] / merged_df[f"{zone}_team"]  # Otherwise, perform division
    )

final_df0 = merged_df[['player_id', 'player_name', 'team_name', 'season_id'] + zone_columns]
In [30]:
#Merging the dataframe with number of actions in every zone with the metrics table
final_df = final_df0.merge(mp, how="left")
final_df = final_df.fillna(0)
In [31]:
#Keeping only players with at least 500 minutes played
final_df = final_df[final_df['minutes_played'] >= 500]
In [32]:
#Selecting columns to base the cluster on
columns_to_select = final_df.columns[4:]
result = list(columns_to_select[:-10]) + list(columns_to_select[-9:])

# Apply selection to the dataframe
X = final_df[result].values
In [33]:
#Importing clustering packages
from sklearn.preprocessing import StandardScaler
from umap import UMAP
In [34]:
# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
In [35]:
#We watch how many points we have so to consider parameters 
len(X_scaled)
Out[35]:
33406
In [36]:
# Apply UMAP to reduce the dimensions further
umap = UMAP(min_dist=0.0, n_neighbors=1500, random_state=2213)
comps = umap.fit_transform(X_scaled)
/Users/davidegualano/anaconda3/envs/UMAP/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
In [37]:
# Plot the reduced dimensions
fig, ax = plt.subplots(figsize=(16, 12))
scatter = ax.scatter(comps[:, 0], comps[:, 1], c='red', s=3)

# Set labels
ax.set_xlabel('UMAP Component 1')
ax.set_ylabel('UMAP Component 2')

# Show plot
plt.show()
No description has been provided for this image
In [38]:
#Import necessary libraries to finish the clustering
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.mixture import GaussianMixture
from kneed import KneeLocator
In [39]:
#Define function to find optimal number of clusters
def compare_cluster_counts(data, player_info_df, cluster_counts=[11, 15, 20], method='kmeans'):
    """
    Compare multiple cluster counts to help determine the most appropriate number
    
    Args:
        data: UMAP-transformed data
        player_info_df: DataFrame with player information
        cluster_counts: List of cluster counts to compare
        method: Clustering method ('kmeans', 'agglomerative', or 'gmm')
    """

    # Domain knowledge considerations
    print("\n=== FOOTBALL POSITION CONSIDERATIONS ===")
    print("Traditional positions: 10 main positions (GK, CB, FB, WB, MW, AW, DM, CM, AM, ST)")
    print("With specializations: Between 17-22 specialized roles if we consider using multiple of the same positions, for example left and right wingers and so on")
    print("Potentially between 25-31 if we add hybrids: between CB-FB*2, CB-DM, AM-AW*2, AM-ST*2, AW-ST*2")
    
    print("\n=== RECOMMENDED APPROACH ===")
    print("Consequenlty we try to find the best number of cluster counts between 17-22 and 26-31")
    
    results = {}
    
    for n_clusters in cluster_counts:
        # Apply clustering
        if method == 'kmeans':
            clustering = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        elif method == 'agglomerative':
            clustering = AgglomerativeClustering(n_clusters=n_clusters)
        elif method == 'gmm':
            clustering = GaussianMixture(n_components=n_clusters, random_state=42)
        
        # Get cluster labels
        cluster_labels = clustering.fit_predict(data)
        
        # Calculate metrics
        silhouette = silhouette_score(data, cluster_labels)
        ch_score = calinski_harabasz_score(data, cluster_labels)
        
        # Store results
        results[n_clusters] = {
            'silhouette': silhouette,
            'ch_score': ch_score,
            'labels': cluster_labels
        }
        
        # Get cluster sizes
        unique, counts = np.unique(cluster_labels, return_counts=True)
        cluster_sizes = dict(zip(unique, counts))
        results[n_clusters]['cluster_sizes'] = cluster_sizes
    
    # Compare metrics across cluster counts
    plt.figure(figsize=(10, 6))
    plt.plot(results.keys(), [r['silhouette'] for r in results.values()], 'bo-', label='Silhouette Score')
    plt.grid(True)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score Comparison')
    plt.legend()
    plt.show()
    
    return results
In [40]:
#Define function to analyze clusters
def analyze_clusters(data, player_info_df, n_clusters, method='kmeans'):
    """
    Create clusters and analyze top representative players for each cluster
    
    Args:
        data: UMAP-transformed data
        player_info_df: DataFrame with player information (should include 'player_name' column)
        n_clusters: Number of clusters to create
        method: Clustering method ('kmeans', 'agglomerative', or 'gmm')
    
    Returns:
        DataFrame with cluster assignments and cluster analysis information
    """
    print(f"Creating {n_clusters} clusters using {method}...")
    
    # Apply clustering
    if method == 'kmeans':
        clustering = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    elif method == 'agglomerative':
        clustering = AgglomerativeClustering(n_clusters=n_clusters)
    elif method == 'gmm':
        clustering = GaussianMixture(n_components=n_clusters, random_state=42)
    else:
        raise ValueError("Method must be 'kmeans', 'agglomerative', or 'gmm'")
    
    # Get cluster labels
    cluster_labels = clustering.fit_predict(data)
    
    # Add cluster labels to player info
    player_info_df = player_info_df.copy()
    player_info_df['cluster'] = cluster_labels
    
    # Visualize clusters
    plt.figure(figsize=(16, 12))
    scatter = plt.scatter(data[:, 0], data[:, 1], c=cluster_labels, cmap='viridis', s=3, alpha=0.7)
    
    # Add cluster centers if using KMeans
    centers = None
    if method == 'kmeans':
        centers = clustering.cluster_centers_
        plt.scatter(centers[:, 0], centers[:, 1], c='#000000', s=200, alpha=0.5, marker='X', edgecolors='#FFFFFF')
    
    plt.title(f'Player Clusters using {method.capitalize()} (n={n_clusters})')
    plt.xlabel('UMAP Component 1')
    plt.ylabel('UMAP Component 2')
    plt.colorbar(scatter, label='Cluster')
    plt.tight_layout()
    plt.show()
    
    # Find representative players for each cluster
    cluster_analysis = {}
    player_name_col = 'player_name' if 'player_name' in player_info_df.columns else player_info_df.columns[0]
    
    for cluster_id in range(n_clusters):
        cluster_members = player_info_df[player_info_df['cluster'] == cluster_id]
        
        # If the data has distance to centroid (for K-means)
        if method == 'kmeans' and centers is not None:
            # Calculate distance to centroid for all players in this cluster
            cluster_data = data[cluster_labels == cluster_id]
            centroid = centers[cluster_id]
            distances = np.linalg.norm(cluster_data - centroid, axis=1)
            
            # Get indices of players in this cluster
            cluster_indices = np.where(cluster_labels == cluster_id)[0]
            
            # Get top 10 closest players to centroid (or fewer if cluster is smaller)
            top_n = min(10, len(distances))
            closest_indices = cluster_indices[np.argsort(distances)[:top_n]]
            representative_players = player_info_df.iloc[closest_indices]
            
        else:
            # Just take random 10 players from cluster if not K-means
            sample_size = min(10, len(cluster_members))
            if sample_size > 0:
                representative_players = cluster_members.sample(sample_size)
            else:
                representative_players = pd.DataFrame()
        
        cluster_analysis[cluster_id] = {
            'count': len(cluster_members),
            'representative_players': representative_players
        }
    
    # Generate template for manual position naming
    print("\nTemplate for position naming:")
    print("label_mapping = {")
    for cluster_id in range(n_clusters):
        players = cluster_analysis[cluster_id]['representative_players']
        if len(players) > 0:
            player_list = ", ".join(players[player_name_col].head(5).tolist())
        else:
            player_list = "No players in this cluster"
        print(f"    {cluster_id}: '',  # {player_list}")
    print("}")
    
    return player_info_df, cluster_analysis
In [41]:
#Create player_df which only players info
player_df = final_df[['player_id', 'player_name', 'team_name', 'season_id']]
In [42]:
comparison_results = compare_cluster_counts(comps, player_df, cluster_counts=[17, 18, 19, 20, 21, 22, 26, 27, 28, 29, 30, 31])
=== FOOTBALL POSITION CONSIDERATIONS ===
Traditional positions: 10 main positions (GK, CB, FB, WB, MW, AW, DM, CM, AM, ST)
With specializations: Between 17-22 specialized roles if we consider using multiple of the same positions, for example left and right wingers and so on
Potentially between 25-31 if we add hybrids: between CB-FB*2, CB-DM, AM-AW*2, AM-ST*2, AW-ST*2

=== RECOMMENDED APPROACH ===
Consequenlty we try to find the best number of cluster counts between 17-22 and 26-31
No description has been provided for this image
In [43]:
#Run cluster analysis with chosen number of clusters
n_clusters = 22  # Change this to your desired number I prefer more granularity to allow a bit more interpretation on my side given the minor difference in score
player_df_with_clusters, cluster_info = analyze_clusters(comps, player_df, n_clusters, method='agglomerative')
Creating 22 clusters using agglomerative...
No description has been provided for this image
Template for position naming:
label_mapping = {
    0: '',  # Stefan Ortega, Steve Arnold, Spencer Richey, Elías Ólafsson, Seny Dieng
    1: '',  # Tristan Crama, Marvin Loría, Andreas Voglsammer, Niclas Eliasson, Jamie Leweling
    2: '',  # Hannes Wolf, Calvin Harris, Maximilian Philipp, Michael Baidoo, Aaron Drinan
    3: '',  # Lennart Thy, Jamiro Monteiro, Matteo Pessina, Wilder Cartagena, Lewis Fiorini
    4: '',  # Hasan Kaldirim, Shane Ferguson, Yuto Nagatomo, Aboubakary Koïta, Richard Tait
    5: '',  # Jayden Nelson, Nene Dorgeles, Andrey Egorychev, Rochinha, Juan Gauto
    6: '',  # Gerard Piqué, Damon Mirani, Paul Huntington, Cameron Humphreys, Gerzino Nyamsi
    7: '',  # Serdar Gürler, Griffin Yow, Rachid Ghezzal, Alex Iwobi, Geoffry Hairemans
    8: '',  # Ricardo Esgaio, Boyd Lucassen, Mustafa Eskihellaç, Mattias Johansson, Odilon Kossounou
    9: '',  # Show, Mattéo Guendouzi, Lamine Camara, Iván Marcone, Richard
    10: '',  # Rade Krunic, Adrián Bernabé, Regan Slater, Chris Durkin, Adalberto Carrasquilla
    11: '',  # Rafael Cabral, Cristian Balgradean, Yehvann Diouf, Dan Bentley, Gonzalo Marinelli
    12: '',  # Serdar Aziz, Oleg Kozhemyakin, Ismaël Traoré, Ruslan Litvinov, Rein Van Helden
    13: '',  # Kenneth Paal, Ryan Manning, Dimitri Liénard, Emiliano Papa, Derrick Köhn
    14: '',  # Jeff Hendrick, Marlon Hairston, Frank Onyeka, Dmitri Tsypchenko, Jay Emmanuel-Thomas
    15: '',  # Cédric Kipré, Ben Barclay, Martin Hongla, Jon Martín, Danny Batth
    16: '',  # Lukas Nmecha, Steve Mounié, Mathias De Amorim, Gabriel Jesus, Eric Bicfalvi
    17: '',  # Kirill Suslov, Ed Turns, Malang Sarr, Derrick Williams, Igor Julio
    18: '',  # Duncan Watmore, Antonio Sanabria, Janik Haberer, Jeremy Ebobisse, Kike Pérez
    19: '',  # Macaulay Gillesphey, Tristan Blackmon, Stjepan Radeljic, Tomás Cardona, Francesco Acerbi
    20: '',  # Weston McKennie, Aleksey Mironov, Daniel Bragança, Matheus Henrique, Kristijan Bistrovic
    21: '',  # Luca Oyen, Horacio Tijanovich, Andreas Voglsammer, Amara Baby, Fábio Silva
}
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [147]:
#Creating list of players in a specific cluster
C = player_df_with_clusters[player_df_with_clusters['cluster'] == 18]
clist = C.player_name.unique()
In [148]:
#Visualizing heatmap of players in the cluster for interpretation
from mplsoccer import Pitch, VerticalPitch, lines
from matplotlib.colors import LinearSegmentedColormap
dfx = df0[df0['player_name'].isin(clist)]

pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box', linewidth=1.25, line_color='#000000', 
              line_zorder=2, pitch_color='#D7D1CF')

fig, axs = pitch.grid(endnote_height=0.03, endnote_space=0, grid_width=0.88, left=0.025, title_height=0.06, title_space=0, axis=False,
                      grid_height=0.86)

fig.set_facecolor('#D7D1CF')

# Define the colormap with #D7D1CF as the lowest value
cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#D7D1CF', '#FFFFFF', '#FFFF80', '#FF8000', '#800000', '#000000'], N=256)

bin_statisticplayer = pitch.bin_statistic(dfx.start_x_a0, dfx.start_y_a0, statistic='count', bins=(52.5, 34))

pcm = pitch.heatmap(bin_statisticplayer, ax=axs['pitch'], cmap=cmap)
No description has been provided for this image
In [149]:
#Watching the list of players in the cluster that play in certain teams for interpretation
X0 = C[C['team_name'].isin(['Juventus', 'Man City', 'Barcelona', 'Atalanta', 'Liverpool', 'Inter', 'AC Milan', 'Real Madrid', 
                            'Arsenal', 'Chelsea', 'Bayern Munich', 'Man Utd', 'PSG', 'Borussia Dortmund'])]
In [150]:
X0.head(60)
Out[150]:
player_id player_name team_name season_id cluster
12 3281.0 Zlatan Ibrahimovic AC Milan 2021.0 18
13 3281.0 Zlatan Ibrahimovic AC Milan 2122.0 18
50 5583.0 Cristiano Ronaldo Man Utd 2223.0 18
1161 24328.0 Edinson Cavani Man Utd 2021.0 18
1162 24328.0 Edinson Cavani Man Utd 2122.0 18
1168 24444.0 Olivier Giroud AC Milan 2122.0 18
1171 24444.0 Olivier Giroud Chelsea 2021.0 18
1280 25605.0 Anthony Modeste Borussia Dortmund 2223.0 18
1910 33404.0 Eden Hazard Real Madrid 2122.0 18
2103 34693.0 Marko Arnautovic Inter 2324.0 18
2104 34693.0 Marko Arnautovic Inter 2425.0 18
2622 41073.0 Duván Zapata Atalanta 2223.0 18
2889 44120.0 Pierre-Emerick Aubameyang Chelsea 2223.0 18
3365 68585.0 Luuk de Jong Barcelona 2122.0 18
3918 73078.0 Alexandre Lacazette Arsenal 2122.0 18
3964 73494.0 Martin Braithwaite Barcelona 2021.0 18
4086 74603.0 Joselu Real Madrid 2324.0 18
4456 78498.0 Romelu Lukaku Chelsea 2122.0 18
5248 84190.0 Luis Muriel Atalanta 2223.0 18
5249 84190.0 Luis Muriel Atalanta 2324.0 18
5926 91213.0 Álvaro Morata AC Milan 2425.0 18
6483 96182.0 Roberto Firmino Liverpool 2122.0 18
6511 96449.0 Julian Draxler PSG 2122.0 18
7039 101735.0 Ante Rebic AC Milan 2223.0 18
7259 104257.0 Arkadiusz Milik Juventus 2223.0 18
7260 104257.0 Arkadiusz Milik Juventus 2324.0 18
7645 106964.0 Mauro Icardi PSG 2122.0 18
9789 122366.0 Anthony Martial Man Utd 2223.0 18
10037 124688.0 Divock Origi AC Milan 2223.0 18
10208 125378.0 Joaquín Correa Inter 2223.0 18
10854 129354.0 Aleksey Miranchuk Atalanta 2021.0 18
12850 137467.0 Marco Asensio PSG 2425.0 18
14873 235755.0 Diogo Jota Liverpool 2223.0 18
14874 235755.0 Diogo Jota Liverpool 2324.0 18
14875 235755.0 Diogo Jota Liverpool 2425.0 18
14974 236544.0 Sébastien Haller Borussia Dortmund 2223.0 18
15082 238916.0 Mikel Merino Arsenal 2425.0 18
16954 279379.0 Gabriel Jesus Arsenal 2223.0 18
16955 279379.0 Gabriel Jesus Arsenal 2324.0 18
16956 279379.0 Gabriel Jesus Arsenal 2425.0 18
18673 299451.0 Ademola Lookman Atalanta 2223.0 18
18908 300945.0 Christopher Nkunku Chelsea 2324.0 18
19161 302692.0 Christian Pulisic Chelsea 2122.0 18
20103 315227.0 Erling Haaland Man City 2324.0 18
20154 315369.0 Dusan Vlahovic Juventus 2122.0 18
20221 315755.0 Luka Jovic AC Milan 2324.0 18
20390 317506.0 Tammy Abraham AC Milan 2425.0 18
20391 317506.0 Tammy Abraham Chelsea 2021.0 18
22250 328990.0 Moise Kean Juventus 2223.0 18
22987 333542.0 Randal Kolo Muani PSG 2425.0 18
24323 343346.0 Mason Mount Man Utd 2324.0 18
24934 345845.0 Eddie Nketiah Arsenal 2223.0 18
24935 345845.0 Eddie Nketiah Arsenal 2324.0 18
25266 349207.0 Rafael Leão AC Milan 2021.0 18
25715 352825.0 Cody Gakpo Liverpool 2223.0 18
26613 358801.0 Mehdi Taremi Inter 2425.0 18
27455 362431.0 João Félix AC Milan 2425.0 18
27461 362431.0 João Félix Chelsea 2425.0 18
28047 363686.0 Emile Smith Rowe Arsenal 2021.0 18
28414 365409.0 Julián Álvarez Man City 2223.0 18
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [151]:
# Define the mapping of old labels to new labels given my interpretation of the heatmaps
label_mapping = {
    0: 'GK',
    1: 'RWB',
    2: 'SS', #Shadow Striker
    3: 'LCM',
    4: 'LWB',
    5: 'AWL',
    6: 'CB',
    7: 'AWR',
    8: 'RWB',
    9: 'DM',
    10: 'CM',
    11: 'GK',
    12: 'RCB',
    13: 'LWB', 
    14: 'CM',
    15: 'RCB',
    16: 'ST',
    17: 'LCB',
    18: 'ST',
    19: 'LCB',
    20: 'DM',
    21: 'AWL'
}

# Map the labels directly
player_df_with_clusters['cluster'] = player_df_with_clusters['cluster'].map(label_mapping)
In [152]:
#Creating the final table with players and positions
position = player_df_with_clusters.groupby(["player_name", "player_id", "team_name", "season_id"], observed=True)['cluster'].unique().reset_index(name='position')
position['position'] = position['position'].astype(str)
position['position'] = position['position'].str.strip("['")
position['position'] = position['position'].str.strip("']")
In [153]:
#Saving the table
position.to_csv("clustered_position.csv")
In [154]:
#Showing the table
position
Out[154]:
player_name player_id team_name season_id position
0 AJ Delagarza JR 72638.0 Inter Miami CF 2020.0 RWB
1 AJ Delagarza JR 72638.0 New England 2021.0 RCB
2 Aapo Halme 202262.0 Barnsley 2021.0 SS
3 Aaron Appindangoyé 139387.0 Sivasspor 2021.0 RCB
4 Aaron Appindangoyé 139387.0 Sivasspor 2223.0 RCB
... ... ... ... ... ...
33401 Ørjan Nyland 39187.0 Reading 2122.0 GK
33402 Ørjan Nyland 39187.0 Sevilla 2324.0 GK
33403 Ørjan Nyland 39187.0 Sevilla 2425.0 GK
33404 Úmaro Embaló 455568.0 Fortuna Sittard 2223.0 AWL
33405 Úmaro Embaló 455568.0 Rio Ave 2324.0 LWB

33406 rows × 5 columns

In [ ]: