In [155]:
# Import necessary libraries for data manipulation, visualization, and analysis
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Suppress warnings
warnings.filterwarnings("ignore")
In [2]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
name='SourceSansPro-SemiBold'
)
# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)
# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
# Load datasets from CSV files
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
xG = pd.read_csv("xGactions.csv", index_col = 0)
players0 = pd.read_csv("players2021.csv", index_col = 0)
players1 = pd.read_csv("players2122.csv", index_col = 0)
players2 = pd.read_csv("players2223.csv", index_col = 0)
players3 = pd.read_csv("players2324.csv", index_col = 0)
players4 = pd.read_csv("players2425.csv", index_col = 0)
games0 = pd.read_csv("games2021.csv", index_col = 0)
games1 = pd.read_csv("games2122.csv", index_col = 0)
games2 = pd.read_csv("games2223.csv", index_col = 0)
games3 = pd.read_csv("games2324.csv", index_col = 0)
games4 = pd.read_csv("games2425.csv", index_col = 0)
actions0 = pd.read_csv("actions2021.csv", index_col = 0)
actions1 = pd.read_csv("actions2122.csv", index_col = 0)
actions2 = pd.read_csv("actions2223.csv", index_col = 0)
actions3 = pd.read_csv("actions2324.csv", index_col = 0)
actions4 = pd.read_csv("actions2425.csv", index_col = 0)
In [4]:
#Concatenate all the necessary files
players = pd.concat([players0, players1, players2, players3, players4])
games = pd.concat([games0, games1, games2, games3, games4])
actions = pd.concat([actions0, actions1, actions2, actions3, actions4])
In [5]:
#Resetting the action_ids
actions.drop(columns=['action_id'], inplace=True)
actions.reset_index(drop=True, inplace=True)
actions.reset_index(inplace=True)
actions.rename(columns={'index': 'action_id'}, inplace=True)
In [6]:
# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
try:
# Convert to string in case it's a float (e.g., NaN)
time_str = str(time_str)
# Split the time string into minutes and seconds
minutes, seconds = map(int, time_str.split(':'))
# Convert total time to minutes (seconds converted to fraction of minutes)
return minutes + seconds / 60
except (ValueError, AttributeError):
# Handle cases where the conversion fails (e.g., NaN or bad format)
return 0 # or use `np.nan` if you prefer to mark as missing
# Apply the conversion function to the 'minutes_played' column
players['minutes_played_converted'] = players['minutes_played'].apply(convert_to_minutes)
In [7]:
#Adding infos for the event data file
atomic = spadl.add_names(actions)
In [8]:
#Merging the xG values
atomic = atomic.merge(xG, how="left")
In [9]:
#Merging players infos with mapping of teams id between fotmob and whoscored
players = players.merge(fb, how="left")
In [10]:
#Creating the total of minutes played for players
mp0 = players.groupby(["player_name", "player_id", "team_name", "fotmob_id", "season_id"])["minutes_played_converted"].sum().reset_index(name='minutes_played')
In [11]:
#Merging events with players infos after elaborations
df0 = atomic.merge(players, how='left')
In [12]:
#Creating columns with selected infos from previosu rows
df0["prev_type_name"] = df0.shift(+1, fill_value=0)["type_name"]
df0["prev_team_name"] = df0.shift(+1, fill_value=0)["team_name"]
df0["prev_fotmob_id"] = df0.shift(+1, fill_value=0)["fotmob_id"]
df0["prev_player_name"] = df0.shift(+1, fill_value=0)["player_name"]
df0["prev_player_id"] = df0.shift(+1, fill_value=0)["player_id"]
In [13]:
#Selecting event types I want to work on
df0 = df0[df0['type_name'].isin(['pass', 'dribble', 'interception', 'clearance', 'take_on', 'tackle', 'shot', 'bad_touch',
'keeper_pick_up', 'foul', 'keeper_save', 'cross', 'keeper_claim', 'goal', 'keeper_punch'])]
In [14]:
#Finding shots relative to team frequency
shots0 = df0[df0["type_name"].isin(['shot', 'goal'])]
shotsa = shots0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='shots')
shotsb = shots0.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_shots')
shots = shotsa.merge(shotsb)
shots['shots_frequency'] = (shots['shots'] / shots['team_shots']) * 100
shots = shots[['player_name', 'player_id', 'team_name', 'season_id', 'shots_frequency']]
In [15]:
#Finding npxG relative to team frequency
npxGa = shots0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['xG'].sum().reset_index(name='npxG')
npxGb = shots0.groupby(['team_name', 'season_id'], observed=True)['xG'].sum().reset_index(name='team_npxG')
npxG = npxGa.merge(npxGb)
npxG['npxG_frequency'] = (npxG['npxG'] / npxG['team_npxG']) * 100
npxG = npxG[['player_name', 'player_id', 'team_name', 'season_id', 'npxG_frequency']]
In [16]:
#Finding goalkeeping actions relative to team frequency (not so useful probably)
gk0 = df0[df0["type_name"].isin(['keeper_pick_up', 'keeper_save', 'keeper_claim', 'keeper_punch'])]
gka = gk0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='gk_actions')
gkb = gk0.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_gk_actions')
gk = gka.merge(gkb)
gk['gk_actions_frequency'] = (gk['gk_actions'] / gk['team_gk_actions']) * 100
gk = gk[['player_name', 'player_id', 'team_name', 'season_id', 'gk_actions_frequency']]
In [17]:
#Finding carries/dribbles and take ons relative to team frequency
drb0 = df0[df0["type_name"].isin(['dribble', 'take_on'])]
drba = drb0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='dribbles')
drbb = drb0.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_dribbles')
drb = drba.merge(drbb)
drb['dribbles_frequency'] = (drb['dribbles'] / drb['team_dribbles']) * 100
drb = drb[['player_name', 'player_id', 'team_name', 'season_id', 'dribbles_frequency']]
In [18]:
#Finding crosses relative to team frequency
crss0 = df0[df0["type_name"].isin(['cross'])]
crssa = crss0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='crosses')
crssb = crss0.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_crosses')
crss = crssa.merge(crssb)
crss['crosses_frequency'] = (crss['crosses'] / crss['team_crosses']) * 100
crss = crss[['player_name', 'player_id', 'team_name', 'season_id', 'crosses_frequency']]
In [19]:
#Finding defensive actions relative to team frequency
def0 = df0[df0["type_name"].isin(['foul', 'interception', 'clearance', 'tackle'])]
def_actionsa = def0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='def_actions')
def_actionsb = def0.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_def_actions')
def_actions = def_actionsa.merge(def_actionsb)
def_actions['def_actions_frequency'] = (def_actions['def_actions'] / def_actions['team_def_actions']) * 100
def_actions = def_actions[['player_name', 'player_id', 'team_name', 'season_id', 'def_actions_frequency']]
In [20]:
#Finding all actions relative to team frequency
actions_adja = df0.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='actions_adj')
actions_adjb = df0.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_actions_adj')
actions_adj = actions_adja.merge(actions_adjb)
actions_adj['actions_adj_frequency'] = (actions_adj['actions_adj'] / actions_adj['team_actions_adj']) * 100
actions_adj = actions_adj[['player_name', 'player_id', 'team_name', 'season_id', 'actions_adj_frequency']]
In [21]:
#Finding non penalty expected goals assisted relative to team frequency
shots1 = shots0[shots0["team_name"] == shots0['prev_team_name']]
shots2 = shots1[shots1["player_id"] != shots1['prev_player_id']]
shots3 = shots2[shots2["type_name"].isin(['pass', 'cross', 'shot', 'interception', 'tackle', 'bad_touch', 'take_on', 'dribble', 'clearance'])]
npxAa = shots3.groupby(["prev_player_name", "prev_player_id", "prev_team_name", "season_id"], observed=True)['xG'].sum().reset_index(name='npxA')
npxAb = shots3.groupby(["prev_team_name", "season_id"], observed=True)['xG'].sum().reset_index(name='team_npxA')
npxA = npxAa.merge(npxAb)
npxA['npxA_frequency'] = (npxA['npxA'] / npxA['team_npxA']) * 100
npxA = npxA[["prev_player_name", "prev_player_id", "prev_team_name", 'season_id', 'npxA_frequency']]
In [22]:
#Renaming non penalty expected goals assisted columns for merging
npxA = npxA.rename(columns={
"prev_player_name": "player_name",
"prev_player_id": "player_id",
"prev_team_name": "team_name",
"prev_fotmob_id": "fotmob_id",
"prev_type_name": "type_name"
})
In [23]:
#Finding actions in the box relative to team frequency
dfx0 = df0[df0['start_y_a0'] >= 13.885]
dfx1 = dfx0[dfx0['start_y_a0'] <= 54.115]
dfx = dfx1[dfx1['start_x_a0'] >= 88.5]
boxa = dfx.groupby(['player_name', 'player_id', 'team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='box')
boxb = dfx.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_box')
box = boxa.merge(boxb)
box['box_frequency'] = (box['box'] / box['team_box']) * 100
box = box[['player_name', 'player_id', 'team_name', 'season_id', 'box_frequency']]
In [24]:
#Merging all the tables created to create the main one
mp = (mp0
.merge(actions_adj, how="left")
.merge(def_actions, how="left")
.merge(crss, how="left")
.merge(drb, how="left")
.merge(shots, how="left")
.merge(npxG, how="left")
.merge(npxA, how="left")
.merge(box, how="left")
.merge(gk, how="left"))
In [25]:
#We fill cells where there are NaN values with 0
mp = mp.fillna(0)
In [26]:
# Define the number of zones of the pitch we want to count how many actions a player did inside of them
length_zones = 52.5
width_zones = 34
# Define the size of each zone
zone_length = 105 / length_zones
zone_width = 68 / width_zones
# Assign each event to a zone
df0['zone_x'] = (df0['start_x_a0'] // zone_length).astype(int)
df0['zone_y'] = (df0['start_y_a0'] // zone_width).astype(int)
In [27]:
# Create a single column representing the zone (optional, for grouping)
df0['zone'] = df0['zone_x'].astype(str) + '-' + df0['zone_y'].astype(str)
# Group by game, player, and zone and count the events
zone_counts = (
df0.groupby(['player_id', 'player_name', 'team_name', 'season_id', 'zone_x', 'zone_y'])
.size()
.reset_index(name='event_count')
)
# Pivot the data to create one column per zone
pivoted_counts = zone_counts.pivot_table(
index=['player_id', 'player_name', 'team_name', 'season_id'], # Rows are game_id and player_id
columns=['zone_x', 'zone_y'], # Columns are zones
values='event_count', # Values are the counts
fill_value=0 # Fill missing zones with 0
)
# Flatten the MultiIndex columns for better readability
pivoted_counts.columns = [f'zone_{col[0]}_{col[1]}' for col in pivoted_counts.columns]
# Reset index to make game_id and player_id regular columns
pivoted_counts.reset_index(inplace=True)
In [28]:
# Group by game, player, and zone and count the events
team_zone_counts = (
df0.groupby(['team_name', 'season_id', 'zone_x', 'zone_y'])
.size()
.reset_index(name='event_count')
)
# Pivot the data to create one column per zone
team_counts = team_zone_counts.pivot_table(
index=['team_name', 'season_id'], # Rows are game_id and player_id
columns=['zone_x', 'zone_y'], # Columns are zones
values='event_count', # Values are the counts
fill_value=0 # Fill missing zones with 0
)
# Flatten the MultiIndex columns for better readability
team_counts.columns = [f'zone_{col[0]}_{col[1]}' for col in team_counts.columns]
# Reset index to make game_id and player_id regular columns
team_counts.reset_index(inplace=True)
In [156]:
# Identify zone columns
zone_columns = [col for col in pivoted_counts.columns if col.startswith('zone_')]
# Merge on team_name and season_id
merged_df = pd.merge(pivoted_counts, team_counts, on=['team_name', 'season_id'], suffixes=('_player', '_team'))
# Normalize player zones by team total into the same zones, but keep 0 where player zones are 0
for zone in zone_columns:
merged_df[zone] = np.where(
merged_df[f"{zone}_player"] == 0, # Condition: If player zone count is 0
0, # Keep 0 where player has 0 touches
merged_df[f"{zone}_player"] / merged_df[f"{zone}_team"] # Otherwise, perform division
)
final_df0 = merged_df[['player_id', 'player_name', 'team_name', 'season_id'] + zone_columns]
In [30]:
#Merging the dataframe with number of actions in every zone with the metrics table
final_df = final_df0.merge(mp, how="left")
final_df = final_df.fillna(0)
In [31]:
#Keeping only players with at least 500 minutes played
final_df = final_df[final_df['minutes_played'] >= 500]
In [32]:
#Selecting columns to base the cluster on
columns_to_select = final_df.columns[4:]
result = list(columns_to_select[:-10]) + list(columns_to_select[-9:])
# Apply selection to the dataframe
X = final_df[result].values
In [33]:
#Importing clustering packages
from sklearn.preprocessing import StandardScaler
from umap import UMAP
In [34]:
# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
In [35]:
#We watch how many points we have so to consider parameters
len(X_scaled)
Out[35]:
33406
In [36]:
# Apply UMAP to reduce the dimensions further
umap = UMAP(min_dist=0.0, n_neighbors=1500, random_state=2213)
comps = umap.fit_transform(X_scaled)
/Users/davidegualano/anaconda3/envs/UMAP/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn( OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
In [37]:
# Plot the reduced dimensions
fig, ax = plt.subplots(figsize=(16, 12))
scatter = ax.scatter(comps[:, 0], comps[:, 1], c='red', s=3)
# Set labels
ax.set_xlabel('UMAP Component 1')
ax.set_ylabel('UMAP Component 2')
# Show plot
plt.show()
In [38]:
#Import necessary libraries to finish the clustering
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.mixture import GaussianMixture
from kneed import KneeLocator
In [39]:
#Define function to find optimal number of clusters
def compare_cluster_counts(data, player_info_df, cluster_counts=[11, 15, 20], method='kmeans'):
"""
Compare multiple cluster counts to help determine the most appropriate number
Args:
data: UMAP-transformed data
player_info_df: DataFrame with player information
cluster_counts: List of cluster counts to compare
method: Clustering method ('kmeans', 'agglomerative', or 'gmm')
"""
# Domain knowledge considerations
print("\n=== FOOTBALL POSITION CONSIDERATIONS ===")
print("Traditional positions: 10 main positions (GK, CB, FB, WB, MW, AW, DM, CM, AM, ST)")
print("With specializations: Between 17-22 specialized roles if we consider using multiple of the same positions, for example left and right wingers and so on")
print("Potentially between 25-31 if we add hybrids: between CB-FB*2, CB-DM, AM-AW*2, AM-ST*2, AW-ST*2")
print("\n=== RECOMMENDED APPROACH ===")
print("Consequenlty we try to find the best number of cluster counts between 17-22 and 26-31")
results = {}
for n_clusters in cluster_counts:
# Apply clustering
if method == 'kmeans':
clustering = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
elif method == 'agglomerative':
clustering = AgglomerativeClustering(n_clusters=n_clusters)
elif method == 'gmm':
clustering = GaussianMixture(n_components=n_clusters, random_state=42)
# Get cluster labels
cluster_labels = clustering.fit_predict(data)
# Calculate metrics
silhouette = silhouette_score(data, cluster_labels)
ch_score = calinski_harabasz_score(data, cluster_labels)
# Store results
results[n_clusters] = {
'silhouette': silhouette,
'ch_score': ch_score,
'labels': cluster_labels
}
# Get cluster sizes
unique, counts = np.unique(cluster_labels, return_counts=True)
cluster_sizes = dict(zip(unique, counts))
results[n_clusters]['cluster_sizes'] = cluster_sizes
# Compare metrics across cluster counts
plt.figure(figsize=(10, 6))
plt.plot(results.keys(), [r['silhouette'] for r in results.values()], 'bo-', label='Silhouette Score')
plt.grid(True)
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score Comparison')
plt.legend()
plt.show()
return results
In [40]:
#Define function to analyze clusters
def analyze_clusters(data, player_info_df, n_clusters, method='kmeans'):
"""
Create clusters and analyze top representative players for each cluster
Args:
data: UMAP-transformed data
player_info_df: DataFrame with player information (should include 'player_name' column)
n_clusters: Number of clusters to create
method: Clustering method ('kmeans', 'agglomerative', or 'gmm')
Returns:
DataFrame with cluster assignments and cluster analysis information
"""
print(f"Creating {n_clusters} clusters using {method}...")
# Apply clustering
if method == 'kmeans':
clustering = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
elif method == 'agglomerative':
clustering = AgglomerativeClustering(n_clusters=n_clusters)
elif method == 'gmm':
clustering = GaussianMixture(n_components=n_clusters, random_state=42)
else:
raise ValueError("Method must be 'kmeans', 'agglomerative', or 'gmm'")
# Get cluster labels
cluster_labels = clustering.fit_predict(data)
# Add cluster labels to player info
player_info_df = player_info_df.copy()
player_info_df['cluster'] = cluster_labels
# Visualize clusters
plt.figure(figsize=(16, 12))
scatter = plt.scatter(data[:, 0], data[:, 1], c=cluster_labels, cmap='viridis', s=3, alpha=0.7)
# Add cluster centers if using KMeans
centers = None
if method == 'kmeans':
centers = clustering.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='#000000', s=200, alpha=0.5, marker='X', edgecolors='#FFFFFF')
plt.title(f'Player Clusters using {method.capitalize()} (n={n_clusters})')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.colorbar(scatter, label='Cluster')
plt.tight_layout()
plt.show()
# Find representative players for each cluster
cluster_analysis = {}
player_name_col = 'player_name' if 'player_name' in player_info_df.columns else player_info_df.columns[0]
for cluster_id in range(n_clusters):
cluster_members = player_info_df[player_info_df['cluster'] == cluster_id]
# If the data has distance to centroid (for K-means)
if method == 'kmeans' and centers is not None:
# Calculate distance to centroid for all players in this cluster
cluster_data = data[cluster_labels == cluster_id]
centroid = centers[cluster_id]
distances = np.linalg.norm(cluster_data - centroid, axis=1)
# Get indices of players in this cluster
cluster_indices = np.where(cluster_labels == cluster_id)[0]
# Get top 10 closest players to centroid (or fewer if cluster is smaller)
top_n = min(10, len(distances))
closest_indices = cluster_indices[np.argsort(distances)[:top_n]]
representative_players = player_info_df.iloc[closest_indices]
else:
# Just take random 10 players from cluster if not K-means
sample_size = min(10, len(cluster_members))
if sample_size > 0:
representative_players = cluster_members.sample(sample_size)
else:
representative_players = pd.DataFrame()
cluster_analysis[cluster_id] = {
'count': len(cluster_members),
'representative_players': representative_players
}
# Generate template for manual position naming
print("\nTemplate for position naming:")
print("label_mapping = {")
for cluster_id in range(n_clusters):
players = cluster_analysis[cluster_id]['representative_players']
if len(players) > 0:
player_list = ", ".join(players[player_name_col].head(5).tolist())
else:
player_list = "No players in this cluster"
print(f" {cluster_id}: '', # {player_list}")
print("}")
return player_info_df, cluster_analysis
In [41]:
#Create player_df which only players info
player_df = final_df[['player_id', 'player_name', 'team_name', 'season_id']]
In [42]:
comparison_results = compare_cluster_counts(comps, player_df, cluster_counts=[17, 18, 19, 20, 21, 22, 26, 27, 28, 29, 30, 31])
=== FOOTBALL POSITION CONSIDERATIONS === Traditional positions: 10 main positions (GK, CB, FB, WB, MW, AW, DM, CM, AM, ST) With specializations: Between 17-22 specialized roles if we consider using multiple of the same positions, for example left and right wingers and so on Potentially between 25-31 if we add hybrids: between CB-FB*2, CB-DM, AM-AW*2, AM-ST*2, AW-ST*2 === RECOMMENDED APPROACH === Consequenlty we try to find the best number of cluster counts between 17-22 and 26-31
In [43]:
#Run cluster analysis with chosen number of clusters
n_clusters = 22 # Change this to your desired number I prefer more granularity to allow a bit more interpretation on my side given the minor difference in score
player_df_with_clusters, cluster_info = analyze_clusters(comps, player_df, n_clusters, method='agglomerative')
Creating 22 clusters using agglomerative...
Template for position naming: label_mapping = { 0: '', # Stefan Ortega, Steve Arnold, Spencer Richey, Elías Ólafsson, Seny Dieng 1: '', # Tristan Crama, Marvin Loría, Andreas Voglsammer, Niclas Eliasson, Jamie Leweling 2: '', # Hannes Wolf, Calvin Harris, Maximilian Philipp, Michael Baidoo, Aaron Drinan 3: '', # Lennart Thy, Jamiro Monteiro, Matteo Pessina, Wilder Cartagena, Lewis Fiorini 4: '', # Hasan Kaldirim, Shane Ferguson, Yuto Nagatomo, Aboubakary Koïta, Richard Tait 5: '', # Jayden Nelson, Nene Dorgeles, Andrey Egorychev, Rochinha, Juan Gauto 6: '', # Gerard Piqué, Damon Mirani, Paul Huntington, Cameron Humphreys, Gerzino Nyamsi 7: '', # Serdar Gürler, Griffin Yow, Rachid Ghezzal, Alex Iwobi, Geoffry Hairemans 8: '', # Ricardo Esgaio, Boyd Lucassen, Mustafa Eskihellaç, Mattias Johansson, Odilon Kossounou 9: '', # Show, Mattéo Guendouzi, Lamine Camara, Iván Marcone, Richard 10: '', # Rade Krunic, Adrián Bernabé, Regan Slater, Chris Durkin, Adalberto Carrasquilla 11: '', # Rafael Cabral, Cristian Balgradean, Yehvann Diouf, Dan Bentley, Gonzalo Marinelli 12: '', # Serdar Aziz, Oleg Kozhemyakin, Ismaël Traoré, Ruslan Litvinov, Rein Van Helden 13: '', # Kenneth Paal, Ryan Manning, Dimitri Liénard, Emiliano Papa, Derrick Köhn 14: '', # Jeff Hendrick, Marlon Hairston, Frank Onyeka, Dmitri Tsypchenko, Jay Emmanuel-Thomas 15: '', # Cédric Kipré, Ben Barclay, Martin Hongla, Jon Martín, Danny Batth 16: '', # Lukas Nmecha, Steve Mounié, Mathias De Amorim, Gabriel Jesus, Eric Bicfalvi 17: '', # Kirill Suslov, Ed Turns, Malang Sarr, Derrick Williams, Igor Julio 18: '', # Duncan Watmore, Antonio Sanabria, Janik Haberer, Jeremy Ebobisse, Kike Pérez 19: '', # Macaulay Gillesphey, Tristan Blackmon, Stjepan Radeljic, Tomás Cardona, Francesco Acerbi 20: '', # Weston McKennie, Aleksey Mironov, Daniel Bragança, Matheus Henrique, Kristijan Bistrovic 21: '', # Luca Oyen, Horacio Tijanovich, Andreas Voglsammer, Amara Baby, Fábio Silva }
In [ ]:
In [ ]:
In [ ]:
In [147]:
#Creating list of players in a specific cluster
C = player_df_with_clusters[player_df_with_clusters['cluster'] == 18]
clist = C.player_name.unique()
In [148]:
#Visualizing heatmap of players in the cluster for interpretation
from mplsoccer import Pitch, VerticalPitch, lines
from matplotlib.colors import LinearSegmentedColormap
dfx = df0[df0['player_name'].isin(clist)]
pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box', linewidth=1.25, line_color='#000000',
line_zorder=2, pitch_color='#D7D1CF')
fig, axs = pitch.grid(endnote_height=0.03, endnote_space=0, grid_width=0.88, left=0.025, title_height=0.06, title_space=0, axis=False,
grid_height=0.86)
fig.set_facecolor('#D7D1CF')
# Define the colormap with #D7D1CF as the lowest value
cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#D7D1CF', '#FFFFFF', '#FFFF80', '#FF8000', '#800000', '#000000'], N=256)
bin_statisticplayer = pitch.bin_statistic(dfx.start_x_a0, dfx.start_y_a0, statistic='count', bins=(52.5, 34))
pcm = pitch.heatmap(bin_statisticplayer, ax=axs['pitch'], cmap=cmap)
In [149]:
#Watching the list of players in the cluster that play in certain teams for interpretation
X0 = C[C['team_name'].isin(['Juventus', 'Man City', 'Barcelona', 'Atalanta', 'Liverpool', 'Inter', 'AC Milan', 'Real Madrid',
'Arsenal', 'Chelsea', 'Bayern Munich', 'Man Utd', 'PSG', 'Borussia Dortmund'])]
In [150]:
X0.head(60)
Out[150]:
player_id | player_name | team_name | season_id | cluster | |
---|---|---|---|---|---|
12 | 3281.0 | Zlatan Ibrahimovic | AC Milan | 2021.0 | 18 |
13 | 3281.0 | Zlatan Ibrahimovic | AC Milan | 2122.0 | 18 |
50 | 5583.0 | Cristiano Ronaldo | Man Utd | 2223.0 | 18 |
1161 | 24328.0 | Edinson Cavani | Man Utd | 2021.0 | 18 |
1162 | 24328.0 | Edinson Cavani | Man Utd | 2122.0 | 18 |
1168 | 24444.0 | Olivier Giroud | AC Milan | 2122.0 | 18 |
1171 | 24444.0 | Olivier Giroud | Chelsea | 2021.0 | 18 |
1280 | 25605.0 | Anthony Modeste | Borussia Dortmund | 2223.0 | 18 |
1910 | 33404.0 | Eden Hazard | Real Madrid | 2122.0 | 18 |
2103 | 34693.0 | Marko Arnautovic | Inter | 2324.0 | 18 |
2104 | 34693.0 | Marko Arnautovic | Inter | 2425.0 | 18 |
2622 | 41073.0 | Duván Zapata | Atalanta | 2223.0 | 18 |
2889 | 44120.0 | Pierre-Emerick Aubameyang | Chelsea | 2223.0 | 18 |
3365 | 68585.0 | Luuk de Jong | Barcelona | 2122.0 | 18 |
3918 | 73078.0 | Alexandre Lacazette | Arsenal | 2122.0 | 18 |
3964 | 73494.0 | Martin Braithwaite | Barcelona | 2021.0 | 18 |
4086 | 74603.0 | Joselu | Real Madrid | 2324.0 | 18 |
4456 | 78498.0 | Romelu Lukaku | Chelsea | 2122.0 | 18 |
5248 | 84190.0 | Luis Muriel | Atalanta | 2223.0 | 18 |
5249 | 84190.0 | Luis Muriel | Atalanta | 2324.0 | 18 |
5926 | 91213.0 | Álvaro Morata | AC Milan | 2425.0 | 18 |
6483 | 96182.0 | Roberto Firmino | Liverpool | 2122.0 | 18 |
6511 | 96449.0 | Julian Draxler | PSG | 2122.0 | 18 |
7039 | 101735.0 | Ante Rebic | AC Milan | 2223.0 | 18 |
7259 | 104257.0 | Arkadiusz Milik | Juventus | 2223.0 | 18 |
7260 | 104257.0 | Arkadiusz Milik | Juventus | 2324.0 | 18 |
7645 | 106964.0 | Mauro Icardi | PSG | 2122.0 | 18 |
9789 | 122366.0 | Anthony Martial | Man Utd | 2223.0 | 18 |
10037 | 124688.0 | Divock Origi | AC Milan | 2223.0 | 18 |
10208 | 125378.0 | Joaquín Correa | Inter | 2223.0 | 18 |
10854 | 129354.0 | Aleksey Miranchuk | Atalanta | 2021.0 | 18 |
12850 | 137467.0 | Marco Asensio | PSG | 2425.0 | 18 |
14873 | 235755.0 | Diogo Jota | Liverpool | 2223.0 | 18 |
14874 | 235755.0 | Diogo Jota | Liverpool | 2324.0 | 18 |
14875 | 235755.0 | Diogo Jota | Liverpool | 2425.0 | 18 |
14974 | 236544.0 | Sébastien Haller | Borussia Dortmund | 2223.0 | 18 |
15082 | 238916.0 | Mikel Merino | Arsenal | 2425.0 | 18 |
16954 | 279379.0 | Gabriel Jesus | Arsenal | 2223.0 | 18 |
16955 | 279379.0 | Gabriel Jesus | Arsenal | 2324.0 | 18 |
16956 | 279379.0 | Gabriel Jesus | Arsenal | 2425.0 | 18 |
18673 | 299451.0 | Ademola Lookman | Atalanta | 2223.0 | 18 |
18908 | 300945.0 | Christopher Nkunku | Chelsea | 2324.0 | 18 |
19161 | 302692.0 | Christian Pulisic | Chelsea | 2122.0 | 18 |
20103 | 315227.0 | Erling Haaland | Man City | 2324.0 | 18 |
20154 | 315369.0 | Dusan Vlahovic | Juventus | 2122.0 | 18 |
20221 | 315755.0 | Luka Jovic | AC Milan | 2324.0 | 18 |
20390 | 317506.0 | Tammy Abraham | AC Milan | 2425.0 | 18 |
20391 | 317506.0 | Tammy Abraham | Chelsea | 2021.0 | 18 |
22250 | 328990.0 | Moise Kean | Juventus | 2223.0 | 18 |
22987 | 333542.0 | Randal Kolo Muani | PSG | 2425.0 | 18 |
24323 | 343346.0 | Mason Mount | Man Utd | 2324.0 | 18 |
24934 | 345845.0 | Eddie Nketiah | Arsenal | 2223.0 | 18 |
24935 | 345845.0 | Eddie Nketiah | Arsenal | 2324.0 | 18 |
25266 | 349207.0 | Rafael Leão | AC Milan | 2021.0 | 18 |
25715 | 352825.0 | Cody Gakpo | Liverpool | 2223.0 | 18 |
26613 | 358801.0 | Mehdi Taremi | Inter | 2425.0 | 18 |
27455 | 362431.0 | João Félix | AC Milan | 2425.0 | 18 |
27461 | 362431.0 | João Félix | Chelsea | 2425.0 | 18 |
28047 | 363686.0 | Emile Smith Rowe | Arsenal | 2021.0 | 18 |
28414 | 365409.0 | Julián Álvarez | Man City | 2223.0 | 18 |
In [ ]:
In [ ]:
In [ ]:
In [151]:
# Define the mapping of old labels to new labels given my interpretation of the heatmaps
label_mapping = {
0: 'GK',
1: 'RWB',
2: 'SS', #Shadow Striker
3: 'LCM',
4: 'LWB',
5: 'AWL',
6: 'CB',
7: 'AWR',
8: 'RWB',
9: 'DM',
10: 'CM',
11: 'GK',
12: 'RCB',
13: 'LWB',
14: 'CM',
15: 'RCB',
16: 'ST',
17: 'LCB',
18: 'ST',
19: 'LCB',
20: 'DM',
21: 'AWL'
}
# Map the labels directly
player_df_with_clusters['cluster'] = player_df_with_clusters['cluster'].map(label_mapping)
In [152]:
#Creating the final table with players and positions
position = player_df_with_clusters.groupby(["player_name", "player_id", "team_name", "season_id"], observed=True)['cluster'].unique().reset_index(name='position')
position['position'] = position['position'].astype(str)
position['position'] = position['position'].str.strip("['")
position['position'] = position['position'].str.strip("']")
In [153]:
#Saving the table
position.to_csv("clustered_position.csv")
In [154]:
#Showing the table
position
Out[154]:
player_name | player_id | team_name | season_id | position | |
---|---|---|---|---|---|
0 | AJ Delagarza JR | 72638.0 | Inter Miami CF | 2020.0 | RWB |
1 | AJ Delagarza JR | 72638.0 | New England | 2021.0 | RCB |
2 | Aapo Halme | 202262.0 | Barnsley | 2021.0 | SS |
3 | Aaron Appindangoyé | 139387.0 | Sivasspor | 2021.0 | RCB |
4 | Aaron Appindangoyé | 139387.0 | Sivasspor | 2223.0 | RCB |
... | ... | ... | ... | ... | ... |
33401 | Ørjan Nyland | 39187.0 | Reading | 2122.0 | GK |
33402 | Ørjan Nyland | 39187.0 | Sevilla | 2324.0 | GK |
33403 | Ørjan Nyland | 39187.0 | Sevilla | 2425.0 | GK |
33404 | Úmaro Embaló | 455568.0 | Fortuna Sittard | 2223.0 | AWL |
33405 | Úmaro Embaló | 455568.0 | Rio Ave | 2324.0 | LWB |
33406 rows × 5 columns
In [ ]: