# Import necessary libraries for data manipulation, visualization, and analysis
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Silence every warning for the rest of the session
warnings.simplefilter("ignore")

# Describe the two custom font files used in the figures
fe_regular = fm.FontEntry(
    name='SourceSansPro-Regular',
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
)
fe_semibold = fm.FontEntry(
    name='SourceSansPro-SemiBold',
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
)

# Put both entries at the head of matplotlib's font list (regular first, semibold second)
fm.fontManager.ttflist[:0] = [fe_regular, fe_semibold]

# Make the regular weight the default font family
matplotlib.rc('font', family=fe_regular.name)

# Load datasets from CSV files
def _read_indexed_csv(filename):
    """Read a CSV file, taking its first column as the DataFrame index."""
    return pd.read_csv(filename, index_col=0)


fb = _read_indexed_csv("teamsFOTMOB.csv")
xG = _read_indexed_csv("xGactions.csv")

# Player tables, one file per season
players0 = _read_indexed_csv("players2021.csv")
players1 = _read_indexed_csv("players2122.csv")
players2 = _read_indexed_csv("players2223.csv")
players3 = _read_indexed_csv("players2324.csv")
players4 = _read_indexed_csv("players2425.csv")

# Game tables, one file per season
games0 = _read_indexed_csv("games2021.csv")
games1 = _read_indexed_csv("games2122.csv")
games2 = _read_indexed_csv("games2223.csv")
games3 = _read_indexed_csv("games2324.csv")
games4 = _read_indexed_csv("games2425.csv")

# Action (event) tables, one file per season
actions0 = _read_indexed_csv("actions2021.csv")
actions1 = _read_indexed_csv("actions2122.csv")
actions2 = _read_indexed_csv("actions2223.csv")
actions3 = _read_indexed_csv("actions2324.csv")
actions4 = _read_indexed_csv("actions2425.csv")

# Stack the seasons into one table per kind
players = pd.concat((players0, players1, players2, players3, players4))
games = pd.concat((games0, games1, games2, games3, games4))
actions = pd.concat((actions0, actions1, actions2, actions3, actions4))

# Renumber the actions: discard the per-file ids and use the row position in the
# stacked table as the new action_id (kept as the first column)
actions = actions.drop(columns='action_id').reset_index(drop=True)
actions.insert(0, 'action_id', actions.index)

# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
    """Turn a "MM:SS" value into minutes as a number.

    The value is converted to text first, so non-string inputs such as NaN
    are accepted. Anything that is not exactly two integer fields separated
    by a colon yields 0.
    """
    try:
        pieces = str(time_str).split(':')
        # Exactly one colon is required: minutes on the left, seconds on the right
        if len(pieces) != 2:
            return 0
        return int(pieces[0]) + int(pieces[1]) / 60
    except (ValueError, AttributeError):
        # Non-numeric fields (e.g. "nan") fall back to 0
        return 0

# Apply the conversion function to the 'minutes_played' column
players['minutes_played_converted'] = players['minutes_played'].apply(convert_to_minutes)

# Adding infos for the event data file
atomic = spadl.add_names(actions)

# Merging the xG values (no keys given: pandas joins on every column the two frames share)
atomic = atomic.merge(xG, how="left")

# Merging players infos with mapping of teams id between fotmob and whoscored
players = players.merge(fb, how="left")

# Creating the total of minutes played for players
mp0 = (
    players
    .groupby(["player_name", "player_id", "team_name", "fotmob_id", "season_id"])["minutes_played_converted"]
    .sum()
    .reset_index(name='minutes_played')
)

# Merging events with players infos after elaborations
df0 = atomic.merge(players, how='left')

# Creating columns with selected infos from the previous row.
# Only the five needed columns are shifted, once, instead of shifting the whole
# wide frame five times; the resulting values are the same.
# NOTE(review): the shift ignores game boundaries, so the first row of a game
# receives the last row of whatever precedes it in the table - confirm this is acceptable.
_PREVIOUS_ROW_COLUMNS = ["type_name", "team_name", "fotmob_id", "player_name", "player_id"]
_previous_rows = df0[_PREVIOUS_ROW_COLUMNS].shift(1, fill_value=0)
for _column in _PREVIOUS_ROW_COLUMNS:
    df0[f"prev_{_column}"] = _previous_rows[_column]

# Selecting event types I want to work on
df0 = df0[df0['type_name'].isin(['pass', 'dribble', 'interception', 'clearance', 'take_on', 'tackle', 'shot', 'bad_touch',
                                 'keeper_pick_up', 'foul', 'keeper_save', 'cross', 'keeper_claim', 'goal', 'keeper_punch'])]

# Every table below follows the same recipe: aggregate a column per player and
# per team within a season, then express the player value as a percentage of
# the team value. The recipe lives in one helper instead of seven copies.
_PLAYER_KEYS = ['player_name', 'player_id', 'team_name', 'season_id']
_TEAM_KEYS = ['team_name', 'season_id']


def _team_share(events, metric, value_col='type_name', how='count'):
    """Return each player's percentage share of the team total for one metric.

    Args:
        events: DataFrame of events, already filtered to the relevant rows.
        metric: Base name of the metric; the output column is '<metric>_frequency'.
        value_col: Column that is aggregated.
        how: Aggregation applied to value_col ('count' or 'sum').

    Returns:
        DataFrame with the player key columns plus '<metric>_frequency'.
    """
    team_metric = f'team_{metric}'
    share_col = f'{metric}_frequency'
    per_player = events.groupby(_PLAYER_KEYS, observed=True)[value_col].agg(how).reset_index(name=metric)
    per_team = events.groupby(_TEAM_KEYS, observed=True)[value_col].agg(how).reset_index(name=team_metric)
    # The two tables share only team_name and season_id, so this joins on those
    shares = per_player.merge(per_team)
    shares[share_col] = (shares[metric] / shares[team_metric]) * 100
    return shares[_PLAYER_KEYS + [share_col]]


#Finding shots relative to team frequency
shots0 = df0[df0["type_name"].isin(['shot', 'goal'])]
shots = _team_share(shots0, 'shots')

#Finding npxG relative to team frequency
npxG = _team_share(shots0, 'npxG', value_col='xG', how='sum')

#Finding goalkeeping actions relative to team frequency (not so useful probably)
gk0 = df0[df0["type_name"].isin(['keeper_pick_up', 'keeper_save', 'keeper_claim', 'keeper_punch'])]
gk = _team_share(gk0, 'gk_actions')

#Finding carries/dribbles and take ons relative to team frequency
drb0 = df0[df0["type_name"].isin(['dribble', 'take_on'])]
drb = _team_share(drb0, 'dribbles')

#Finding crosses relative to team frequency
crss0 = df0[df0["type_name"].isin(['cross'])]
crss = _team_share(crss0, 'crosses')

#Finding defensive actions relative to team frequency
def0 = df0[df0["type_name"].isin(['foul', 'interception', 'clearance', 'tackle'])]
def_actions = _team_share(def0, 'def_actions')

#Finding all actions relative to team frequency
actions_adj = _team_share(df0, 'actions_adj')

#Finding non penalty expected goals assisted relative to team frequency
# A shot is credited to the player of the previous row when that row belongs to
# the same team, a different player, and one of the action types listed below.
_ASSIST_ACTION_TYPES = ['pass', 'cross', 'shot', 'interception', 'tackle', 'bad_touch', 'take_on', 'dribble', 'clearance']
shots1 = shots0[shots0["team_name"] == shots0['prev_team_name']]
shots2 = shots1[shots1["player_id"] != shots1['prev_player_id']]
# The filter must look at the PREVIOUS action's type: shots0 only holds
# 'shot'/'goal' rows, so testing type_name here never checked the assisting action.
# NOTE: this changes the npxA_frequency feature and therefore the embedding and the
# cluster ids; the hand-written label_mapping further down must be re-derived after a re-run.
shots3 = shots2[shots2["prev_type_name"].isin(_ASSIST_ACTION_TYPES)]
npxAa = shots3.groupby(["prev_player_name", "prev_player_id", "prev_team_name", "season_id"], observed=True)['xG'].sum().reset_index(name='npxA')
npxAb = shots3.groupby(["prev_team_name", "season_id"], observed=True)['xG'].sum().reset_index(name='team_npxA')
npxA = npxAa.merge(npxAb)
npxA['npxA_frequency'] = (npxA['npxA'] / npxA['team_npxA']) * 100
npxA = npxA[["prev_player_name", "prev_player_id", "prev_team_name", 'season_id', 'npxA_frequency']]

#Renaming non penalty expected goals assisted columns for merging
# (only the columns that actually exist in npxA are listed)
npxA = npxA.rename(columns={
    "prev_player_name": "player_name",
    "prev_player_id": "player_id",
    "prev_team_name": "team_name",
})

#Finding actions in the box relative to team frequency
# Keep events starting with 13.885 <= y <= 54.115 and x >= 88.5
_starts_in_box = df0['start_y_a0'].between(13.885, 54.115) & df0['start_x_a0'].ge(88.5)
dfx = df0[_starts_in_box]

_box_player_keys = ['player_name', 'player_id', 'team_name', 'season_id']
boxa = dfx.groupby(_box_player_keys, observed=True)['type_name'].count().reset_index(name='box')
boxb = dfx.groupby(['team_name', 'season_id'], observed=True)['type_name'].count().reset_index(name='team_box')

# Player count as a percentage of the team count
box = boxa.merge(boxb)
box['box_frequency'] = box['box'].div(box['team_box']).mul(100)
box = box[_box_player_keys + ['box_frequency']]

#Merging all the tables created to create the main one
# Start from the minutes table and left-join every metric table in turn
mp = mp0
for _metric_table in (actions_adj, def_actions, crss, drb, shots, npxG, npxA, box, gk):
    mp = mp.merge(_metric_table, how="left")

# Players missing from a metric table get 0 instead of NaN
mp = mp.fillna(0)

# Grid resolution: number of zones along the pitch length (105) and width (68)
length_zones = 52.5
width_zones = 34

# Size of one zone along each axis
zone_length = 105 / length_zones
zone_width = 68 / width_zones

# Grid coordinates of the zone in which each event starts
df0['zone_x'] = df0['start_x_a0'].floordiv(zone_length).astype(int)
df0['zone_y'] = df0['start_y_a0'].floordiv(zone_width).astype(int)

# Text label "x-y" for the zone (optional, for grouping)
df0['zone'] = df0['zone_x'].astype(str).str.cat(df0['zone_y'].astype(str), sep='-')


def _count_events_per_zone(events, keys):
    """Count events for every combination of the key columns and zone."""
    return events.groupby(keys + ['zone_x', 'zone_y']).size().reset_index(name='event_count')


def _spread_zones_to_columns(counts, keys):
    """Pivot long zone counts into one 'zone_<x>_<y>' column per zone (0 where absent)."""
    wide = counts.pivot_table(
        index=keys,
        columns=['zone_x', 'zone_y'],
        values='event_count',
        fill_value=0,
    )
    # Collapse the (zone_x, zone_y) column MultiIndex into flat names
    wide.columns = [f'zone_{zone_x}_{zone_y}' for zone_x, zone_y in wide.columns]
    # Turn the key columns back into regular columns
    return wide.reset_index()


# Events per player-team-season and zone, then one column per zone
_player_zone_keys = ['player_id', 'player_name', 'team_name', 'season_id']
zone_counts = _count_events_per_zone(df0, _player_zone_keys)
pivoted_counts = _spread_zones_to_columns(zone_counts, _player_zone_keys)

# Same tables at team-season level
_team_zone_keys = ['team_name', 'season_id']
team_zone_counts = _count_events_per_zone(df0, _team_zone_keys)
team_counts = _spread_zones_to_columns(team_zone_counts, _team_zone_keys)

# Identify zone columns
zone_columns = [col for col in pivoted_counts.columns if col.startswith('zone_')]

# Merge on team_name and season_id; zone columns present on both sides get the suffixes
merged_df = pd.merge(pivoted_counts, team_counts, on=['team_name', 'season_id'], suffixes=('_player', '_team'))

# Normalize player zones by the team total of the same zone, keeping 0 where the
# player has no events there. The shares are collected first and assembled into
# a frame in one go: adding ~1,800 columns one at a time to merged_df fragments
# the frame and is very slow. The computed values are unchanged.
_zone_shares = {
    zone: np.where(
        merged_df[f"{zone}_player"] == 0,  # Condition: player zone count is 0
        0,  # Keep 0 where player has 0 touches
        merged_df[f"{zone}_player"] / merged_df[f"{zone}_team"],  # Otherwise, player / team
    )
    for zone in zone_columns
}

final_df0 = pd.concat(
    [
        merged_df[['player_id', 'player_name', 'team_name', 'season_id']],
        pd.DataFrame(_zone_shares, index=merged_df.index),
    ],
    axis=1,
)

#Merging the dataframe with number of actions in every zone with the metrics table
final_df = final_df0.merge(mp, how="left")
final_df = final_df.fillna(0)

#Keeping only players with at least 500 minutes played
final_df = final_df[final_df['minutes_played'] >= 500]

#Selecting columns to base the cluster on
# Features are picked by name: the per-zone shares plus the '*_frequency' metrics.
# The previous positional slice ([:-10] + [-9:]) skipped only minutes_played, so
# fotmob_id - a team identifier - was being fed to the clustering as a feature.
# NOTE: dropping it changes the embedding and the cluster ids; the hand-written
# label_mapping further down must be re-derived after a re-run.
_zone_features = [col for col in final_df.columns if col.startswith('zone_')]
_share_features = [col for col in final_df.columns if col.endswith('_frequency')]
result = _zone_features + _share_features

# Apply selection to the dataframe
X = final_df[result].values

#Importing clustering packages
from sklearn.preprocessing import StandardScaler
from umap import UMAP

# Standardise the features: fit the scaler on X, then transform X with it
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Number of rows to embed - useful when choosing the UMAP parameters
X_scaled.shape[0]

33406

# Reduce the scaled features with UMAP (fixed seed for a repeatable embedding);
# the first two output components are plotted below
umap = UMAP(
    n_neighbors=1500,
    min_dist=0.0,
    random_state=2213,
)
comps = umap.fit_transform(X_scaled)

/Users/davidegualano/anaconda3/envs/UMAP/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.

# Scatter the first two UMAP components
fig, ax = plt.subplots(figsize=(16, 12))
_umap_x, _umap_y = comps[:, 0], comps[:, 1]
scatter = ax.scatter(_umap_x, _umap_y, c='red', s=3)

# Axis labels
ax.set(xlabel='UMAP Component 1', ylabel='UMAP Component 2')

# Render the figure
plt.show()

#Import necessary libraries to finish the clustering
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.mixture import GaussianMixture
from kneed import KneeLocator

#Define function to find optimal number of clusters
def compare_cluster_counts(data, player_info_df, cluster_counts=(11, 15, 20), method='kmeans'):
    """
    Compare multiple cluster counts to help determine the most appropriate number

    For every requested count the data is clustered, silhouette and
    Calinski-Harabasz scores are computed, and the silhouette scores are
    plotted against the number of clusters.

    Args:
        data: UMAP-transformed data
        player_info_df: DataFrame with player information (accepted for a
            signature parallel to analyze_clusters; not used here)
        cluster_counts: Iterable of cluster counts to compare
        method: Clustering method ('kmeans', 'agglomerative', or 'gmm')

    Returns:
        Dict keyed by cluster count; each value holds 'silhouette', 'ch_score',
        'labels' and 'cluster_sizes'.

    Raises:
        ValueError: If method is not one of the supported names.
    """
    # Factories are lambdas so an estimator is only built when it is needed
    estimator_factories = {
        'kmeans': lambda k: KMeans(n_clusters=k, random_state=42, n_init=10),
        'agglomerative': lambda k: AgglomerativeClustering(n_clusters=k),
        'gmm': lambda k: GaussianMixture(n_components=k, random_state=42),
    }
    # Reject unknown methods up front (same error as analyze_clusters) instead of
    # failing later on an unassigned estimator variable
    if method not in estimator_factories:
        raise ValueError("Method must be 'kmeans', 'agglomerative', or 'gmm'")
    make_estimator = estimator_factories[method]

    # Domain knowledge considerations
    print("\n=== FOOTBALL POSITION CONSIDERATIONS ===")
    print("Traditional positions: 10 main positions (GK, CB, FB, WB, MW, AW, DM, CM, AM, ST)")
    print("With specializations: Between 17-22 specialized roles if we consider using multiple of the same positions, for example left and right wingers and so on")
    print("Potentially between 25-31 if we add hybrids: between CB-FB*2, CB-DM, AM-AW*2, AM-ST*2, AW-ST*2")

    print("\n=== RECOMMENDED APPROACH ===")
    print("Consequenlty we try to find the best number of cluster counts between 17-22 and 26-31")

    results = {}

    for n_clusters in cluster_counts:
        # Cluster and score this configuration
        cluster_labels = make_estimator(n_clusters).fit_predict(data)
        unique, counts = np.unique(cluster_labels, return_counts=True)

        results[n_clusters] = {
            'silhouette': silhouette_score(data, cluster_labels),
            'ch_score': calinski_harabasz_score(data, cluster_labels),
            'labels': cluster_labels,
            'cluster_sizes': dict(zip(unique, counts)),
        }

    # Compare the silhouette score across cluster counts
    plt.figure(figsize=(10, 6))
    plt.plot(list(results), [r['silhouette'] for r in results.values()], 'bo-', label='Silhouette Score')
    plt.grid(True)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score Comparison')
    plt.legend()
    plt.show()

    return results

#Define function to analyze clusters
def analyze_clusters(data, player_info_df, n_clusters, method='kmeans'):
    """
    Create clusters and analyze top representative players for each cluster

    The data is clustered, the clusters are drawn on a scatter plot, up to ten
    representative players are collected per cluster, and a label_mapping
    template listing the first five of them is printed.

    Args:
        data: UMAP-transformed data (2-D array; the first two columns are plotted)
        player_info_df: DataFrame with player information, one row per row of
            data and in the same order (should include 'player_name' column)
        n_clusters: Number of clusters to create
        method: Clustering method ('kmeans', 'agglomerative', or 'gmm')

    Returns:
        Tuple (player_info_df_with_cluster, cluster_analysis): a copy of
        player_info_df with a 'cluster' column, and a dict mapping each cluster
        id to {'count', 'representative_players'}.

    Raises:
        ValueError: If method is unknown, or player_info_df and data differ in length.
    """
    # Validate before the (potentially slow) fit
    if method not in ('kmeans', 'agglomerative', 'gmm'):
        raise ValueError("Method must be 'kmeans', 'agglomerative', or 'gmm'")
    if len(player_info_df) != len(data):
        raise ValueError(
            f"player_info_df has {len(player_info_df)} rows but data has {len(data)}; "
            "they must describe the same players in the same order"
        )

    print(f"Creating {n_clusters} clusters using {method}...")

    # Apply clustering
    if method == 'kmeans':
        clustering = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    elif method == 'agglomerative':
        clustering = AgglomerativeClustering(n_clusters=n_clusters)
    else:
        clustering = GaussianMixture(n_components=n_clusters, random_state=42)

    # Get cluster labels
    cluster_labels = clustering.fit_predict(data)

    # Add cluster labels to a copy so the caller's frame is left untouched
    player_info_df = player_info_df.copy()
    player_info_df['cluster'] = cluster_labels

    # Visualize clusters
    plt.figure(figsize=(16, 12))
    scatter = plt.scatter(data[:, 0], data[:, 1], c=cluster_labels, cmap='viridis', s=3, alpha=0.7)

    # Add cluster centers if using KMeans (the other methods expose none here)
    centers = clustering.cluster_centers_ if method == 'kmeans' else None
    if centers is not None:
        plt.scatter(centers[:, 0], centers[:, 1], c='#000000', s=200, alpha=0.5, marker='X', edgecolors='#FFFFFF')

    plt.title(f'Player Clusters using {method.capitalize()} (n={n_clusters})')
    plt.xlabel('UMAP Component 1')
    plt.ylabel('UMAP Component 2')
    plt.colorbar(scatter, label='Cluster')
    plt.tight_layout()
    plt.show()

    # Find representative players for each cluster
    cluster_analysis = {}
    player_name_col = 'player_name' if 'player_name' in player_info_df.columns else player_info_df.columns[0]

    for cluster_id in range(n_clusters):
        cluster_members = player_info_df[player_info_df['cluster'] == cluster_id]

        if centers is not None:
            # K-means: the (up to) 10 members closest to the centroid
            member_positions = np.flatnonzero(cluster_labels == cluster_id)
            distances = np.linalg.norm(data[member_positions] - centers[cluster_id], axis=1)
            closest_positions = member_positions[np.argsort(distances)[:10]]
            representative_players = player_info_df.iloc[closest_positions]
        else:
            # No centroid available: take up to 10 members at random, seeded so
            # the printed template is the same on every run
            sample_size = min(10, len(cluster_members))
            if sample_size > 0:
                representative_players = cluster_members.sample(sample_size, random_state=42)
            else:
                representative_players = pd.DataFrame()

        cluster_analysis[cluster_id] = {
            'count': len(cluster_members),
            'representative_players': representative_players
        }

    # Generate template for manual position naming
    print("\nTemplate for position naming:")
    print("label_mapping = {")
    for cluster_id in range(n_clusters):
        representatives = cluster_analysis[cluster_id]['representative_players']
        if len(representatives) > 0:
            player_list = ", ".join(representatives[player_name_col].head(5).tolist())
        else:
            player_list = "No players in this cluster"
        print(f"    {cluster_id}: '',  # {player_list}")
    print("}")

    return player_info_df, cluster_analysis

#Create player_df, which holds only the identifying player columns
player_df = final_df.loc[:, ['player_id', 'player_name', 'team_name', 'season_id']]

# Score the two candidate ranges of cluster counts: 17-22 and 26-31
comparison_results = compare_cluster_counts(
    comps,
    player_df,
    cluster_counts=[*range(17, 23), *range(26, 32)],
)

=== FOOTBALL POSITION CONSIDERATIONS ===
Traditional positions: 10 main positions (GK, CB, FB, WB, MW, AW, DM, CM, AM, ST)
With specializations: Between 17-22 specialized roles if we consider using multiple of the same positions, for example left and right wingers and so on
Potentially between 25-31 if we add hybrids: between CB-FB*2, CB-DM, AM-AW*2, AM-ST*2, AW-ST*2

=== RECOMMENDED APPROACH ===
Consequenlty we try to find the best number of cluster counts between 17-22 and 26-31

#Run cluster analysis with chosen number of clusters
# Change this to your desired number I prefer more granularity to allow a bit more
# interpretation on my side given the minor difference in score
n_clusters = 22
player_df_with_clusters, cluster_info = analyze_clusters(
    data=comps,
    player_info_df=player_df,
    n_clusters=n_clusters,
    method='agglomerative',
)

Creating 22 clusters using agglomerative...

Template for position naming:
label_mapping = {
    0: '',  # Stefan Ortega, Steve Arnold, Spencer Richey, Elías Ólafsson, Seny Dieng
    1: '',  # Tristan Crama, Marvin Loría, Andreas Voglsammer, Niclas Eliasson, Jamie Leweling
    2: '',  # Hannes Wolf, Calvin Harris, Maximilian Philipp, Michael Baidoo, Aaron Drinan
    3: '',  # Lennart Thy, Jamiro Monteiro, Matteo Pessina, Wilder Cartagena, Lewis Fiorini
    4: '',  # Hasan Kaldirim, Shane Ferguson, Yuto Nagatomo, Aboubakary Koïta, Richard Tait
    5: '',  # Jayden Nelson, Nene Dorgeles, Andrey Egorychev, Rochinha, Juan Gauto
    6: '',  # Gerard Piqué, Damon Mirani, Paul Huntington, Cameron Humphreys, Gerzino Nyamsi
    7: '',  # Serdar Gürler, Griffin Yow, Rachid Ghezzal, Alex Iwobi, Geoffry Hairemans
    8: '',  # Ricardo Esgaio, Boyd Lucassen, Mustafa Eskihellaç, Mattias Johansson, Odilon Kossounou
    9: '',  # Show, Mattéo Guendouzi, Lamine Camara, Iván Marcone, Richard
    10: '',  # Rade Krunic, Adrián Bernabé, Regan Slater, Chris Durkin, Adalberto Carrasquilla
    11: '',  # Rafael Cabral, Cristian Balgradean, Yehvann Diouf, Dan Bentley, Gonzalo Marinelli
    12: '',  # Serdar Aziz, Oleg Kozhemyakin, Ismaël Traoré, Ruslan Litvinov, Rein Van Helden
    13: '',  # Kenneth Paal, Ryan Manning, Dimitri Liénard, Emiliano Papa, Derrick Köhn
    14: '',  # Jeff Hendrick, Marlon Hairston, Frank Onyeka, Dmitri Tsypchenko, Jay Emmanuel-Thomas
    15: '',  # Cédric Kipré, Ben Barclay, Martin Hongla, Jon Martín, Danny Batth
    16: '',  # Lukas Nmecha, Steve Mounié, Mathias De Amorim, Gabriel Jesus, Eric Bicfalvi
    17: '',  # Kirill Suslov, Ed Turns, Malang Sarr, Derrick Williams, Igor Julio
    18: '',  # Duncan Watmore, Antonio Sanabria, Janik Haberer, Jeremy Ebobisse, Kike Pérez
    19: '',  # Macaulay Gillesphey, Tristan Blackmon, Stjepan Radeljic, Tomás Cardona, Francesco Acerbi
    20: '',  # Weston McKennie, Aleksey Mironov, Daniel Bragança, Matheus Henrique, Kristijan Bistrovic
    21: '',  # Luca Oyen, Horacio Tijanovich, Andreas Voglsammer, Amara Baby, Fábio Silva
}

#Creating list of players in a specific cluster
C = player_df_with_clusters[player_df_with_clusters['cluster'] == 18]
clist = C.player_name.unique()

#Visualizing heatmap of players in the cluster for interpretation
from mplsoccer import Pitch, VerticalPitch, lines
from matplotlib.colors import LinearSegmentedColormap

# Cluster membership is per player-team-season, so events are selected on that
# key. Filtering on player_name alone also pulled in seasons/teams in which the
# same player sits in a different cluster, as well as namesakes.
_cluster_keys = ['player_id', 'team_name', 'season_id']
dfx = df0.merge(C[_cluster_keys].drop_duplicates(), on=_cluster_keys, how='inner')

pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box', linewidth=1.25, line_color='#000000', 
              line_zorder=2, pitch_color='#D7D1CF')

fig, axs = pitch.grid(endnote_height=0.03, endnote_space=0, grid_width=0.88, left=0.025, title_height=0.06, title_space=0, axis=False,
                      grid_height=0.86)

fig.set_facecolor('#D7D1CF')

# Define the colormap with #D7D1CF as the lowest value
cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#D7D1CF', '#FFFFFF', '#FFFF80', '#FF8000', '#800000', '#000000'], N=256)

# NOTE(review): 52.5 is not an integer bin count - confirm how bin_statistic interprets it
bin_statisticplayer = pitch.bin_statistic(dfx.start_x_a0, dfx.start_y_a0, statistic='count', bins=(52.5, 34))

pcm = pitch.heatmap(bin_statisticplayer, ax=axs['pitch'], cmap=cmap)

#Watching the list of players in the cluster that play in certain teams for interpretation
X0 = C[C['team_name'].isin(['Juventus', 'Man City', 'Barcelona', 'Atalanta', 'Liverpool', 'Inter', 'AC Milan', 'Real Madrid', 
                            'Arsenal', 'Chelsea', 'Bayern Munich', 'Man Utd', 'PSG', 'Borussia Dortmund'])]

X0.head(60)

# Define the mapping of old labels to new labels given my interpretation of the heatmaps
label_mapping = {
    0: 'GK',
    1: 'RWB',
    2: 'SS', #Shadow Striker
    3: 'LCM',
    4: 'LWB',
    5: 'AWL',
    6: 'CB',
    7: 'AWR',
    8: 'RWB',
    9: 'DM',
    10: 'CM',
    11: 'GK',
    12: 'RCB',
    13: 'LWB', 
    14: 'CM',
    15: 'RCB',
    16: 'ST',
    17: 'LCB',
    18: 'ST',
    19: 'LCB',
    20: 'DM',
    21: 'AWL'
}

# Map the labels directly (this overwrites the numeric cluster ids in place)
player_df_with_clusters['cluster'] = player_df_with_clusters['cluster'].map(label_mapping)

#Creating the final table with players and positions
# Each player-team-season gets its distinct label(s) as plain text. Joining the
# values explicitly replaces the earlier str(ndarray) + strip("['") trick, which
# relied on numpy's repr; a single label still comes out as e.g. 'ST', and
# several labels now read 'ST, CM'.
position = (
    player_df_with_clusters
    .groupby(["player_name", "player_id", "team_name", "season_id"], observed=True)['cluster']
    .agg(lambda labels: ", ".join(map(str, labels.unique())))
    .reset_index(name='position')
)

#Saving the table
position.to_csv("clustered_position.csv")

#Showing the table
position

	player_id	player_name	team_name	season_id	cluster
12	3281.0	Zlatan Ibrahimovic	AC Milan	2021.0	18
13	3281.0	Zlatan Ibrahimovic	AC Milan	2122.0	18
50	5583.0	Cristiano Ronaldo	Man Utd	2223.0	18
1161	24328.0	Edinson Cavani	Man Utd	2021.0	18
1162	24328.0	Edinson Cavani	Man Utd	2122.0	18
1168	24444.0	Olivier Giroud	AC Milan	2122.0	18
1171	24444.0	Olivier Giroud	Chelsea	2021.0	18
1280	25605.0	Anthony Modeste	Borussia Dortmund	2223.0	18
1910	33404.0	Eden Hazard	Real Madrid	2122.0	18
2103	34693.0	Marko Arnautovic	Inter	2324.0	18
2104	34693.0	Marko Arnautovic	Inter	2425.0	18
2622	41073.0	Duván Zapata	Atalanta	2223.0	18
2889	44120.0	Pierre-Emerick Aubameyang	Chelsea	2223.0	18
3365	68585.0	Luuk de Jong	Barcelona	2122.0	18
3918	73078.0	Alexandre Lacazette	Arsenal	2122.0	18
3964	73494.0	Martin Braithwaite	Barcelona	2021.0	18
4086	74603.0	Joselu	Real Madrid	2324.0	18
4456	78498.0	Romelu Lukaku	Chelsea	2122.0	18
5248	84190.0	Luis Muriel	Atalanta	2223.0	18
5249	84190.0	Luis Muriel	Atalanta	2324.0	18
5926	91213.0	Álvaro Morata	AC Milan	2425.0	18
6483	96182.0	Roberto Firmino	Liverpool	2122.0	18
6511	96449.0	Julian Draxler	PSG	2122.0	18
7039	101735.0	Ante Rebic	AC Milan	2223.0	18
7259	104257.0	Arkadiusz Milik	Juventus	2223.0	18
7260	104257.0	Arkadiusz Milik	Juventus	2324.0	18
7645	106964.0	Mauro Icardi	PSG	2122.0	18
9789	122366.0	Anthony Martial	Man Utd	2223.0	18
10037	124688.0	Divock Origi	AC Milan	2223.0	18
10208	125378.0	Joaquín Correa	Inter	2223.0	18
10854	129354.0	Aleksey Miranchuk	Atalanta	2021.0	18
12850	137467.0	Marco Asensio	PSG	2425.0	18
14873	235755.0	Diogo Jota	Liverpool	2223.0	18
14874	235755.0	Diogo Jota	Liverpool	2324.0	18
14875	235755.0	Diogo Jota	Liverpool	2425.0	18
14974	236544.0	Sébastien Haller	Borussia Dortmund	2223.0	18
15082	238916.0	Mikel Merino	Arsenal	2425.0	18
16954	279379.0	Gabriel Jesus	Arsenal	2223.0	18
16955	279379.0	Gabriel Jesus	Arsenal	2324.0	18
16956	279379.0	Gabriel Jesus	Arsenal	2425.0	18
18673	299451.0	Ademola Lookman	Atalanta	2223.0	18
18908	300945.0	Christopher Nkunku	Chelsea	2324.0	18
19161	302692.0	Christian Pulisic	Chelsea	2122.0	18
20103	315227.0	Erling Haaland	Man City	2324.0	18
20154	315369.0	Dusan Vlahovic	Juventus	2122.0	18
20221	315755.0	Luka Jovic	AC Milan	2324.0	18
20390	317506.0	Tammy Abraham	AC Milan	2425.0	18
20391	317506.0	Tammy Abraham	Chelsea	2021.0	18
22250	328990.0	Moise Kean	Juventus	2223.0	18
22987	333542.0	Randal Kolo Muani	PSG	2425.0	18
24323	343346.0	Mason Mount	Man Utd	2324.0	18
24934	345845.0	Eddie Nketiah	Arsenal	2223.0	18
24935	345845.0	Eddie Nketiah	Arsenal	2324.0	18
25266	349207.0	Rafael Leão	AC Milan	2021.0	18
25715	352825.0	Cody Gakpo	Liverpool	2223.0	18
26613	358801.0	Mehdi Taremi	Inter	2425.0	18
27455	362431.0	João Félix	AC Milan	2425.0	18
27461	362431.0	João Félix	Chelsea	2425.0	18
28047	363686.0	Emile Smith Rowe	Arsenal	2021.0	18
28414	365409.0	Julián Álvarez	Man City	2223.0	18

	player_name	player_id	team_name	season_id	position
0	AJ Delagarza JR	72638.0	Inter Miami CF	2020.0	RWB
1	AJ Delagarza JR	72638.0	New England	2021.0	RCB
2	Aapo Halme	202262.0	Barnsley	2021.0	SS
3	Aaron Appindangoyé	139387.0	Sivasspor	2021.0	RCB
4	Aaron Appindangoyé	139387.0	Sivasspor	2223.0	RCB
...	...	...	...	...	...
33401	Ørjan Nyland	39187.0	Reading	2122.0	GK
33402	Ørjan Nyland	39187.0	Sevilla	2324.0	GK
33403	Ørjan Nyland	39187.0	Sevilla	2425.0	GK
33404	Úmaro Embaló	455568.0	Fortuna Sittard	2223.0	AWL
33405	Úmaro Embaló	455568.0	Rio Ave	2324.0	LWB