# Import necessary libraries for data manipulation, visualization, and analysis
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from tqdm.notebook import tqdm
from mplsoccer import Pitch, VerticalPitch, lines
from matplotlib.colors import LinearSegmentedColormap
from scipy.ndimage import gaussian_filter

# Silence every warning category for the remainder of the session
warnings.filterwarnings("ignore")

# Describe the two Source Sans Pro faces used by the plots
fe_regular = fm.FontEntry(
    name='SourceSansPro-Regular',
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
)
fe_semibold = fm.FontEntry(
    name='SourceSansPro-SemiBold',
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
)

# Put both entries at the head of matplotlib's font list
# (regular at position 0, semibold at position 1)
fm.fontManager.ttflist[:0] = [fe_regular, fe_semibold]

# Make the regular face the default font family for all figures
matplotlib.rcParams['font.family'] = fe_regular.name

# Read the pre-computed bin-statistic tables (one pickle per season) from the
# working directory, then stack them into a single DataFrame.
# NOTE: read_pickle unpickles arbitrary objects - only load files you created yourself.
stats0, stats1, stats2, stats3, stats4 = (
    pd.read_pickle(pickle_name)
    for pickle_name in (
        "bin_statistic2021.pkl",
        "bin_statistic2122.pkl",
        "bin_statistic2223.pkl",
        "bin_statistic2324.pkl",
        "bin_statistic2425.pkl",
    )
)

# Row labels of the individual tables are kept as they are (no re-indexing)
player_stats_df = pd.concat((stats0, stats1, stats2, stats3, stats4))

player_stats_df

from sklearn.preprocessing import StandardScaler
from umap import UMAP
from sklearn.mixture import GaussianMixture

# 1. Flatten the bin statistic arrays into feature vectors
print("Preparing features from bin statistics...")
# Only the 'statistic' column is needed, so iterate it directly: iterrows()
# would build a throwaway Series for every row just to read this one cell.
# Each 2D bin statistic array becomes one 1D feature vector, in row order.
features = [statistic.flatten() for statistic in player_stats_df['statistic']]

Preparing features from bin statistics...

# 2. Create a numpy array of the features
# Stacks the flattened vectors collected in `features` into one array,
# one row per entry of `features`; the shape is printed as a sanity check.
X = np.array(features)
print(f"Created feature array with shape: {X.shape}")

Created feature array with shape: (50431, 7140)

# 3. Scale the data using StandardScaler
# The scaler is fitted on the full feature matrix and applied to it in one step;
# `scaler` is kept so the same transformation can be reused later.
print("Scaling features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Scaling features...

# 4. Apply UMAP to reduce dimensions
# random_state is fixed so the embedding is repeatable between runs.
# Later cells read columns 0 and 1 of `comps` as the two UMAP coordinates.
print("Applying UMAP for dimensionality reduction...")
umap_reducer = UMAP(min_dist=0.1, n_neighbors=50, random_state=2213)
comps = umap_reducer.fit_transform(X_scaled)

Applying UMAP for dimensionality reduction...

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.

# 5. Score Gaussian mixture clusterings over a range of cluster counts (GaussianMixture is used below, not K-means)
from sklearn.metrics import calinski_harabasz_score

# Function to evaluate clusters quickly without silhouette score
def evaluate_clusters(data, max_k=30, min_k=10):
    """
    Score Gaussian mixture clusterings of `data` for a range of component counts.

    For every k from `min_k` to `max_k` (both inclusive) a GaussianMixture with
    random_state=42 is fitted on `data`, and the resulting hard labels are rated
    with the Calinski-Harabasz score. The scores are then shown as a line plot.

    Parameters:
    -----------
    data : array-like
        Samples to cluster; passed unchanged to fit_predict and to the score
    max_k : int
        Largest number of components to try (inclusive)
    min_k : int
        Smallest number of components to try (inclusive)

    Returns:
    --------
    tuple
        (k_values, ch_scores): the range of evaluated k and the list of
        Calinski-Harabasz scores in the same order
    """
    k_values = range(min_k, max_k+1)

    def _score_for(n_components):
        # One mixture fit per candidate count; labels are the most likely component
        mixture = GaussianMixture(n_components=n_components, random_state=42)
        return calinski_harabasz_score(data, mixture.fit_predict(data))

    ch_scores = [_score_for(k) for k in k_values]

    # Line plot of score against number of clusters
    plt.figure(figsize=(16, 6))
    plt.plot(k_values, ch_scores, 'bo-')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Calinski-Harabasz Score')
    plt.title('Calinski-Harabasz Score vs. Number of Clusters')
    plt.grid(True)
    plt.show()

    return k_values, ch_scores

# Run the evaluation on the full UMAP embedding for every k from 2 to 100
# (one GaussianMixture fit per k, so this cell can take a while)
k_values, scores = evaluate_clusters(comps, max_k=100, min_k=2)

# 6. Attach the two UMAP coordinates to a copy of the player table
results_df = player_stats_df.copy()
for component_index, column_name in enumerate(('umap_1', 'umap_2')):
    results_df[column_name] = comps[:, component_index]

# 7. Fit the final Gaussian mixture on the embedding and store each row's label
n_clusters = 57  # Adjust as needed
# The variable is called `kmeans` for historical reasons; the model is a GaussianMixture
kmeans = GaussianMixture(random_state=42, n_components=n_clusters)
clusters = kmeans.fit_predict(comps)
results_df['cluster'] = clusters

# 8. Plot the UMAP projection with cluster colors
fig, ax = plt.subplots(figsize=(16, 12))

# 'tab10' only provides 10 colours, so with dozens of clusters several of them
# were drawn in the same colour. Sample one colour per cluster id instead and
# centre every integer id on its own colour bin via vmin/vmax.
n_cluster_colors = int(results_df['cluster'].max()) + 1
cluster_cmap = plt.get_cmap('nipy_spectral', n_cluster_colors)

# Create scatter plot colored by cluster
scatter = ax.scatter(
    results_df['umap_1'],
    results_df['umap_2'],
    c=results_df['cluster'],
    cmap=cluster_cmap,
    vmin=-0.5,
    vmax=n_cluster_colors - 0.5,
    s=60,
    alpha=0.7,
)

# Add a legend. num=None lists every cluster id; the default ("auto") only
# shows a handful of values once there are more than 9 unique ids.
# The legend sits outside the axes so its many entries do not hide the points.
# ax.legend already registers the legend, so no extra add_artist call is needed.
legend1 = ax.legend(
    *scatter.legend_elements(num=None),
    loc="upper left",
    bbox_to_anchor=(1.01, 1.0),
    ncol=3,
    title="Clusters",
)

# Set labels
ax.set_xlabel('UMAP Component 1')
ax.set_ylabel('UMAP Component 2')
ax.set_title('Player Clustering based on Spatial Distribution')

# Add grid
ax.grid(True, linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()

# Display the table to confirm the 'umap_1', 'umap_2' and 'cluster' columns were added
results_df

def visualize_player_bin_statistic(player_stats_df, player_name, season_id, team_name):
    """
    Visualize one player-season's bin statistic as a heatmap on a pitch.

    Parameters:
    -----------
    player_stats_df : pandas DataFrame
        DataFrame containing the bin statistics; must provide the columns
        'player_name', 'team_name', 'season_id' and 'statistic'
    player_name : str
        Name of the player to visualize
    season_id : int or float
        Season identifier, compared with == against the 'season_id' column
    team_name : str
        Team the player played for in that season

    Returns:
    --------
    tuple or None
        (fig, axs) as returned by pitch.grid, or None when no row matches the
        player/team/season combination (a message is printed in that case)
    """
    # Filter by player, team and season together
    filtered_df = player_stats_df[
        (player_stats_df['player_name'] == player_name)
        & (player_stats_df['team_name'] == team_name)
        & (player_stats_df['season_id'] == season_id)
    ]

    if filtered_df.empty:
        # Name all three filters: any of them can be the reason nothing matched
        print(f"No data found for player {player_name} ({team_name} {season_id})")
        return None

    # Only the first matching row is plotted
    player_bin_stat = filtered_df.iloc[0]['statistic']

    # Fix the inverted axes by flipping the bin statistic array both vertically and horizontally
    player_bin_stat = np.flipud(player_bin_stat)  # Fix y-axis
    player_bin_stat = np.fliplr(player_bin_stat)  # Fix x-axis

    # Create the pitch object (105 x 68 custom pitch)
    pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box',
                  linewidth=1.25, line_color='#000000', line_zorder=2, pitch_color='#D7D1CF')

    # Create the figure and axes using pitch.grid
    fig, axs = pitch.grid(endnote_height=0.03, endnote_space=0, grid_width=0.88,
                          left=0.025, title_height=0.06, title_space=0, axis=False,
                          grid_height=0.86)

    # Set the figure background color to match the pitch color
    fig.set_facecolor('#D7D1CF')

    # Define the colormap with #D7D1CF as the lowest value, so empty bins blend into the pitch
    cmap = LinearSegmentedColormap.from_list(
        'custom_cmap',
        ['#D7D1CF', '#FFFFFF', '#FFFF80', '#FF8000', '#800000', '#000000'],
        N=256)

    # Run bin_statistic on a single dummy point only to obtain a result object
    # in the format pitch.heatmap expects for a 105 x 68 grid
    temp_bin_stat = pitch.bin_statistic(
        [50], [34],  # Dummy values
        statistic='count',
        bins=(105, 68)
    )

    # Replace the dummy counts with our flipped player statistic
    temp_bin_stat['statistic'] = player_bin_stat

    # Create the heatmap
    pitch.heatmap(temp_bin_stat, ax=axs['pitch'], cmap=cmap)

    # Add a title
    axs['pitch'].set_title(f"{player_name} Heatmap ({team_name} {season_id})", fontsize=14)

    # Show the plot
    plt.tight_layout()
    plt.show()

    return fig, axs

# Players of one raw cluster who play for well-known clubs: familiar names make
# it easier to recognise which position the cluster represents
in_cluster = results_df['cluster'] == 55
C = results_df[in_cluster]

reference_teams = ['Juventus', 'Man City', 'Barcelona', 'Atalanta', 'Liverpool', 'Inter', 'AC Milan', 'Real Madrid',
                   'Arsenal', 'Chelsea', 'Bayern Munich', 'Man Utd', 'PSG', 'Borussia Dortmund']
X0 = C[C['team_name'].isin(reference_teams)]

# Show the selected players
X0

# Example: draw one player-season heatmap to see which position we are looking at
visualize_player_bin_statistic(player_stats_df, player_name='João Félix', season_id=2425, team_name='AC Milan')

(<Figure size 1307.75x900 with 3 Axes>,
 {'pitch': <Axes: title={'center': 'João Félix Heatmap (AC Milan 2425)'}>,
  'title': <Axes: >,
  'endnote': <Axes: >})

# Map each raw cluster id (0-56) to a position label, based on a manual reading of
# the heatmaps of players in that cluster. Several ids share a label, and ids
# whose heatmaps gave no clear position are labelled 'Undefined'.
# NOTE(review): the ids are only meaningful for the clustering fitted above
# (57 components, random_state=42) - presumably the table has to be redone
# whenever the data or the clustering changes.
label_mapping = {
    0: 'Undefined',
    1: 'DM',
    2: 'LWB', 
    3: 'GK',
    4: 'RCB',
    5: 'AML',
    6: 'AWR',
    7: 'LCB',
    8: 'DM',
    9: 'RWB',
    10: 'AM',
    11: 'GK',
    12: 'RCB',
    13: 'AWL', 
    14: 'CB',
    15: 'LWB',
    16: 'ST',
    17: 'GK',
    18: 'SS',
    19: 'LCB',
    20: 'AMR',
    21: 'AWR',
    22: 'AMR',
    23: 'Undefined',
    24: 'AWL',
    25: 'HB',
    26: 'LCB',
    27: 'GK',
    28: 'RCB',
    29: 'Undefined', 
    30: 'AM',
    31: 'CB',
    32: 'Undefined',
    33: 'SS',
    34: 'RWB',
    35: 'Undefined',
    36: 'LCB',
    37: 'CM',
    38: 'GK',
    39: 'RCB',
    40: 'CM', 
    41: 'ST',
    42: 'SS',
    43: 'RWB',
    44: 'RCB',
    45: 'DM',
    46: 'AM',
    47: 'CM', 
    48: 'AWR',
    49: 'ST',
    50: 'GK',
    51: 'Undefined',
    52: 'DM',
    53: 'AML',
    54: 'RWB',
    55: 'LWB',
    56: 'LCB'
}

# Map the labels directly
# NOTE: this overwrites the numeric 'cluster' column with the string labels, so
# running this cell a second time would look the labels up as keys and find none.
results_df['cluster'] = results_df['cluster'].map(label_mapping)

#Creating the final table with players and positions
# One row per player / team / season. The distinct labels of a group are joined
# explicitly: for the usual single-label group this gives the plain label (e.g.
# 'CB'), and a group with several labels becomes 'CB, RWB' rather than the
# mangled "CB' 'RWB" that stripping the printed array form used to produce.
position_keys = ["player_name", "player_id", "team_name", "season_id"]
position = (
    results_df.groupby(position_keys, observed=True)['cluster']
    .agg(lambda labels: ', '.join(labels.astype(str).unique()))
    .reset_index(name='position')
)

#Check for unique positions
position.position.unique()

array(['CB', 'RWB', 'CM', 'LCB', 'ST', 'RCB', 'Undefined', 'SS', 'AWL',
       'GK', 'AML', 'LWB', 'DM', 'AMR', 'AWR', 'AM', 'HB'], dtype=object)

# Define the position group function
def map_position_group(pos):
    """
    Collapse a detailed position label into its broad position group.

    Parameters:
    -----------
    pos : str
        Detailed position label such as 'LCB', 'RWB' or 'AM'

    Returns:
    --------
    str
        One of 'CB', 'WB', 'CDM', 'AMW', 'ST' or 'GK'; 'Undefined' for any
        label that belongs to none of the groups
    """
    # (group, detailed labels that belong to it), checked in this order
    groups = (
        ('CB', ('CB', 'LCB', 'RCB')),
        ('WB', ('RWB', 'LWB')),
        ('CDM', ('CM', 'DM', 'HB')),
        ('AMW', ('AWL', 'AML', 'AMR', 'AWR', 'AM')),
        ('ST', ('ST', 'SS')),
        ('GK', ('GK',)),
    )
    return next(
        (group for group, members in groups if pos in members),
        'Undefined',
    )

# Derive the broad position group of every row from its detailed position label
position['position_group'] = [
    map_position_group(label) for label in position['position']
]
position.position_group.unique()

array(['CB', 'WB', 'CDM', 'ST', 'Undefined', 'AMW', 'GK'], dtype=object)

# Display the final player / position table
position

# Save the table as CSV in the current working directory
# NOTE(review): to_csv is called with its defaults, so the row index is presumably
# written as an extra unnamed first column - confirm downstream readers expect it.
position.to_csv("clustered_position.csv")

	player_id	player_name	team_id	season_id	statistic	team_name
0	335468.0	Nahuel Bustos	893.0	1920.0	[[0.005519596275249199, 0.006888309438353184, ...	Talleres
1	303656.0	Andrés Cubas	893.0	1920.0	[[0.0, 0.0, 0.0, 0.0, 4.442023635054891e-08, 1...	Talleres
2	349761.0	Juan Ignacio Méndez	893.0	1920.0	[[0.0003457032533587284, 0.0005420103833096798...	Talleres
3	330672.0	Facundo Medina	893.0	1920.0	[[0.00013495587774329734, 0.000167285062874743...	Talleres
4	144307.0	Juan Cruz Komar	893.0	1920.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Talleres
...	...	...	...	...	...	...
9847	462097.0	Sota Kitahara	5973.0	2024.0	[[0.0011733655462917167, 0.0011315123881704301...	Seattle
9848	477308.0	Adam Beaudry	1120.0	2024.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Colorado
9849	542900.0	Ervin Torres	29664.0	2024.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Austin FC
9850	328504.0	Emanuel Reynoso	9293.0	2024.0	[[0.0005024170892179867, 0.001031943622745739,...	Minnesota United
9851	512799.0	Cyprian Kachwele	11134.0	2024.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Vancouver

	player_id	player_name	team_id	season_id	statistic	team_name	umap_1	umap_2	cluster
0	335468.0	Nahuel Bustos	893.0	1920.0	[[0.005519596275249199, 0.006888309438353184, ...	Talleres	-0.546561	-1.598811	16
1	303656.0	Andrés Cubas	893.0	1920.0	[[0.0, 0.0, 0.0, 0.0, 4.442023635054891e-08, 1...	Talleres	-2.884587	3.906569	8
2	349761.0	Juan Ignacio Méndez	893.0	1920.0	[[0.0003457032533587284, 0.0005420103833096798...	Talleres	-2.089298	3.745713	45
3	330672.0	Facundo Medina	893.0	1920.0	[[0.00013495587774329734, 0.000167285062874743...	Talleres	6.028358	9.185281	26
4	144307.0	Juan Cruz Komar	893.0	1920.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Talleres	2.460149	9.074166	12
...	...	...	...	...	...	...	...	...	...
9847	462097.0	Sota Kitahara	5973.0	2024.0	[[0.0011733655462917167, 0.0011315123881704301...	Seattle	3.851492	2.358335	32
9848	477308.0	Adam Beaudry	1120.0	2024.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Colorado	11.430593	0.928354	17
9849	542900.0	Ervin Torres	29664.0	2024.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Austin FC	5.299849	0.418871	35
9850	328504.0	Emanuel Reynoso	9293.0	2024.0	[[0.0005024170892179867, 0.001031943622745739,...	Minnesota United	4.226096	0.525148	0
9851	512799.0	Cyprian Kachwele	11134.0	2024.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Vancouver	5.480720	0.415572	35

	player_id	player_name	team_id	season_id	statistic	team_name	umap_1	umap_2	cluster
3945	101955.0	Emerson	15.0	2021.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Chelsea	6.353710	5.213341	55
3965	301455.0	Kostas Tsimikas	26.0	2021.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Liverpool	5.903022	4.637537	55
4035	110260.0	Sead Kolasinac	13.0	2021.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Arsenal	6.572098	5.546768	55
4551	351252.0	Junior Firpo	65.0	2021.0	[[0.0, 0.0, 0.0, 4.1743925055426004e-06, 1.553...	Barcelona	6.356042	5.374319	55
4854	312843.0	Felix Passlack	44.0	2021.0	[[0.00012287498161284947, 0.000158267145004108...	Borussia Dortmund	6.024223	5.189990	55
4915	402046.0	Matteo Ruggeri	300.0	2021.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Atalanta	5.811248	4.294828	55
6477	106894.0	Juan Bernat	304.0	2021.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	PSG	5.829621	4.365310	55
3836	422938.0	Alejandro Balde	65.0	2122.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Barcelona	5.925271	4.543236	55
4439	141726.0	Robin Gosens	300.0	2122.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.3267812...	Atalanta	6.401498	5.169967	55
4451	328968.0	Fodé Ballo-Touré	80.0	2122.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	AC Milan	6.539957	5.467077	55
5295	328968.0	Fodé Ballo-Touré	80.0	2223.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	AC Milan	6.510718	5.443084	55
5256	352830.0	Mitchel Bakker	300.0	2324.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Atalanta	6.426208	5.324160	55
2586	345303.0	Tyrell Malacia	32.0	2425.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Man Utd	6.283732	4.997286	55
2588	297403.0	Kieran Tierney	13.0	2425.0	[[0.0, 0.0, 0.0, 0.0, 6.037287731490949e-06, 2...	Arsenal	6.116987	4.759197	55
5480	450283.0	Jonas Rouhi	87.0	2425.0	[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...	Juventus	5.869138	4.780038	55

	player_name	player_id	team_name	season_id	position	position_group
0	AJ Delagarza JR	72638.0	Inter Miami CF	2020.0	CB	CB
1	AJ Delagarza JR	72638.0	New England	2021.0	RWB	WB
2	AJ Delagarza JR	72638.0	New England	2022.0	RWB	WB
3	Aapo Halme	202262.0	Barnsley	2021.0	CM	CDM
4	Aapo Halme	202262.0	Barnsley	2122.0	CB	CB
...	...	...	...	...	...	...
50420	Úmaro Embaló	455568.0	Rio Ave	2324.0	LWB	WB
50421	Úmaro Embaló	455568.0	Vitoria de Guimaraes	2425.0	DM	CDM
50422	Übeyd Adiyaman	136467.0	Genclerbirligi	2021.0	GK	GK
50423	Ümit Akdag	438512.0	Alanyaspor	2223.0	Undefined	Undefined
50424	Ümit Akdag	438512.0	Toulouse	2425.0	LCB	CB