In [1]:
# Import necessary libraries for data manipulation, visualization, and analysis
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from tqdm.notebook import tqdm
from mplsoccer import Pitch, VerticalPitch, lines
from matplotlib.colors import LinearSegmentedColormap
from scipy.ndimage import gaussian_filter
# Suppress warnings
warnings.filterwarnings("ignore")
In [2]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
name='SourceSansPro-SemiBold'
)
# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)
# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
# Load the FotMob team-id mapping from CSV
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
In [4]:
season = 2425
In [5]:
players = pd.read_csv(f"players{season}.csv", index_col = 0)
games = pd.read_csv(f"games{season}.csv", index_col = 0)
actions = pd.read_csv(f"actions{season}.csv", index_col = 0)
In [6]:
# Add action type, result, and bodypart names to the SPADL event data
atomic = spadl.add_names(actions)
# Merge the player info with the FotMob/WhoScored team-id mapping
players = players.merge(fb, how="left")
In [7]:
# Convert the 'MM:SS' time-played format into seconds
def convert_to_seconds(time_str):
    try:
        # Convert to string in case it's a float (e.g., NaN)
        time_str = str(time_str)
        # Split the time string into minutes and seconds
        minutes, seconds = map(int, time_str.split(':'))
        # Convert total time to seconds (minutes converted to seconds)
        return minutes * 60 + seconds
    except (ValueError, AttributeError):
        # Handle cases where the conversion fails (e.g., NaN or bad format)
        return 0  # or use np.nan to mark as missing

# Apply the conversion function to the time columns
players['seconds_played'] = players['minutes_played'].apply(convert_to_seconds)
players['start_second'] = players['start_second'].apply(convert_to_seconds)
players['end_second'] = players['end_second'].apply(convert_to_seconds)
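A quick sanity check of the conversion, using illustrative values rather than rows from the dataset:

convert_to_seconds('90:00')   # 5400
convert_to_seconds(np.nan)    # 0 (malformed input falls back to 0)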
In [8]:
# Keep only the selected player columns
players_info = players[["player_name", "player_id", 'game_id', "team_name", "fotmob_id", 'season_id', 'competition_id', 'seconds_played']]
In [9]:
# Merge the events with the processed player info
df0 = atomic.merge(players_info, how='left', on=['game_id', 'player_id'])
In [10]:
# Total minutes played per player, season, and team
mp = players.groupby(['player_id', 'player_name', 'season_id', 'team_id'], observed=True)['seconds_played'].sum().reset_index(name='seconds_played')
mp['minutes_played'] = mp['seconds_played']/60
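mp is not used again in this notebook; a typical downstream use, assuming a purely illustrative 900-minute cutoff, is filtering out low-minute players before any per-90 analysis:

# Hypothetical example: keep only players with at least 900 minutes in the season
regulars = mp[mp['minutes_played'] >= 900]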
In [11]:
# Keep only the event types used in the analysis
df = df0[df0['type_name'].isin(['pass', 'dribble', 'interception', 'clearance', 'take_on', 'tackle', 'shot', 'bad_touch',
                                'keeper_pick_up', 'foul', 'keeper_save', 'cross', 'keeper_claim', 'goal', 'keeper_punch'])]
In [12]:
df.head()
Out[12]:
game_id | original_event_id | period_id | time_seconds | team_id | player_id | start_x | end_x | start_y | end_y | ... | end_y_a0 | type_name | result_name | bodypart_name | player_name | team_name | fotmob_id | season_id | competition_id | seconds_played | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1819070 | 2.754291e+09 | 1 | 0.0 | 752 | 68335.0 | 52.185 | 57.435 | 34.000 | 34.204 | ... | 33.796 | pass | success | foot | Denis Odoi | Royal Antwerp | 9988.0 | 2425.0 | BEL-Jupiler Pro League | 6150.0 |
1 | 1819070 | NaN | 1 | 0.5 | 752 | 106413.0 | 57.435 | 53.340 | 34.204 | 33.660 | ... | 34.340 | dribble | success | foot | Dennis Praet | Royal Antwerp | 9988.0 | 2425.0 | BEL-Jupiler Pro League | 4881.0 |
2 | 1819070 | 2.754291e+09 | 1 | 1.0 | 752 | 106413.0 | 53.340 | 63.840 | 33.660 | 54.536 | ... | 13.464 | pass | success | foot | Dennis Praet | Royal Antwerp | 9988.0 | 2425.0 | BEL-Jupiler Pro League | 4881.0 |
3 | 1819070 | NaN | 1 | 3.0 | 752 | 388097.0 | 63.840 | 54.495 | 54.536 | 58.208 | ... | 9.792 | dribble | success | foot | Jelle Bataille | Royal Antwerp | 9988.0 | 2425.0 | BEL-Jupiler Pro League | 6150.0 |
4 | 1819070 | 2.754291e+09 | 1 | 5.0 | 752 | 388097.0 | 54.495 | 73.500 | 58.208 | 42.024 | ... | 25.976 | pass | success | foot | Jelle Bataille | Royal Antwerp | 9988.0 | 2425.0 | BEL-Jupiler Pro League | 6150.0 |
5 rows × 27 columns
In [13]:
# Define the worker function at module level (outside any other function)
def process_combination(args):
    """Process a single player-team-season combination"""
    import numpy as np
    from scipy.ndimage import gaussian_filter
    import pandas as pd

    combo_idx, combinations_array, df, player_names, player_games_dict, player_game_times_dict = args

    # Extract combination details
    player_id, team_id, season_id = combinations_array[combo_idx]

    # Get player data efficiently
    dfx = df[(df['player_id'] == player_id) &
             (df['team_id'] == team_id) &
             (df['season_id'] == season_id)]
    if dfx.empty:
        return None

    # Get player name from precomputed dict
    player_name = player_names.get(player_id, "Unknown Player")

    # Get games this player participated in from precomputed dict
    combo_key = (player_id, team_id, season_id)
    player_games = player_games_dict.get(combo_key, set())

    # Initialize container for team actions
    all_team_actions = []

    # Process each game
    for game_id in player_games:
        # Get team actions for this game
        game_team_actions = df[(df['team_id'] == team_id) &
                               (df['season_id'] == season_id) &
                               (df['game_id'] == game_id)]

        # Check if we have player timing data
        if player_game_times_dict:
            player_game_key = (player_id, game_id)
            player_periods = player_game_times_dict.get(player_game_key, [])

            if player_periods:
                # Process each period the player was on the pitch
                for period_data in player_periods:
                    # Create boolean masks for filtering
                    start_mask = ((game_team_actions['period_id'] == period_data['start_period']) &
                                  (game_team_actions['time_seconds'] >= period_data['start_second']))
                    end_mask = ((game_team_actions['period_id'] == period_data['end_period']) &
                                (game_team_actions['time_seconds'] <= period_data['end_second']))

                    # Combine masks and filter
                    period_actions = game_team_actions[start_mask | end_mask]
                    if not period_actions.empty:
                        all_team_actions.append(period_actions)
            else:
                # Fallback to all team actions
                all_team_actions.append(game_team_actions)
        else:
            # No timing data available, use all team actions
            all_team_actions.append(game_team_actions)

    # Skip if no team actions found
    if not all_team_actions:
        return None

    # Combine all team actions in one step
    dfy = pd.concat(all_team_actions, ignore_index=True)
    if dfy.empty:
        return None

    # Extract coordinates as numpy arrays for faster calculation
    player_x = dfx.start_x.values
    player_y = dfx.start_y.values
    team_x = dfy.start_x.values
    team_y = dfy.start_y.values

    # Recreate the pitch object here since it's not picklable
    from mplsoccer import Pitch
    pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box',
                  linewidth=1.25, line_color='#000000', line_zorder=2, pitch_color='#D7D1CF')

    # Calculate bin statistics directly from the numpy arrays
    bin_statisticplayer = pitch.bin_statistic(player_x, player_y, statistic='count', bins=(105, 68))
    bin_statisticteam = pitch.bin_statistic(team_x, team_y, statistic='count', bins=(105, 68))

    # Use vectorized operations for normalization
    normalized_statistic = np.zeros_like(bin_statisticplayer['statistic'])
    mask = (bin_statisticteam['statistic'] > 0)
    normalized_statistic[mask] = bin_statisticplayer['statistic'][mask] / bin_statisticteam['statistic'][mask]

    # Apply Gaussian filter
    filtered_statistic = gaussian_filter(normalized_statistic, 3)

    return {
        'player_id': player_id,
        'player_name': player_name,
        'team_id': team_id,
        'season_id': season_id,
        'statistic': filtered_statistic
    }
def create_player_bin_statistics_df(df, player_games_df=None, n_jobs=1):
    """
    Create bin statistics for each player-team combination and
    store them in a DataFrame - optimized for performance.

    Parameters
    ----------
    df : pandas DataFrame
        The event data containing player events
    player_games_df : pandas DataFrame, optional
        DataFrame containing information about when players were on the pitch
    n_jobs : int, default=1
        Number of processes to use for parallel execution. Set >1 to use multiple cores.

    Returns
    -------
    pandas DataFrame
        DataFrame with columns: player_id, player_name, team_id, season_id, statistic
    """
    import pandas as pd
    from collections import defaultdict
    from multiprocessing import Pool
    from tqdm import tqdm

    # Get unique player-team-season combinations
    combinations = df[['player_id', 'team_id', 'season_id']].drop_duplicates().reset_index(drop=True)
    print(f"Found {len(combinations)} unique player-team-season combinations")

    # Convert combinations to a simple numpy array for pickling
    combinations_array = combinations.values

    # Precompute lookups for faster access
    # 1. Player name lookup
    player_names = dict(zip(df['player_id'], df['player_name']))

    # 2. Player games lookup
    player_games_dict = defaultdict(set)
    for _, row in df[['player_id', 'team_id', 'season_id', 'game_id']].iterrows():
        key = (row['player_id'], row['team_id'], row['season_id'])
        player_games_dict[key].add(row['game_id'])

    # 3. Preindex player_games_df for faster lookup if provided
    player_game_times_dict = {}
    if player_games_df is not None:
        for _, row in player_games_df.iterrows():
            key = (row['player_id'], row['game_id'])
            if key not in player_game_times_dict:
                player_game_times_dict[key] = []
            player_game_times_dict[key].append({
                'start_period': row['start_period'],
                'start_second': row['start_second'],
                'end_period': row['end_period'],
                'end_second': row['end_second']
            })

    # Process combinations (parallel or sequential)
    args_list = [(i, combinations_array, df, player_names, player_games_dict, player_game_times_dict)
                 for i in range(len(combinations))]

    if n_jobs > 1:
        # Parallel processing
        with Pool(processes=n_jobs) as pool:
            all_results = list(tqdm(
                pool.imap(process_combination, args_list),
                total=len(combinations),
                desc="Processing player combinations"
            ))
    else:
        # Sequential processing
        all_results = [process_combination(args) for args in tqdm(args_list,
                                                                  desc="Processing player combinations")]

    # Filter out None results
    results = [r for r in all_results if r is not None]

    # Create the result DataFrame
    result_df = pd.DataFrame(results)
    print(f"Created DataFrame with {len(result_df)} rows")

    return result_df
In [14]:
# Build bin statistics for every player-team-season combination
# (runs on a single core by default; pass n_jobs>1 for multi-core processing)
player_stats_df = create_player_bin_statistics_df(df, players)
Found 9859 unique player-team-season combinations
Processing player combinations: 100%|██████████| 9859/9859 [39:07<00:00, 4.20it/s]
Created DataFrame with 9852 rows
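The sequential run above took roughly 39 minutes; on a multi-core machine the same call can be parallelised by passing n_jobs (the worker count here is illustrative):

# player_stats_df = create_player_bin_statistics_df(df, players, n_jobs=4)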
In [15]:
# Add the team_name column and check the structure of the dataframe
player_stats_df = player_stats_df.merge(fb[['team_id', 'team_name']])
player_stats_df
Out[15]:
player_id | player_name | team_id | season_id | statistic | team_name | |
---|---|---|---|---|---|---|
0 | 68335.0 | Denis Odoi | 752.0 | 2425.0 | [[0.0417474703252548, 0.040847603349596355, 0.... | Royal Antwerp |
1 | 106413.0 | Dennis Praet | 752.0 | 2425.0 | [[0.029253781215685632, 0.03454770587822273, 0... | Royal Antwerp |
2 | 388097.0 | Jelle Bataille | 752.0 | 2425.0 | [[0.09212127779509127, 0.08787376959553757, 0.... | Royal Antwerp |
3 | 69933.0 | Toby Alderweireld | 752.0 | 2425.0 | [[0.018920814067782926, 0.018380303110433316, ... | Royal Antwerp |
4 | 37502.0 | Tjaronn Chery | 752.0 | 2425.0 | [[0.030456096264926267, 0.03979106265170132, 0... | Royal Antwerp |
... | ... | ... | ... | ... | ... | ... |
9847 | 462097.0 | Sota Kitahara | 5973.0 | 2024.0 | [[0.0011733655462917167, 0.0011315123881704301... | Seattle |
9848 | 477308.0 | Adam Beaudry | 1120.0 | 2024.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Colorado |
9849 | 542900.0 | Ervin Torres | 29664.0 | 2024.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Austin FC |
9850 | 328504.0 | Emanuel Reynoso | 9293.0 | 2024.0 | [[0.0005024170892179867, 0.001031943622745739,... | Minnesota United |
9851 | 512799.0 | Cyprian Kachwele | 11134.0 | 2024.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Vancouver |
9852 rows × 6 columns
In [16]:
# Save the bin-statistics table for later use
player_stats_df.to_pickle(f"bin_statistic{season}.pkl")
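To see what the stored arrays represent, here is a minimal sketch of drawing one of them as a heatmap. The player name is taken from the output above, and the dummy bin_statistic call only serves to build a dict with the same 105x68 grid before swapping in the precomputed, normalised and smoothed statistic:

# Minimal sketch: draw the stored share-of-team-actions heatmap for one player
row = player_stats_df[player_stats_df['player_name'] == 'Denis Odoi'].iloc[0]

pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box',
              linewidth=1.25, line_color='#000000', line_zorder=2, pitch_color='#D7D1CF')
fig, ax = pitch.draw(figsize=(8, 5.5))

# Build a bin_statistic dict with the matching grid, then replace its values
# with the precomputed statistic from the table
bs = pitch.bin_statistic(np.array([52.5]), np.array([34.0]), statistic='count', bins=(105, 68))
bs['statistic'] = row['statistic']
pitch.heatmap(bs, ax=ax, cmap='Reds')
plt.show()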